aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndroid Build Coastguard Worker <android-build-coastguard-worker@google.com>2023-07-03 15:54:04 +0000
committerAndroid Build Coastguard Worker <android-build-coastguard-worker@google.com>2023-07-03 15:54:04 +0000
commit820dd59d13bfc2f3b1ac742672ec56bcc504dfce (patch)
tree425ab8ac9963150604207ffcd751ad6ae7b90056
parent2f5f5fbe486322921ee72ad7410e73cd4cec6790 (diff)
parentb078732699d802725219e0baf76cd6c3cc742513 (diff)
downloadbstr-android14-mainline-adbd-release.tar.gz
Change-Id: I2798205640ee562f450b0a06b054700dd706ee8d
-rw-r--r--.cargo_vcs_info.json7
-rw-r--r--Android.bp15
-rw-r--r--COPYING4
-rw-r--r--Cargo.toml80
-rw-r--r--Cargo.toml.orig45
-rw-r--r--METADATA14
-rw-r--r--README.md72
-rw-r--r--src/ascii.rs23
-rw-r--r--src/bstr.rs32
-rw-r--r--src/bstring.rs46
-rw-r--r--src/byteset/mod.rs3
-rw-r--r--src/byteset/scalar.rs46
-rw-r--r--src/ext_slice.rs691
-rw-r--r--src/ext_vec.rs131
-rw-r--r--src/impls.rs232
-rw-r--r--src/io.rs35
-rw-r--r--src/lib.rs113
-rw-r--r--src/tests.rs2
-rw-r--r--src/unicode/data/GraphemeBreakTest.txt6
-rw-r--r--src/unicode/data/SentenceBreakTest.txt6
-rw-r--r--src/unicode/data/WordBreakTest.txt6
-rw-r--r--src/unicode/fsm/grapheme_break_fwd.bigendian.dfabin10589 -> 10781 bytes
-rw-r--r--src/unicode/fsm/grapheme_break_fwd.littleendian.dfabin10589 -> 10781 bytes
-rw-r--r--src/unicode/fsm/grapheme_break_fwd.rs26
-rw-r--r--src/unicode/fsm/grapheme_break_rev.bigendian.dfabin53905 -> 55271 bytes
-rw-r--r--src/unicode/fsm/grapheme_break_rev.littleendian.dfabin53905 -> 55271 bytes
-rw-r--r--src/unicode/fsm/grapheme_break_rev.rs26
-rw-r--r--src/unicode/fsm/regional_indicator_rev.rs26
-rw-r--r--src/unicode/fsm/sentence_break_fwd.bigendian.dfabin149903 -> 153619 bytes
-rw-r--r--src/unicode/fsm/sentence_break_fwd.littleendian.dfabin149903 -> 153619 bytes
-rw-r--r--src/unicode/fsm/sentence_break_fwd.rs26
-rw-r--r--src/unicode/fsm/simple_word_fwd.bigendian.dfabin8975 -> 9237 bytes
-rw-r--r--src/unicode/fsm/simple_word_fwd.littleendian.dfabin8975 -> 9237 bytes
-rw-r--r--src/unicode/fsm/simple_word_fwd.rs26
-rw-r--r--src/unicode/fsm/whitespace_anchored_fwd.rs26
-rw-r--r--src/unicode/fsm/whitespace_anchored_rev.rs26
-rw-r--r--src/unicode/fsm/word_break_fwd.bigendian.dfabin229739 -> 236309 bytes
-rw-r--r--src/unicode/fsm/word_break_fwd.littleendian.dfabin229739 -> 236309 bytes
-rw-r--r--src/unicode/fsm/word_break_fwd.rs26
-rw-r--r--src/unicode/grapheme.rs44
-rw-r--r--src/unicode/mod.rs10
-rw-r--r--src/unicode/sentence.rs15
-rw-r--r--src/unicode/whitespace.rs6
-rw-r--r--src/unicode/word.rs26
-rw-r--r--src/utf8.rs31
45 files changed, 1259 insertions, 690 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index ef4fb69..ff424fa 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,6 @@
{
"git": {
- "sha1": "e38e7a7ca986f9499b30202f49d79e531d14d192"
- }
-}
+ "sha1": "f72910a192f37e85932211bef957fbadaecefbaf"
+ },
+ "path_in_vcs": ""
+} \ No newline at end of file
diff --git a/Android.bp b/Android.bp
index 7fb581b..fef6007 100644
--- a/Android.bp
+++ b/Android.bp
@@ -44,19 +44,24 @@ rust_library {
host_supported: true,
crate_name: "bstr",
cargo_env_compat: true,
- cargo_pkg_version: "0.2.17",
+ cargo_pkg_version: "1.3.0",
srcs: ["src/lib.rs"],
- edition: "2018",
+ edition: "2021",
features: [
+ "alloc",
"default",
- "lazy_static",
- "regex-automata",
"std",
"unicode",
],
rustlibs: [
- "liblazy_static",
"libmemchr",
+ "libonce_cell",
"libregex_automata",
],
+ apex_available: [
+ "//apex_available:platform",
+ "//apex_available:anyapex",
+ ],
+ product_available: true,
+ vendor_available: true,
}
diff --git a/COPYING b/COPYING
index d5a7d7e..e343d38 100644
--- a/COPYING
+++ b/COPYING
@@ -1,8 +1,8 @@
This project is licensed under either of
* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
- http://www.apache.org/licenses/LICENSE-2.0)
+ https://www.apache.org/licenses/LICENSE-2.0)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or
- http://opensource.org/licenses/MIT)
+ https://opensource.org/licenses/MIT)
at your option.
diff --git a/Cargo.toml b/Cargo.toml
index 0f206ba..135bd38 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,32 +10,77 @@
# See Cargo.toml.orig for the original contents.
[package]
-edition = "2018"
+edition = "2021"
+rust-version = "1.60"
name = "bstr"
-version = "0.2.17"
+version = "1.3.0"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
exclude = ["/.github"]
description = "A string type that is not required to be valid UTF-8."
homepage = "https://github.com/BurntSushi/bstr"
documentation = "https://docs.rs/bstr"
readme = "README.md"
-keywords = ["string", "str", "byte", "bytes", "text"]
-categories = ["text-processing", "encoding"]
+keywords = [
+ "string",
+ "str",
+ "byte",
+ "bytes",
+ "text",
+]
+categories = [
+ "text-processing",
+ "encoding",
+]
license = "MIT OR Apache-2.0"
repository = "https://github.com/BurntSushi/bstr"
+resolver = "2"
+
+[package.metadata.docs.rs]
+all-features = true
+rustdoc-args = [
+ "--cfg",
+ "docsrs",
+]
+
[profile.release]
debug = true
[lib]
bench = false
-[dependencies.lazy_static]
-version = "1.2.0"
-optional = true
+
+[[example]]
+name = "graphemes"
+required-features = [
+ "std",
+ "unicode",
+]
+
+[[example]]
+name = "lines"
+required-features = ["std"]
+
+[[example]]
+name = "uppercase"
+required-features = [
+ "std",
+ "unicode",
+]
+
+[[example]]
+name = "words"
+required-features = [
+ "std",
+ "unicode",
+]
[dependencies.memchr]
version = "2.4.0"
default-features = false
+[dependencies.once_cell]
+version = "1.14.0"
+optional = true
+
[dependencies.regex-automata]
version = "0.1.5"
optional = true
@@ -45,6 +90,7 @@ default-features = false
version = "1.0.85"
optional = true
default-features = false
+
[dev-dependencies.quickcheck]
version = "1"
default-features = false
@@ -56,8 +102,18 @@ version = "0.1.3"
version = "1.2.1"
[features]
-default = ["std", "unicode"]
-serde1 = ["std", "serde1-nostd", "serde/std"]
-serde1-nostd = ["serde"]
-std = ["memchr/std"]
-unicode = ["lazy_static", "regex-automata"]
+alloc = ["serde?/alloc"]
+default = [
+ "std",
+ "unicode",
+]
+serde = ["dep:serde"]
+std = [
+ "alloc",
+ "memchr/std",
+ "serde?/std",
+]
+unicode = [
+ "dep:once_cell",
+ "dep:regex-automata",
+]
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index cbb6283..ef559d7 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,6 +1,6 @@
[package]
name = "bstr"
-version = "0.2.17" #:version
+version = "1.3.0" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = "A string type that is not required to be valid UTF-8."
documentation = "https://docs.rs/bstr"
@@ -11,7 +11,9 @@ keywords = ["string", "str", "byte", "bytes", "text"]
license = "MIT OR Apache-2.0"
categories = ["text-processing", "encoding"]
exclude = ["/.github"]
-edition = "2018"
+edition = "2021"
+rust-version = "1.60"
+resolver = "2"
[workspace]
members = ["bench"]
@@ -21,14 +23,14 @@ bench = false
[features]
default = ["std", "unicode"]
-std = ["memchr/std"]
-unicode = ["lazy_static", "regex-automata"]
-serde1 = ["std", "serde1-nostd", "serde/std"]
-serde1-nostd = ["serde"]
+std = ["alloc", "memchr/std", "serde?/std"]
+alloc = ["serde?/alloc"]
+unicode = ["dep:once_cell", "dep:regex-automata"]
+serde = ["dep:serde"]
[dependencies]
memchr = { version = "2.4.0", default-features = false }
-lazy_static = { version = "1.2.0", optional = true }
+once_cell = { version = "1.14.0", optional = true }
regex-automata = { version = "0.1.5", default-features = false, optional = true }
serde = { version = "1.0.85", default-features = false, optional = true }
@@ -37,5 +39,34 @@ quickcheck = { version = "1", default-features = false }
ucd-parse = "0.1.3"
unicode-segmentation = "1.2.1"
+[package.metadata.docs.rs]
+# We want to document all features.
+all-features = true
+# Since this crate's feature setup is pretty complicated, it is worth opting
+# into a nightly unstable option to show the features that need to be enabled
+# for public API items. To do that, we set 'docsrs', and when that's enabled,
+# we enable the 'doc_auto_cfg' feature.
+#
+# To test this locally, run:
+#
+# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features
+rustdoc-args = ["--cfg", "docsrs"]
+
[profile.release]
debug = true
+
+[[example]]
+name = "graphemes"
+required-features = ["std", "unicode"]
+
+[[example]]
+name = "lines"
+required-features = ["std"]
+
+[[example]]
+name = "uppercase"
+required-features = ["std", "unicode"]
+
+[[example]]
+name = "words"
+required-features = ["std", "unicode"]
diff --git a/METADATA b/METADATA
index bfc1d19..587f058 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,7 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update rust/crates/bstr
+# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md
+
name: "bstr"
description: "A string type that is not required to be valid UTF-8."
third_party {
@@ -7,13 +11,13 @@ third_party {
}
url {
type: ARCHIVE
- value: "https://static.crates.io/crates/bstr/bstr-0.2.17.crate"
+ value: "https://static.crates.io/crates/bstr/bstr-1.3.0.crate"
}
- version: "0.2.17"
+ version: "1.3.0"
license_type: NOTICE
last_upgrade_date {
- year: 2021
- month: 9
- day: 22
+ year: 2023
+ month: 3
+ day: 2
}
}
diff --git a/README.md b/README.md
index 13bf0fc..080926e 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ differs from the standard library's `String` and `str` types in that they are
not required to be valid UTF-8, but may be fully or partially valid UTF-8.
[![Build status](https://github.com/BurntSushi/bstr/workflows/ci/badge.svg)](https://github.com/BurntSushi/bstr/actions)
-[![](https://meritbadge.herokuapp.com/bstr)](https://crates.io/crates/bstr)
+[![crates.io](https://img.shields.io/crates/v/bstr.svg)](https://crates.io/crates/bstr)
### Documentation
@@ -17,7 +17,7 @@ https://docs.rs/bstr
### When should I use byte strings?
See this part of the documentation for more details:
-https://docs.rs/bstr/0.2.*/bstr/#when-should-i-use-byte-strings.
+<https://docs.rs/bstr/1.*/bstr/#when-should-i-use-byte-strings>.
The short story is that byte strings are useful when it is inconvenient or
incorrect to require valid UTF-8.
@@ -29,7 +29,7 @@ Add this to your `Cargo.toml`:
```toml
[dependencies]
-bstr = "0.2"
+bstr = "1"
```
@@ -38,13 +38,11 @@ bstr = "0.2"
The following two examples exhibit both the API features of byte strings and
the I/O convenience functions provided for reading line-by-line quickly.
-This first example simply shows how to efficiently iterate over lines in
-stdin, and print out lines containing a particular substring:
+This first example simply shows how to efficiently iterate over lines in stdin,
+and print out lines containing a particular substring:
```rust
-use std::error::Error;
-use std::io::{self, Write};
-
+use std::{error::Error, io::{self, Write}};
use bstr::{ByteSlice, io::BufReadExt};
fn main() -> Result<(), Box<dyn Error>> {
@@ -65,9 +63,7 @@ This example shows how to count all of the words (Unicode-aware) in stdin,
line-by-line:
```rust
-use std::error::Error;
-use std::io;
-
+use std::{error::Error, io};
use bstr::{ByteSlice, io::BufReadExt};
fn main() -> Result<(), Box<dyn Error>> {
@@ -88,9 +84,7 @@ text, this is quite a bit faster than what you can (easily) do with standard
library APIs. (N.B. Any invalid UTF-8 bytes are passed through unchanged.)
```rust
-use std::error::Error;
-use std::io::{self, Write};
-
+use std::{error::Error, io::{self, Write}};
use bstr::{ByteSlice, io::BufReadExt};
fn main() -> Result<(), Box<dyn Error>> {
@@ -113,9 +107,7 @@ clusters) from each line, where invalid UTF-8 sequences are generally treated
as a single character and are passed through correctly:
```rust
-use std::error::Error;
-use std::io::{self, Write};
-
+use std::{error::Error, io::{self, Write}};
use bstr::{ByteSlice, io::BufReadExt};
fn main() -> Result<(), Box<dyn Error>> {
@@ -140,25 +132,27 @@ fn main() -> Result<(), Box<dyn Error>> {
### Cargo features
-This crates comes with a few features that control standard library, serde
-and Unicode support.
+This crates comes with a few features that control standard library, serde and
+Unicode support.
* `std` - **Enabled** by default. This provides APIs that require the standard
- library, such as `Vec<u8>`.
+ library, such as `Vec<u8>` and `PathBuf`. Enabling this feature also enables
+ the `alloc` feature.
+* `alloc` - **Enabled** by default. This provides APIs that require allocations
+ via the `alloc` crate, such as `Vec<u8>`.
* `unicode` - **Enabled** by default. This provides APIs that require sizable
Unicode data compiled into the binary. This includes, but is not limited to,
grapheme/word/sentence segmenters. When this is disabled, basic support such
- as UTF-8 decoding is still included.
-* `serde1` - **Disabled** by default. Enables implementations of serde traits
- for the `BStr` and `BString` types.
-* `serde1-nostd` - **Disabled** by default. Enables implementations of serde
- traits for the `BStr` type only, intended for use without the standard
- library. Generally, you either want `serde1` or `serde1-nostd`, not both.
+ as UTF-8 decoding is still included. Note that currently, enabling this
+ feature also requires enabling the `std` feature. It is expected that this
+ limitation will be lifted at some point.
+* `serde` - Enables implementations of serde traits for `BStr`, and also
+ `BString` when `alloc` is enabled.
### Minimum Rust version policy
-This crate's minimum supported `rustc` version (MSRV) is `1.41.1`.
+This crate's minimum supported `rustc` version (MSRV) is `1.60.0`.
In general, this crate will be conservative with respect to the minimum
supported version of Rust. MSRV may be bumped in minor version releases.
@@ -166,27 +160,27 @@ supported version of Rust. MSRV may be bumped in minor version releases.
### Future work
-Since this is meant to be a core crate, getting a `1.0` release is a priority.
-My hope is to move to `1.0` within the next year and commit to its API so that
-`bstr` can be used as a public dependency.
+Since it is plausible that some of the types in this crate might end up in your
+public API (e.g., `BStr` and `BString`), we will commit to being very
+conservative with respect to new major version releases. It's difficult to say
+precisely how conservative, but unless there is a major issue with the `1.0`
+release, I wouldn't expect a `2.0` release to come out any sooner than some
+period of years.
A large part of the API surface area was taken from the standard library, so
from an API design perspective, a good portion of this crate should be on solid
-ground already. The main differences from the standard library are in how the
-various substring search routines work. The standard library provides generic
+ground. The main differences from the standard library are in how the various
+substring search routines work. The standard library provides generic
infrastructure for supporting different types of searches with a single method,
where as this library prefers to define new methods for each type of search and
drop the generic infrastructure.
Some _probable_ future considerations for APIs include, but are not limited to:
-* A convenience layer on top of the `aho-corasick` crate.
* Unicode normalization.
* More sophisticated support for dealing with Unicode case, perhaps by
combining the use cases supported by [`caseless`](https://docs.rs/caseless)
and [`unicase`](https://docs.rs/unicase).
-* Add facilities for dealing with OS strings and file paths, probably via
- simple conversion routines.
Here are some examples that are _probably_ out of scope for this crate:
@@ -208,16 +202,16 @@ achieved with the standard library `Vec<u8>`/`&[u8]` APIs and the ecosystem of
library crates. For example:
* The standard library's
- [`Utf8Error`](https://doc.rust-lang.org/std/str/struct.Utf8Error.html)
- can be used for incremental lossy decoding of `&[u8]`.
+ [`Utf8Error`](https://doc.rust-lang.org/std/str/struct.Utf8Error.html) can be
+ used for incremental lossy decoding of `&[u8]`.
* The
[`unicode-segmentation`](https://unicode-rs.github.io/unicode-segmentation/unicode_segmentation/index.html)
crate can be used for iterating over graphemes (or words), but is only
implemented for `&str` types. One could use `Utf8Error` above to implement
grapheme iteration with the same semantics as what `bstr` provides (automatic
Unicode replacement codepoint substitution).
-* The [`twoway`](https://docs.rs/twoway) crate can be used for
- fast substring searching on `&[u8]`.
+* The [`twoway`](https://docs.rs/twoway) crate can be used for fast substring
+ searching on `&[u8]`.
So why create `bstr`? Part of the point of the `bstr` crate is to provide a
uniform API of coupled components instead of relying on users to piece together
diff --git a/src/ascii.rs b/src/ascii.rs
index bb2b679..259d41f 100644
--- a/src/ascii.rs
+++ b/src/ascii.rs
@@ -23,18 +23,18 @@ use core::mem;
// means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to
// _mm_movemask_epi8.
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const USIZE_BYTES: usize = mem::size_of::<usize>();
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const FALLBACK_LOOP_SIZE: usize = 2 * USIZE_BYTES;
// This is a mask where the most significant bit of each byte in the usize
// is set. We test this bit to determine whether a character is ASCII or not.
// Namely, a single byte is regarded as an ASCII codepoint if and only if it's
// most significant bit is not set.
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const ASCII_MASK_U64: u64 = 0x8080808080808080;
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const ASCII_MASK: usize = ASCII_MASK_U64 as usize;
/// Returns the index of the first non ASCII byte in the given slice.
@@ -42,18 +42,18 @@ const ASCII_MASK: usize = ASCII_MASK_U64 as usize;
/// If slice only contains ASCII bytes, then the length of the slice is
/// returned.
pub fn first_non_ascii_byte(slice: &[u8]) -> usize {
- #[cfg(not(target_arch = "x86_64"))]
+ #[cfg(any(miri, not(target_arch = "x86_64")))]
{
first_non_ascii_byte_fallback(slice)
}
- #[cfg(target_arch = "x86_64")]
+ #[cfg(all(not(miri), target_arch = "x86_64"))]
{
first_non_ascii_byte_sse2(slice)
}
}
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize {
let align = USIZE_BYTES - 1;
let start_ptr = slice.as_ptr();
@@ -115,7 +115,7 @@ fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize {
}
}
-#[cfg(target_arch = "x86_64")]
+#[cfg(all(not(miri), target_arch = "x86_64"))]
fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize {
use core::arch::x86_64::*;
@@ -221,7 +221,7 @@ unsafe fn first_non_ascii_byte_slow(
/// bytes is not an ASCII byte.
///
/// The position returned is always in the inclusive range [0, 7].
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
fn first_non_ascii_byte_mask(mask: usize) -> usize {
#[cfg(target_endian = "little")]
{
@@ -245,7 +245,7 @@ unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 {
ptr.offset((amt as isize).wrapping_neg())
}
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
unsafe fn read_unaligned_usize(ptr: *const u8) -> usize {
use core::ptr;
@@ -286,6 +286,7 @@ mod tests {
#[test]
#[cfg(target_arch = "x86_64")]
+ #[cfg(not(miri))]
fn positive_sse2_forward() {
for i in 0..517 {
let b = "a".repeat(i).into_bytes();
@@ -294,6 +295,7 @@ mod tests {
}
#[test]
+ #[cfg(not(miri))]
fn negative_fallback_forward() {
for i in 0..517 {
for align in 0..65 {
@@ -315,6 +317,7 @@ mod tests {
#[test]
#[cfg(target_arch = "x86_64")]
+ #[cfg(not(miri))]
fn negative_sse2_forward() {
for i in 0..517 {
for align in 0..65 {
diff --git a/src/bstr.rs b/src/bstr.rs
index 1e3c91b..5036f06 100644
--- a/src/bstr.rs
+++ b/src/bstr.rs
@@ -1,5 +1,8 @@
use core::mem;
+#[cfg(feature = "alloc")]
+use alloc::boxed::Box;
+
/// A wrapper for `&[u8]` that provides convenient string oriented trait impls.
///
/// If you need ownership or a growable byte string buffer, then use
@@ -33,8 +36,31 @@ pub struct BStr {
}
impl BStr {
+ /// Directly creates a `BStr` slice from anything that can be converted
+ /// to a byte slice.
+ ///
+ /// This is very similar to the [`B`](crate::B) function, except this
+ /// returns a `&BStr` instead of a `&[u8]`.
+ ///
+ /// This is a cost-free conversion.
+ ///
+ /// # Example
+ ///
+ /// You can create `BStr`'s from byte arrays, byte slices or even string
+ /// slices:
+ ///
+ /// ```
+ /// use bstr::BStr;
+ ///
+ /// let a = BStr::new(b"abc");
+ /// let b = BStr::new(&b"abc"[..]);
+ /// let c = BStr::new("abc");
+ ///
+ /// assert_eq!(a, b);
+ /// assert_eq!(a, c);
+ /// ```
#[inline]
- pub(crate) fn new<B: ?Sized + AsRef<[u8]>>(bytes: &B) -> &BStr {
+ pub fn new<'a, B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> &'a BStr {
BStr::from_bytes(bytes.as_ref())
}
@@ -56,13 +82,13 @@ impl BStr {
}
#[inline]
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
pub(crate) fn from_boxed_bytes(slice: Box<[u8]>) -> Box<BStr> {
unsafe { Box::from_raw(Box::into_raw(slice) as _) }
}
#[inline]
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
pub(crate) fn into_boxed_bytes(slice: Box<BStr>) -> Box<[u8]> {
unsafe { Box::from_raw(Box::into_raw(slice) as _) }
}
diff --git a/src/bstring.rs b/src/bstring.rs
index 30093ba..d144b1d 100644
--- a/src/bstring.rs
+++ b/src/bstring.rs
@@ -1,3 +1,5 @@
+use alloc::vec::Vec;
+
use crate::bstr::BStr;
/// A wrapper for `Vec<u8>` that provides convenient string oriented trait
@@ -38,16 +40,43 @@ use crate::bstr::BStr;
/// region of memory containing the bytes, a length and a capacity.
#[derive(Clone, Hash)]
pub struct BString {
- pub(crate) bytes: Vec<u8>,
+ bytes: Vec<u8>,
}
impl BString {
+ /// Constructs a new `BString` from the given [`Vec`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use bstr::BString;
+ ///
+ /// let mut b = BString::new(Vec::with_capacity(10));
+ /// ```
+ ///
+ /// This function is `const`:
+ ///
+ /// ```
+ /// use bstr::BString;
+ ///
+ /// const B: BString = BString::new(vec![]);
+ /// ```
+ #[inline]
+ pub const fn new(bytes: Vec<u8>) -> BString {
+ BString { bytes }
+ }
+
#[inline]
pub(crate) fn as_bytes(&self) -> &[u8] {
&self.bytes
}
#[inline]
+ pub(crate) fn as_bytes_mut(&mut self) -> &mut [u8] {
+ &mut self.bytes
+ }
+
+ #[inline]
pub(crate) fn as_bstr(&self) -> &BStr {
BStr::new(&self.bytes)
}
@@ -56,4 +85,19 @@ impl BString {
pub(crate) fn as_mut_bstr(&mut self) -> &mut BStr {
BStr::new_mut(&mut self.bytes)
}
+
+ #[inline]
+ pub(crate) fn as_vec(&self) -> &Vec<u8> {
+ &self.bytes
+ }
+
+ #[inline]
+ pub(crate) fn as_vec_mut(&mut self) -> &mut Vec<u8> {
+ &mut self.bytes
+ }
+
+ #[inline]
+ pub(crate) fn into_vec(self) -> Vec<u8> {
+ self.bytes
+ }
}
diff --git a/src/byteset/mod.rs b/src/byteset/mod.rs
index 043d309..c6c697c 100644
--- a/src/byteset/mod.rs
+++ b/src/byteset/mod.rs
@@ -1,4 +1,5 @@
use memchr::{memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3};
+
mod scalar;
#[inline]
@@ -79,7 +80,7 @@ pub(crate) fn rfind_not(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std", not(miri)))]
mod tests {
quickcheck::quickcheck! {
fn qc_byteset_forward_matches_naive(
diff --git a/src/byteset/scalar.rs b/src/byteset/scalar.rs
index 9bd34a8..28bff67 100644
--- a/src/byteset/scalar.rs
+++ b/src/byteset/scalar.rs
@@ -1,9 +1,8 @@
// This is adapted from `fallback.rs` from rust-memchr. It's modified to return
-// the 'inverse' query of memchr, e.g. finding the first byte not in the provided
-// set. This is simple for the 1-byte case.
+// the 'inverse' query of memchr, e.g. finding the first byte not in the
+// provided set. This is simple for the 1-byte case.
-use core::cmp;
-use core::usize;
+use core::{cmp, usize};
#[cfg(target_pointer_width = "32")]
const USIZE_BYTES: usize = 4;
@@ -29,10 +28,11 @@ pub fn inv_memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
let loop_size = cmp::min(LOOP_SIZE, haystack.len());
let align = USIZE_BYTES - 1;
let start_ptr = haystack.as_ptr();
- let end_ptr = haystack[haystack.len()..].as_ptr();
- let mut ptr = start_ptr;
unsafe {
+ let end_ptr = haystack.as_ptr().add(haystack.len());
+ let mut ptr = start_ptr;
+
if haystack.len() < USIZE_BYTES {
return forward_search(start_ptr, end_ptr, ptr, confirm);
}
@@ -68,10 +68,11 @@ pub fn inv_memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
let loop_size = cmp::min(LOOP_SIZE, haystack.len());
let align = USIZE_BYTES - 1;
let start_ptr = haystack.as_ptr();
- let end_ptr = haystack[haystack.len()..].as_ptr();
- let mut ptr = end_ptr;
unsafe {
+ let end_ptr = haystack.as_ptr().add(haystack.len());
+ let mut ptr = end_ptr;
+
if haystack.len() < USIZE_BYTES {
return reverse_search(start_ptr, end_ptr, ptr, confirm);
}
@@ -81,7 +82,7 @@ pub fn inv_memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
return reverse_search(start_ptr, end_ptr, ptr, confirm);
}
- ptr = (end_ptr as usize & !align) as *const u8;
+ ptr = ptr.sub(end_ptr as usize & align);
debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
@@ -174,9 +175,10 @@ pub(crate) fn reverse_search_bytes<F: Fn(u8) -> bool>(
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
use super::{inv_memchr, inv_memrchr};
+
// search string, search byte, inv_memchr result, inv_memrchr result.
// these are expanded into a much larger set of tests in build_tests
const TESTS: &[(&[u8], u8, usize, usize)] = &[
@@ -192,10 +194,15 @@ mod tests {
type TestCase = (Vec<u8>, u8, Option<(usize, usize)>);
fn build_tests() -> Vec<TestCase> {
+ #[cfg(not(miri))]
+ const MAX_PER: usize = 515;
+ #[cfg(miri)]
+ const MAX_PER: usize = 10;
+
let mut result = vec![];
for &(search, byte, fwd_pos, rev_pos) in TESTS {
result.push((search.to_vec(), byte, Some((fwd_pos, rev_pos))));
- for i in 1..515 {
+ for i in 1..MAX_PER {
// add a bunch of copies of the search byte to the end.
let mut suffixed: Vec<u8> = search.into();
suffixed.extend(std::iter::repeat(byte).take(i));
@@ -225,7 +232,7 @@ mod tests {
}
// build non-matching tests for several sizes
- for i in 0..515 {
+ for i in 0..MAX_PER {
result.push((
std::iter::repeat(b'\0').take(i).collect(),
b'\0',
@@ -239,6 +246,12 @@ mod tests {
#[test]
fn test_inv_memchr() {
use crate::{ByteSlice, B};
+
+ #[cfg(not(miri))]
+ const MAX_OFFSET: usize = 130;
+ #[cfg(miri)]
+ const MAX_OFFSET: usize = 13;
+
for (search, byte, matching) in build_tests() {
assert_eq!(
inv_memchr(byte, &search),
@@ -256,13 +269,14 @@ mod tests {
// better printing
B(&search).as_bstr(),
);
- // Test a rather large number off offsets for potential alignment issues
- for offset in 1..130 {
+ // Test a rather large number off offsets for potential alignment
+ // issues.
+ for offset in 1..MAX_OFFSET {
if offset >= search.len() {
break;
}
- // If this would cause us to shift the results off the end, skip
- // it so that we don't have to recompute them.
+ // If this would cause us to shift the results off the end,
+ // skip it so that we don't have to recompute them.
if let Some((f, r)) = matching {
if offset > f || offset > r {
break;
diff --git a/src/ext_slice.rs b/src/ext_slice.rs
index 0cc73af..91af450 100644
--- a/src/ext_slice.rs
+++ b/src/ext_slice.rs
@@ -1,17 +1,16 @@
+use core::{iter, slice, str};
+
+#[cfg(all(feature = "alloc", feature = "unicode"))]
+use alloc::vec;
+#[cfg(feature = "alloc")]
+use alloc::{borrow::Cow, string::String, vec::Vec};
+
#[cfg(feature = "std")]
-use std::borrow::Cow;
-#[cfg(feature = "std")]
-use std::ffi::OsStr;
-#[cfg(feature = "std")]
-use std::path::Path;
+use std::{ffi::OsStr, path::Path};
-use core::{iter, ops, ptr, slice, str};
use memchr::{memchr, memmem, memrchr};
-use crate::ascii;
-use crate::bstr::BStr;
-use crate::byteset;
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
use crate::ext_vec::ByteVec;
#[cfg(feature = "unicode")]
use crate::unicode::{
@@ -19,7 +18,12 @@ use crate::unicode::{
SentenceIndices, Sentences, WordIndices, Words, WordsWithBreakIndices,
WordsWithBreaks,
};
-use crate::utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error};
+use crate::{
+ ascii,
+ bstr::BStr,
+ byteset,
+ utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error},
+};
/// A short-hand constructor for building a `&[u8]`.
///
@@ -83,13 +87,30 @@ impl ByteSlice for [u8] {
}
}
+impl<const N: usize> ByteSlice for [u8; N] {
+ #[inline]
+ fn as_bytes(&self) -> &[u8] {
+ self
+ }
+
+ #[inline]
+ fn as_bytes_mut(&mut self) -> &mut [u8] {
+ self
+ }
+}
+
/// Ensure that callers cannot implement `ByteSlice` by making an
/// umplementable trait its super trait.
-pub trait Sealed {}
-impl Sealed for [u8] {}
+mod private {
+ pub trait Sealed {}
+}
+impl private::Sealed for [u8] {}
+impl<const N: usize> private::Sealed for [u8; N] {}
/// A trait that extends `&[u8]` with string oriented methods.
-pub trait ByteSlice: Sealed {
+///
+/// This trait is sealed and cannot be implemented outside of `bstr`.
+pub trait ByteSlice: private::Sealed {
/// A method for accessing the raw bytes of this type. This is always a
/// no-op and callers shouldn't care about it. This only exists for making
/// the extension trait work.
@@ -149,11 +170,12 @@ pub trait ByteSlice: Sealed {
/// Create an immutable byte string from an OS string slice.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns `None` if the given OS string is not valid UTF-8. (For
- /// example, on Windows, file paths are allowed to be a sequence of
- /// arbitrary 16-bit integers. Not all such sequences can be transcoded to
- /// valid UTF-8.)
+ /// When the underlying bytes of OS strings are accessible, then this
+ /// always succeeds and is zero cost. Otherwise, this returns `None` if the
+ /// given OS string is not valid UTF-8. (For example, when the underlying
+ /// bytes are inaccessible on Windows, file paths are allowed to be a
+ /// sequence of arbitrary 16-bit integers. Not all such sequences can be
+ /// transcoded to valid UTF-8.)
///
/// # Examples
///
@@ -190,10 +212,12 @@ pub trait ByteSlice: Sealed {
/// Create an immutable byte string from a file path.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns `None` if the given path is not valid UTF-8. (For example,
- /// on Windows, file paths are allowed to be a sequence of arbitrary 16-bit
- /// integers. Not all such sequences can be transcoded to valid UTF-8.)
+ /// When the underlying bytes of paths are accessible, then this always
+ /// succeeds and is zero cost. Otherwise, this returns `None` if the given
+ /// path is not valid UTF-8. (For example, when the underlying bytes are
+ /// inaccessible on Windows, file paths are allowed to be a sequence of
+ /// arbitrary 16-bit integers. Not all such sequences can be transcoded to
+ /// valid UTF-8.)
///
/// # Examples
///
@@ -230,6 +254,7 @@ pub trait ByteSlice: Sealed {
/// Basic usage:
///
/// ```
+ /// # #[cfg(feature = "alloc")] {
/// use bstr::{B, ByteSlice, ByteVec};
///
/// # fn example() -> Result<(), bstr::Utf8Error> {
@@ -241,6 +266,7 @@ pub trait ByteSlice: Sealed {
/// let err = bstring.to_str().unwrap_err();
/// assert_eq!(8, err.valid_up_to());
/// # Ok(()) }; example().unwrap()
+ /// # }
/// ```
#[inline]
fn to_str(&self) -> Result<&str, Utf8Error> {
@@ -301,7 +327,7 @@ pub trait ByteSlice: Sealed {
/// [W3C's Encoding standard](https://www.w3.org/TR/encoding/).
/// For a more precise description of the maximal subpart strategy, see
/// the Unicode Standard, Chapter 3, Section 9. See also
- /// [Public Review Issue #121](http://www.unicode.org/review/pr-121.html).
+ /// [Public Review Issue #121](https://www.unicode.org/review/pr-121.html).
///
/// N.B. Rust's standard library also appears to use the same strategy,
/// but it does not appear to be an API guarantee.
@@ -341,7 +367,7 @@ pub trait ByteSlice: Sealed {
/// let bs = B(b"\x61\xF1\x80\x80\xE1\x80\xC2\x62");
/// assert_eq!("a\u{FFFD}\u{FFFD}\u{FFFD}b", bs.to_str_lossy());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn to_str_lossy(&self) -> Cow<'_, str> {
match utf8::validate(self.as_bytes()) {
@@ -398,7 +424,7 @@ pub trait ByteSlice: Sealed {
/// bstring.to_str_lossy_into(&mut dest);
/// assert_eq!("☃βツ\u{FFFD}", dest);
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn to_str_lossy_into(&self, dest: &mut String) {
let mut bytes = self.as_bytes();
@@ -428,12 +454,15 @@ pub trait ByteSlice: Sealed {
/// Create an OS string slice from this byte string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns a UTF-8 decoding error if this byte string is not valid
- /// UTF-8. (For example, on Windows, file paths are allowed to be a
- /// sequence of arbitrary 16-bit integers. There is no obvious mapping from
- /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of
- /// 16-bit integers.)
+ /// When OS strings can be constructed from arbitrary byte sequences, this
+ /// always succeeds and is zero cost. Otherwise, this returns a UTF-8
+ /// decoding error if this byte string is not valid UTF-8. (For example,
+ /// assuming the representation of `OsStr` is opaque on Windows, file paths
+ /// are allowed to be a sequence of arbitrary 16-bit integers. There is
+ /// no obvious mapping from an arbitrary sequence of 8-bit integers to an
+ /// arbitrary sequence of 16-bit integers. If the representation of `OsStr`
+ /// is ever opened up, then this will convert any sequence of bytes to an
+ /// `OsStr` without cost.)
///
/// # Examples
///
@@ -467,13 +496,13 @@ pub trait ByteSlice: Sealed {
/// Lossily create an OS string slice from this byte string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this will perform a UTF-8 check and lossily convert this byte string
- /// into valid UTF-8 using the Unicode replacement codepoint.
+ /// When OS strings can be constructed from arbitrary byte sequences, this
+ /// is zero cost and always returns a slice. Otherwise, this will perform a
+ /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
+ /// the Unicode replacement codepoint.
///
- /// Note that this can prevent the correct roundtripping of file paths on
- /// non-Unix systems such as Windows, where file paths are an arbitrary
- /// sequence of 16-bit integers.
+ /// Note that this can prevent the correct roundtripping of file paths when
+ /// the representation of `OsStr` is opaque.
///
/// # Examples
///
@@ -512,12 +541,15 @@ pub trait ByteSlice: Sealed {
/// Create a path slice from this byte string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns a UTF-8 decoding error if this byte string is not valid
- /// UTF-8. (For example, on Windows, file paths are allowed to be a
- /// sequence of arbitrary 16-bit integers. There is no obvious mapping from
- /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of
- /// 16-bit integers.)
+ /// When paths can be constructed from arbitrary byte sequences, this
+ /// always succeeds and is zero cost. Otherwise, this returns a UTF-8
+ /// decoding error if this byte string is not valid UTF-8. (For example,
+ /// assuming the representation of `Path` is opaque on Windows, file paths
+ /// are allowed to be a sequence of arbitrary 16-bit integers. There is
+ /// no obvious mapping from an arbitrary sequence of 8-bit integers to an
+ /// arbitrary sequence of 16-bit integers. If the representation of `Path`
+ /// is ever opened up, then this will convert any sequence of bytes to a
+ /// `Path` without cost.)
///
/// # Examples
///
@@ -537,13 +569,13 @@ pub trait ByteSlice: Sealed {
/// Lossily create a path slice from this byte string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this will perform a UTF-8 check and lossily convert this byte string
- /// into valid UTF-8 using the Unicode replacement codepoint.
+ /// When paths can be constructed from arbitrary byte sequences, this is
+ /// zero cost and always returns a slice. Otherwise, this will perform a
+ /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
+ /// the Unicode replacement codepoint.
///
- /// Note that this can prevent the correct roundtripping of file paths on
- /// non-Unix systems such as Windows, where file paths are an arbitrary
- /// sequence of 16-bit integers.
+ /// Note that this can prevent the correct roundtripping of file paths when
+ /// the representation of `Path` is opaque.
///
/// # Examples
///
@@ -584,15 +616,10 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(b"foo".repeatn(4), B("foofoofoofoo"));
/// assert_eq!(b"foo".repeatn(0), B(""));
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn repeatn(&self, n: usize) -> Vec<u8> {
- let bs = self.as_bytes();
- let mut dst = vec![0; bs.len() * n];
- for i in 0..n {
- dst[i * bs.len()..(i + 1) * bs.len()].copy_from_slice(bs);
- }
- dst
+ self.as_bytes().repeat(n)
}
/// Returns true if and only if this byte string contains the given needle.
@@ -759,10 +786,10 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(matches, vec![0]);
/// ```
#[inline]
- fn find_iter<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
- needle: &'a B,
- ) -> Find<'a> {
+ fn find_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ needle: &'n B,
+ ) -> Find<'h, 'n> {
Find::new(self.as_bytes(), needle.as_ref())
}
@@ -804,10 +831,10 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(matches, vec![0]);
/// ```
#[inline]
- fn rfind_iter<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
- needle: &'a B,
- ) -> FindReverse<'a> {
+ fn rfind_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ needle: &'n B,
+ ) -> FindReverse<'h, 'n> {
FindReverse::new(self.as_bytes(), needle.as_ref())
}
@@ -926,14 +953,17 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(b"foo bar baz".find_byteset(b"zr"), Some(6));
/// assert_eq!(b"foo baz bar".find_byteset(b"bzr"), Some(4));
/// assert_eq!(None, b"foo baz bar".find_byteset(b"\t\n"));
+ /// // The empty byteset never matches.
+ /// assert_eq!(None, b"abc".find_byteset(b""));
+ /// assert_eq!(None, b"".find_byteset(b""));
/// ```
#[inline]
fn find_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
byteset::find(self.as_bytes(), byteset.as_ref())
}
- /// Returns the index of the first occurrence of a byte that is not a member
- /// of the provided set.
+ /// Returns the index of the first occurrence of a byte that is not a
+ /// member of the provided set.
///
/// The `byteset` may be any type that can be cheaply converted into a
/// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
@@ -963,6 +993,10 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(b"foo bar baz".find_not_byteset(b"fo "), Some(4));
/// assert_eq!(b"\t\tbaz bar".find_not_byteset(b" \t\r\n"), Some(2));
/// assert_eq!(b"foo\nbaz\tbar".find_not_byteset(b"\t\n"), Some(0));
+ /// // The negation of the empty byteset matches everything.
+ /// assert_eq!(Some(0), b"abc".find_not_byteset(b""));
+ /// // But an empty string never contains anything.
+ /// assert_eq!(None, b"".find_not_byteset(b""));
/// ```
#[inline]
fn find_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
@@ -1043,8 +1077,9 @@ pub trait ByteSlice: Sealed {
byteset::rfind_not(self.as_bytes(), byteset.as_ref())
}
- /// Returns an iterator over the fields in a byte string, separated by
- /// contiguous whitespace.
+ /// Returns an iterator over the fields in a byte string, separated
+ /// by contiguous whitespace (according to the Unicode property
+ /// `White_Space`).
///
/// # Example
///
@@ -1065,6 +1100,7 @@ pub trait ByteSlice: Sealed {
///
/// assert_eq!(0, B(" \n\t\u{2003}\n \t").fields().count());
/// ```
+ #[cfg(feature = "unicode")]
#[inline]
fn fields(&self) -> Fields<'_> {
Fields::new(self.as_bytes())
@@ -1191,10 +1227,10 @@ pub trait ByteSlice: Sealed {
/// It does *not* give you `["a", "b", "c"]`. For that behavior, use
/// [`fields`](#method.fields) instead.
#[inline]
- fn split_str<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
- splitter: &'a B,
- ) -> Split<'a> {
+ fn split_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ splitter: &'s B,
+ ) -> Split<'h, 's> {
Split::new(self.as_bytes(), splitter.as_ref())
}
@@ -1285,13 +1321,101 @@ pub trait ByteSlice: Sealed {
///
/// It does *not* give you `["a", "b", "c"]`.
#[inline]
- fn rsplit_str<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
- splitter: &'a B,
- ) -> SplitReverse<'a> {
+ fn rsplit_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ splitter: &'s B,
+ ) -> SplitReverse<'h, 's> {
SplitReverse::new(self.as_bytes(), splitter.as_ref())
}
+ /// Split this byte string at the first occurrence of `splitter`.
+ ///
+ /// If the `splitter` is found in the byte string, returns a tuple
+ /// containing the parts of the string before and after the first
+ /// occurrence of `splitter`, respectively. Otherwise, if there are no occurrences of
+ /// `splitter` in the byte string, returns `None`.
+ ///
+ /// The splitter may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// If you need to split on the *last* instance of a delimiter instead, see
+ /// the [`ByteSlice::rsplit_once_str`](#method.rsplit_once_str) method.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert_eq!(
+ /// B("foo,bar").split_once_str(","),
+ /// Some((B("foo"), B("bar"))),
+ /// );
+ /// assert_eq!(
+ /// B("foo,bar,baz").split_once_str(","),
+ /// Some((B("foo"), B("bar,baz"))),
+ /// );
+ /// assert_eq!(B("foo").split_once_str(","), None);
+ /// assert_eq!(B("foo,").split_once_str(b","), Some((B("foo"), B(""))));
+ /// assert_eq!(B(",foo").split_once_str(b","), Some((B(""), B("foo"))));
+ /// ```
+ #[inline]
+ fn split_once_str<'a, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ splitter: &B,
+ ) -> Option<(&'a [u8], &'a [u8])> {
+ let bytes = self.as_bytes();
+ let splitter = splitter.as_ref();
+ let start = Finder::new(splitter).find(bytes)?;
+ let end = start + splitter.len();
+ Some((&bytes[..start], &bytes[end..]))
+ }
+
+ /// Split this byte string at the last occurrence of `splitter`.
+ ///
+ /// If the `splitter` is found in the byte string, returns a tuple
+ /// containing the parts of the string before and after the last occurrence
+ /// of `splitter`, respectively. Otherwise, if there are no occurrences of
+ /// `splitter` in the byte string, returns `None`.
+ ///
+ /// The splitter may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// If you need to split on the *first* instance of a delimiter instead, see
+ /// the [`ByteSlice::split_once_str`](#method.split_once_str) method.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert_eq!(
+ /// B("foo,bar").rsplit_once_str(","),
+ /// Some((B("foo"), B("bar"))),
+ /// );
+ /// assert_eq!(
+ /// B("foo,bar,baz").rsplit_once_str(","),
+ /// Some((B("foo,bar"), B("baz"))),
+ /// );
+ /// assert_eq!(B("foo").rsplit_once_str(","), None);
+ /// assert_eq!(B("foo,").rsplit_once_str(b","), Some((B("foo"), B(""))));
+ /// assert_eq!(B(",foo").rsplit_once_str(b","), Some((B(""), B("foo"))));
+ /// ```
+ #[inline]
+ fn rsplit_once_str<'a, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ splitter: &B,
+ ) -> Option<(&'a [u8], &'a [u8])> {
+ let bytes = self.as_bytes();
+ let splitter = splitter.as_ref();
+ let start = FinderReverse::new(splitter).rfind(bytes)?;
+ let end = start + splitter.len();
+ Some((&bytes[..start], &bytes[end..]))
+ }
+
/// Returns an iterator of at most `limit` substrings of this byte string,
/// separated by the given byte string. If `limit` substrings are yielded,
/// then the last substring will contain the remainder of this byte string.
@@ -1328,11 +1452,11 @@ pub trait ByteSlice: Sealed {
/// assert!(x.is_empty());
/// ```
#[inline]
- fn splitn_str<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
+ fn splitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
limit: usize,
- splitter: &'a B,
- ) -> SplitN<'a> {
+ splitter: &'s B,
+ ) -> SplitN<'h, 's> {
SplitN::new(self.as_bytes(), splitter.as_ref(), limit)
}
@@ -1374,11 +1498,11 @@ pub trait ByteSlice: Sealed {
/// assert!(x.is_empty());
/// ```
#[inline]
- fn rsplitn_str<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
+ fn rsplitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
limit: usize,
- splitter: &'a B,
- ) -> SplitNReverse<'a> {
+ splitter: &'s B,
+ ) -> SplitNReverse<'h, 's> {
SplitNReverse::new(self.as_bytes(), splitter.as_ref(), limit)
}
@@ -1416,7 +1540,7 @@ pub trait ByteSlice: Sealed {
/// let s = b"foo".replace("", "Z");
/// assert_eq!(s, "ZfZoZoZ".as_bytes());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn replace<N: AsRef<[u8]>, R: AsRef<[u8]>>(
&self,
@@ -1462,7 +1586,7 @@ pub trait ByteSlice: Sealed {
/// let s = b"foo".replacen("", "Z", 2);
/// assert_eq!(s, "ZfZoo".as_bytes());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn replacen<N: AsRef<[u8]>, R: AsRef<[u8]>>(
&self,
@@ -1520,7 +1644,7 @@ pub trait ByteSlice: Sealed {
/// s.replace_into("", "Z", &mut dest);
/// assert_eq!(dest, "ZfZoZoZ".as_bytes());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn replace_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
&self,
@@ -1584,7 +1708,7 @@ pub trait ByteSlice: Sealed {
/// s.replacen_into("", "Z", 2, &mut dest);
/// assert_eq!(dest, "ZfZoo".as_bytes());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn replacen_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
&self,
@@ -1795,11 +1919,12 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(vec![(0, 5, "à̖"), (5, 13, "🇺🇸")], graphemes);
/// ```
///
- /// This example shows what happens when invalid UTF-8 is enountered. Note
+ /// This example shows what happens when invalid UTF-8 is encountered. Note
/// that the offsets are valid indices into the original string, and do
/// not necessarily correspond to the length of the `&str` returned!
///
/// ```
+ /// # #[cfg(all(feature = "alloc"))] {
/// use bstr::{ByteSlice, ByteVec};
///
/// let mut bytes = vec![];
@@ -1813,6 +1938,7 @@ pub trait ByteSlice: Sealed {
/// graphemes,
/// vec![(0, 5, "à̖"), (5, 6, "\u{FFFD}"), (6, 14, "🇺🇸")]
/// );
+ /// # }
/// ```
#[cfg(feature = "unicode")]
#[inline]
@@ -2277,7 +2403,7 @@ pub trait ByteSlice: Sealed {
/// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
/// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), s.to_lowercase().as_bytes());
/// ```
- #[cfg(all(feature = "std", feature = "unicode"))]
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_lowercase(&self) -> Vec<u8> {
let mut buf = vec![];
@@ -2339,7 +2465,7 @@ pub trait ByteSlice: Sealed {
/// s.to_lowercase_into(&mut buf);
/// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), buf.as_bytes());
/// ```
- #[cfg(all(feature = "std", feature = "unicode"))]
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_lowercase_into(&self, buf: &mut Vec<u8>) {
// TODO: This is the best we can do given what std exposes I think.
@@ -2394,7 +2520,7 @@ pub trait ByteSlice: Sealed {
/// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
/// assert_eq!(s.to_ascii_lowercase(), B(b"foo\xFFbar\xE2\x98baz"));
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn to_ascii_lowercase(&self) -> Vec<u8> {
self.as_bytes().to_ascii_lowercase()
@@ -2424,11 +2550,13 @@ pub trait ByteSlice: Sealed {
/// Invalid UTF-8 remains as is:
///
/// ```
+ /// # #[cfg(feature = "alloc")] {
/// use bstr::{B, ByteSlice, ByteVec};
///
/// let mut s = <Vec<u8>>::from_slice(b"FOO\xFFBAR\xE2\x98BAZ");
/// s.make_ascii_lowercase();
/// assert_eq!(s, B(b"foo\xFFbar\xE2\x98baz"));
+ /// # }
/// ```
#[inline]
fn make_ascii_lowercase(&mut self) {
@@ -2480,7 +2608,7 @@ pub trait ByteSlice: Sealed {
/// let s = B(b"foo\xFFbar\xE2\x98baz");
/// assert_eq!(s.to_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
/// ```
- #[cfg(all(feature = "std", feature = "unicode"))]
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_uppercase(&self) -> Vec<u8> {
let mut buf = vec![];
@@ -2542,7 +2670,7 @@ pub trait ByteSlice: Sealed {
/// s.to_uppercase_into(&mut buf);
/// assert_eq!(buf, B(b"FOO\xFFBAR\xE2\x98BAZ"));
/// ```
- #[cfg(all(feature = "std", feature = "unicode"))]
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_uppercase_into(&self, buf: &mut Vec<u8>) {
// TODO: This is the best we can do given what std exposes I think.
@@ -2594,7 +2722,7 @@ pub trait ByteSlice: Sealed {
/// let s = B(b"foo\xFFbar\xE2\x98baz");
/// assert_eq!(s.to_ascii_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn to_ascii_uppercase(&self) -> Vec<u8> {
self.as_bytes().to_ascii_uppercase()
@@ -2624,11 +2752,13 @@ pub trait ByteSlice: Sealed {
/// Invalid UTF-8 remains as is:
///
/// ```
+ /// # #[cfg(feature = "alloc")] {
/// use bstr::{B, ByteSlice, ByteVec};
///
/// let mut s = <Vec<u8>>::from_slice(b"foo\xFFbar\xE2\x98baz");
/// s.make_ascii_uppercase();
/// assert_eq!(s, B(b"FOO\xFFBAR\xE2\x98BAZ"));
+ /// # }
/// ```
#[inline]
fn make_ascii_uppercase(&mut self) {
@@ -2900,72 +3030,6 @@ pub trait ByteSlice: Sealed {
Some(index)
}
}
-
- /// Copies elements from one part of the slice to another part of itself,
- /// where the parts may be overlapping.
- ///
- /// `src` is the range within this byte string to copy from, while `dest`
- /// is the starting index of the range within this byte string to copy to.
- /// The length indicated by `src` must be less than or equal to the number
- /// of bytes from `dest` to the end of the byte string.
- ///
- /// # Panics
- ///
- /// Panics if either range is out of bounds, or if `src` is too big to fit
- /// into `dest`, or if the end of `src` is before the start.
- ///
- /// # Examples
- ///
- /// Copying four bytes within a byte string:
- ///
- /// ```
- /// use bstr::{B, ByteSlice};
- ///
- /// let mut buf = *b"Hello, World!";
- /// let s = &mut buf;
- /// s.copy_within_str(1..5, 8);
- /// assert_eq!(s, B("Hello, Wello!"));
- /// ```
- #[inline]
- fn copy_within_str<R>(&mut self, src: R, dest: usize)
- where
- R: ops::RangeBounds<usize>,
- {
- // TODO: Deprecate this once slice::copy_within stabilizes.
- let src_start = match src.start_bound() {
- ops::Bound::Included(&n) => n,
- ops::Bound::Excluded(&n) => {
- n.checked_add(1).expect("attempted to index slice beyond max")
- }
- ops::Bound::Unbounded => 0,
- };
- let src_end = match src.end_bound() {
- ops::Bound::Included(&n) => {
- n.checked_add(1).expect("attempted to index slice beyond max")
- }
- ops::Bound::Excluded(&n) => n,
- ops::Bound::Unbounded => self.as_bytes().len(),
- };
- assert!(src_start <= src_end, "src end is before src start");
- assert!(src_end <= self.as_bytes().len(), "src is out of bounds");
- let count = src_end - src_start;
- assert!(
- dest <= self.as_bytes().len() - count,
- "dest is out of bounds",
- );
-
- // SAFETY: This is safe because we use ptr::copy to handle overlapping
- // copies, and is also safe because we've checked all the bounds above.
- // Finally, we are only dealing with u8 data, which is Copy, which
- // means we can copy without worrying about ownership/destructors.
- unsafe {
- ptr::copy(
- self.as_bytes().get_unchecked(src_start),
- self.as_bytes_mut().get_unchecked_mut(dest),
- count,
- );
- }
- }
}
/// A single substring searcher fixed to a particular needle.
@@ -3138,22 +3202,22 @@ impl<'a> FinderReverse<'a> {
///
/// Matches are reported by the byte offset at which they begin.
///
-/// `'a` is the shorter of two lifetimes: the byte string being searched or the
-/// byte string being looked for.
+/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
+/// needle.
#[derive(Debug)]
-pub struct Find<'a> {
- it: memmem::FindIter<'a, 'a>,
- haystack: &'a [u8],
- needle: &'a [u8],
+pub struct Find<'h, 'n> {
+ it: memmem::FindIter<'h, 'n>,
+ haystack: &'h [u8],
+ needle: &'n [u8],
}
-impl<'a> Find<'a> {
- fn new(haystack: &'a [u8], needle: &'a [u8]) -> Find<'a> {
+impl<'h, 'n> Find<'h, 'n> {
+ fn new(haystack: &'h [u8], needle: &'n [u8]) -> Find<'h, 'n> {
Find { it: memmem::find_iter(haystack, needle), haystack, needle }
}
}
-impl<'a> Iterator for Find<'a> {
+impl<'h, 'n> Iterator for Find<'h, 'n> {
type Item = usize;
#[inline]
@@ -3166,17 +3230,17 @@ impl<'a> Iterator for Find<'a> {
///
/// Matches are reported by the byte offset at which they begin.
///
-/// `'a` is the shorter of two lifetimes: the byte string being searched or the
-/// byte string being looked for.
+/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
+/// needle.
#[derive(Debug)]
-pub struct FindReverse<'a> {
- it: memmem::FindRevIter<'a, 'a>,
- haystack: &'a [u8],
- needle: &'a [u8],
+pub struct FindReverse<'h, 'n> {
+ it: memmem::FindRevIter<'h, 'n>,
+ haystack: &'h [u8],
+ needle: &'n [u8],
}
-impl<'a> FindReverse<'a> {
- fn new(haystack: &'a [u8], needle: &'a [u8]) -> FindReverse<'a> {
+impl<'h, 'n> FindReverse<'h, 'n> {
+ fn new(haystack: &'h [u8], needle: &'n [u8]) -> FindReverse<'h, 'n> {
FindReverse {
it: memmem::rfind_iter(haystack, needle),
haystack,
@@ -3184,16 +3248,16 @@ impl<'a> FindReverse<'a> {
}
}
- fn haystack(&self) -> &'a [u8] {
+ fn haystack(&self) -> &'h [u8] {
self.haystack
}
- fn needle(&self) -> &[u8] {
+ fn needle(&self) -> &'n [u8] {
self.needle
}
}
-impl<'a> Iterator for FindReverse<'a> {
+impl<'h, 'n> Iterator for FindReverse<'h, 'n> {
type Item = usize;
#[inline]
@@ -3215,7 +3279,7 @@ impl<'a> Bytes<'a> {
/// This has the same lifetime as the original slice,
/// and so the iterator can continue to be used while this exists.
#[inline]
- pub fn as_slice(&self) -> &'a [u8] {
+ pub fn as_bytes(&self) -> &'a [u8] {
self.it.as_slice()
}
}
@@ -3252,21 +3316,27 @@ impl<'a> iter::FusedIterator for Bytes<'a> {}
/// An iterator over the fields in a byte string, separated by whitespace.
///
+/// Whitespace for this iterator is defined by the Unicode property
+/// `White_Space`.
+///
/// This iterator splits on contiguous runs of whitespace, such that the fields
/// in `foo\t\t\n \nbar` are `foo` and `bar`.
///
/// `'a` is the lifetime of the byte string being split.
+#[cfg(feature = "unicode")]
#[derive(Debug)]
pub struct Fields<'a> {
it: FieldsWith<'a, fn(char) -> bool>,
}
+#[cfg(feature = "unicode")]
impl<'a> Fields<'a> {
fn new(bytes: &'a [u8]) -> Fields<'a> {
Fields { it: bytes.fields_with(|ch| ch.is_whitespace()) }
}
}
+#[cfg(feature = "unicode")]
impl<'a> Iterator for Fields<'a> {
type Item = &'a [u8];
@@ -3328,10 +3398,11 @@ impl<'a, F: FnMut(char) -> bool> Iterator for FieldsWith<'a, F> {
/// An iterator over substrings in a byte string, split by a separator.
///
-/// `'a` is the lifetime of the byte string being split.
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
#[derive(Debug)]
-pub struct Split<'a> {
- finder: Find<'a>,
+pub struct Split<'h, 's> {
+ finder: Find<'h, 's>,
/// The end position of the previous match of our splitter. The element
/// we yield corresponds to the substring starting at `last` up to the
/// beginning of the next match of the splitter.
@@ -3342,18 +3413,18 @@ pub struct Split<'a> {
done: bool,
}
-impl<'a> Split<'a> {
- fn new(haystack: &'a [u8], splitter: &'a [u8]) -> Split<'a> {
+impl<'h, 's> Split<'h, 's> {
+ fn new(haystack: &'h [u8], splitter: &'s [u8]) -> Split<'h, 's> {
let finder = haystack.find_iter(splitter);
Split { finder, last: 0, done: false }
}
}
-impl<'a> Iterator for Split<'a> {
- type Item = &'a [u8];
+impl<'h, 's> Iterator for Split<'h, 's> {
+ type Item = &'h [u8];
#[inline]
- fn next(&mut self) -> Option<&'a [u8]> {
+ fn next(&mut self) -> Option<&'h [u8]> {
let haystack = self.finder.haystack;
match self.finder.next() {
Some(start) => {
@@ -3383,11 +3454,11 @@ impl<'a> Iterator for Split<'a> {
/// An iterator over substrings in a byte string, split by a separator, in
/// reverse.
///
-/// `'a` is the lifetime of the byte string being split, while `F` is the type
-/// of the predicate, i.e., `FnMut(char) -> bool`.
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
#[derive(Debug)]
-pub struct SplitReverse<'a> {
- finder: FindReverse<'a>,
+pub struct SplitReverse<'h, 's> {
+ finder: FindReverse<'h, 's>,
/// The end position of the previous match of our splitter. The element
/// we yield corresponds to the substring starting at `last` up to the
/// beginning of the next match of the splitter.
@@ -3398,18 +3469,18 @@ pub struct SplitReverse<'a> {
done: bool,
}
-impl<'a> SplitReverse<'a> {
- fn new(haystack: &'a [u8], splitter: &'a [u8]) -> SplitReverse<'a> {
+impl<'h, 's> SplitReverse<'h, 's> {
+ fn new(haystack: &'h [u8], splitter: &'s [u8]) -> SplitReverse<'h, 's> {
let finder = haystack.rfind_iter(splitter);
SplitReverse { finder, last: haystack.len(), done: false }
}
}
-impl<'a> Iterator for SplitReverse<'a> {
- type Item = &'a [u8];
+impl<'h, 's> Iterator for SplitReverse<'h, 's> {
+ type Item = &'h [u8];
#[inline]
- fn next(&mut self) -> Option<&'a [u8]> {
+ fn next(&mut self) -> Option<&'h [u8]> {
let haystack = self.finder.haystack();
match self.finder.next() {
Some(start) => {
@@ -3440,31 +3511,31 @@ impl<'a> Iterator for SplitReverse<'a> {
/// An iterator over at most `n` substrings in a byte string, split by a
/// separator.
///
-/// `'a` is the lifetime of the byte string being split, while `F` is the type
-/// of the predicate, i.e., `FnMut(char) -> bool`.
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
#[derive(Debug)]
-pub struct SplitN<'a> {
- split: Split<'a>,
+pub struct SplitN<'h, 's> {
+ split: Split<'h, 's>,
limit: usize,
count: usize,
}
-impl<'a> SplitN<'a> {
+impl<'h, 's> SplitN<'h, 's> {
fn new(
- haystack: &'a [u8],
- splitter: &'a [u8],
+ haystack: &'h [u8],
+ splitter: &'s [u8],
limit: usize,
- ) -> SplitN<'a> {
+ ) -> SplitN<'h, 's> {
let split = haystack.split_str(splitter);
SplitN { split, limit, count: 0 }
}
}
-impl<'a> Iterator for SplitN<'a> {
- type Item = &'a [u8];
+impl<'h, 's> Iterator for SplitN<'h, 's> {
+ type Item = &'h [u8];
#[inline]
- fn next(&mut self) -> Option<&'a [u8]> {
+ fn next(&mut self) -> Option<&'h [u8]> {
self.count += 1;
if self.count > self.limit || self.split.done {
None
@@ -3479,31 +3550,31 @@ impl<'a> Iterator for SplitN<'a> {
/// An iterator over at most `n` substrings in a byte string, split by a
/// separator, in reverse.
///
-/// `'a` is the lifetime of the byte string being split, while `F` is the type
-/// of the predicate, i.e., `FnMut(char) -> bool`.
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
#[derive(Debug)]
-pub struct SplitNReverse<'a> {
- split: SplitReverse<'a>,
+pub struct SplitNReverse<'h, 's> {
+ split: SplitReverse<'h, 's>,
limit: usize,
count: usize,
}
-impl<'a> SplitNReverse<'a> {
+impl<'h, 's> SplitNReverse<'h, 's> {
fn new(
- haystack: &'a [u8],
- splitter: &'a [u8],
+ haystack: &'h [u8],
+ splitter: &'s [u8],
limit: usize,
- ) -> SplitNReverse<'a> {
+ ) -> SplitNReverse<'h, 's> {
let split = haystack.rsplit_str(splitter);
SplitNReverse { split, limit, count: 0 }
}
}
-impl<'a> Iterator for SplitNReverse<'a> {
- type Item = &'a [u8];
+impl<'h, 's> Iterator for SplitNReverse<'h, 's> {
+ type Item = &'h [u8];
#[inline]
- fn next(&mut self) -> Option<&'a [u8]> {
+ fn next(&mut self) -> Option<&'h [u8]> {
self.count += 1;
if self.count > self.limit || self.split.done {
None
@@ -3521,6 +3592,7 @@ impl<'a> Iterator for SplitNReverse<'a> {
/// `\n`.
///
/// `'a` is the lifetime of the byte string being iterated over.
+#[derive(Clone, Debug)]
pub struct Lines<'a> {
it: LinesWithTerminator<'a>,
}
@@ -3529,6 +3601,28 @@ impl<'a> Lines<'a> {
fn new(bytes: &'a [u8]) -> Lines<'a> {
Lines { it: LinesWithTerminator::new(bytes) }
}
+
+ /// Return a copy of the rest of the underlying bytes without affecting the
+ /// iterator itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = b"\
+ /// foo
+ /// bar\r
+ /// baz";
+ /// let mut lines = s.lines();
+ /// assert_eq!(lines.next(), Some(B("foo")));
+ /// assert_eq!(lines.as_bytes(), B("bar\r\nbaz"));
+ /// ```
+ pub fn as_bytes(&self) -> &'a [u8] {
+ self.it.bytes
+ }
}
impl<'a> Iterator for Lines<'a> {
@@ -3536,17 +3630,19 @@ impl<'a> Iterator for Lines<'a> {
#[inline]
fn next(&mut self) -> Option<&'a [u8]> {
- let mut line = self.it.next()?;
- if line.last_byte() == Some(b'\n') {
- line = &line[..line.len() - 1];
- if line.last_byte() == Some(b'\r') {
- line = &line[..line.len() - 1];
- }
- }
- Some(line)
+ Some(trim_last_terminator(self.it.next()?))
}
}
+impl<'a> DoubleEndedIterator for Lines<'a> {
+ #[inline]
+ fn next_back(&mut self) -> Option<Self::Item> {
+ Some(trim_last_terminator(self.it.next_back()?))
+ }
+}
+
+impl<'a> iter::FusedIterator for Lines<'a> {}
+
/// An iterator over all lines in a byte string, including their terminators.
///
/// For this iterator, the only line terminator recognized is `\n`. (Since
@@ -3560,6 +3656,7 @@ impl<'a> Iterator for Lines<'a> {
/// the original byte string.
///
/// `'a` is the lifetime of the byte string being iterated over.
+#[derive(Clone, Debug)]
pub struct LinesWithTerminator<'a> {
bytes: &'a [u8],
}
@@ -3568,6 +3665,28 @@ impl<'a> LinesWithTerminator<'a> {
fn new(bytes: &'a [u8]) -> LinesWithTerminator<'a> {
LinesWithTerminator { bytes }
}
+
+ /// Return a copy of the rest of the underlying bytes without affecting the
+ /// iterator itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = b"\
+ /// foo
+ /// bar\r
+ /// baz";
+ /// let mut lines = s.lines_with_terminator();
+ /// assert_eq!(lines.next(), Some(B("foo\n")));
+ /// assert_eq!(lines.as_bytes(), B("bar\r\nbaz"));
+ /// ```
+ pub fn as_bytes(&self) -> &'a [u8] {
+ self.bytes
+ }
}
impl<'a> Iterator for LinesWithTerminator<'a> {
@@ -3591,10 +3710,43 @@ impl<'a> Iterator for LinesWithTerminator<'a> {
}
}
-#[cfg(test)]
+impl<'a> DoubleEndedIterator for LinesWithTerminator<'a> {
+ #[inline]
+ fn next_back(&mut self) -> Option<Self::Item> {
+ let end = self.bytes.len().checked_sub(1)?;
+ match self.bytes[..end].rfind_byte(b'\n') {
+ None => {
+ let line = self.bytes;
+ self.bytes = b"";
+ Some(line)
+ }
+ Some(end) => {
+ let line = &self.bytes[end + 1..];
+ self.bytes = &self.bytes[..end + 1];
+ Some(line)
+ }
+ }
+ }
+}
+
+impl<'a> iter::FusedIterator for LinesWithTerminator<'a> {}
+
+fn trim_last_terminator(mut s: &[u8]) -> &[u8] {
+ if s.last_byte() == Some(b'\n') {
+ s = &s[..s.len() - 1];
+ if s.last_byte() == Some(b'\r') {
+ s = &s[..s.len() - 1];
+ }
+ }
+ s
+}
+
+#[cfg(all(test, feature = "std"))]
mod tests {
- use crate::ext_slice::{ByteSlice, B};
- use crate::tests::LOSSY_TESTS;
+ use crate::{
+ ext_slice::{ByteSlice, Lines, LinesWithTerminator, B},
+ tests::LOSSY_TESTS,
+ };
#[test]
fn to_str_lossy() {
@@ -3622,34 +3774,55 @@ mod tests {
}
#[test]
- #[should_panic]
- fn copy_within_fail1() {
- let mut buf = *b"foobar";
- let s = &mut buf;
- s.copy_within_str(0..2, 5);
- }
+ fn lines_iteration() {
+ macro_rules! t {
+ ($it:expr, $forward:expr) => {
+ let mut res: Vec<&[u8]> = Vec::from($forward);
+ assert_eq!($it.collect::<Vec<_>>(), res);
+ res.reverse();
+ assert_eq!($it.rev().collect::<Vec<_>>(), res);
+ };
+ }
- #[test]
- #[should_panic]
- fn copy_within_fail2() {
- let mut buf = *b"foobar";
- let s = &mut buf;
- s.copy_within_str(3..2, 0);
- }
+ t!(Lines::new(b""), []);
+ t!(LinesWithTerminator::new(b""), []);
- #[test]
- #[should_panic]
- fn copy_within_fail3() {
- let mut buf = *b"foobar";
- let s = &mut buf;
- s.copy_within_str(5..7, 0);
- }
+ t!(Lines::new(b"\n"), [B("")]);
+ t!(Lines::new(b"\r\n"), [B("")]);
+ t!(LinesWithTerminator::new(b"\n"), [B("\n")]);
- #[test]
- #[should_panic]
- fn copy_within_fail4() {
- let mut buf = *b"foobar";
- let s = &mut buf;
- s.copy_within_str(0..1, 6);
+ t!(Lines::new(b"a"), [B("a")]);
+ t!(LinesWithTerminator::new(b"a"), [B("a")]);
+
+ t!(Lines::new(b"abc"), [B("abc")]);
+ t!(LinesWithTerminator::new(b"abc"), [B("abc")]);
+
+ t!(Lines::new(b"abc\n"), [B("abc")]);
+ t!(Lines::new(b"abc\r\n"), [B("abc")]);
+ t!(LinesWithTerminator::new(b"abc\n"), [B("abc\n")]);
+
+ t!(Lines::new(b"abc\n\n"), [B("abc"), B("")]);
+ t!(LinesWithTerminator::new(b"abc\n\n"), [B("abc\n"), B("\n")]);
+
+ t!(Lines::new(b"abc\n\ndef"), [B("abc"), B(""), B("def")]);
+ t!(
+ LinesWithTerminator::new(b"abc\n\ndef"),
+ [B("abc\n"), B("\n"), B("def")]
+ );
+
+ t!(Lines::new(b"abc\n\ndef\n"), [B("abc"), B(""), B("def")]);
+ t!(
+ LinesWithTerminator::new(b"abc\n\ndef\n"),
+ [B("abc\n"), B("\n"), B("def\n")]
+ );
+
+ t!(Lines::new(b"\na\nb\n"), [B(""), B("a"), B("b")]);
+ t!(
+ LinesWithTerminator::new(b"\na\nb\n"),
+ [B("\n"), B("a\n"), B("b\n")]
+ );
+
+ t!(Lines::new(b"\n\n\n"), [B(""), B(""), B("")]);
+ t!(LinesWithTerminator::new(b"\n\n\n"), [B("\n"), B("\n"), B("\n")]);
}
}
diff --git a/src/ext_vec.rs b/src/ext_vec.rs
index 5beb0e1..b8e2be2 100644
--- a/src/ext_vec.rs
+++ b/src/ext_vec.rs
@@ -1,16 +1,21 @@
-use std::borrow::Cow;
-use std::error;
-use std::ffi::{OsStr, OsString};
-use std::fmt;
-use std::iter;
-use std::ops;
-use std::path::{Path, PathBuf};
-use std::ptr;
-use std::str;
-use std::vec;
-
-use crate::ext_slice::ByteSlice;
-use crate::utf8::{self, Utf8Error};
+use core::fmt;
+use core::iter;
+use core::ops;
+use core::ptr;
+
+use alloc::{borrow::Cow, string::String, vec, vec::Vec};
+
+#[cfg(feature = "std")]
+use std::{
+ error,
+ ffi::{OsStr, OsString},
+ path::{Path, PathBuf},
+};
+
+use crate::{
+ ext_slice::ByteSlice,
+ utf8::{self, Utf8Error},
+};
/// Concatenate the elements given by the iterator together into a single
/// `Vec<u8>`.
@@ -99,8 +104,10 @@ impl ByteVec for Vec<u8> {
/// Ensure that callers cannot implement `ByteSlice` by making an
/// umplementable trait its super trait.
-pub trait Sealed {}
-impl Sealed for Vec<u8> {}
+mod private {
+ pub trait Sealed {}
+}
+impl private::Sealed for Vec<u8> {}
/// A trait that extends `Vec<u8>` with string oriented methods.
///
@@ -114,7 +121,9 @@ impl Sealed for Vec<u8> {}
/// let s = Vec::from_slice(b"abc"); // NOT ByteVec::from_slice("...")
/// assert_eq!(s, B("abc"));
/// ```
-pub trait ByteVec: Sealed {
+///
+/// This trait is sealed and cannot be implemented outside of `bstr`.
+pub trait ByteVec: private::Sealed {
/// A method for accessing the raw vector bytes of this type. This is
/// always a no-op and callers shouldn't care about it. This only exists
/// for making the extension trait work.
@@ -154,8 +163,9 @@ pub trait ByteVec: Sealed {
/// Create a new byte string from an owned OS string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns the original OS string if it is not valid UTF-8.
+ /// When the underlying bytes of OS strings are accessible, then this
+ /// always succeeds and is zero cost. Otherwise, this returns the given
+ /// `OsString` if it is not valid UTF-8.
///
/// # Examples
///
@@ -171,6 +181,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(bs, B("foo"));
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn from_os_string(os_str: OsString) -> Result<Vec<u8>, OsString> {
#[cfg(unix)]
#[inline]
@@ -191,10 +202,11 @@ pub trait ByteVec: Sealed {
/// Lossily create a new byte string from an OS string slice.
///
- /// On Unix, this always succeeds, is zero cost and always returns a slice.
- /// On non-Unix systems, this does a UTF-8 check. If the given OS string
- /// slice is not valid UTF-8, then it is lossily decoded into valid UTF-8
- /// (with invalid bytes replaced by the Unicode replacement codepoint).
+ /// When the underlying bytes of OS strings are accessible, then this is
+ /// zero cost and always returns a slice. Otherwise, a UTF-8 check is
+ /// performed and if the given OS string is not valid UTF-8, then it is
+ /// lossily decoded into valid UTF-8 (with invalid bytes replaced by the
+ /// Unicode replacement codepoint).
///
/// # Examples
///
@@ -210,6 +222,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(bs, B("foo"));
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn from_os_str_lossy<'a>(os_str: &'a OsStr) -> Cow<'a, [u8]> {
#[cfg(unix)]
#[inline]
@@ -233,8 +246,9 @@ pub trait ByteVec: Sealed {
/// Create a new byte string from an owned file path.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns the original path if it is not valid UTF-8.
+ /// When the underlying bytes of paths are accessible, then this always
+ /// succeeds and is zero cost. Otherwise, this returns the given `PathBuf`
+ /// if it is not valid UTF-8.
///
/// # Examples
///
@@ -250,16 +264,18 @@ pub trait ByteVec: Sealed {
/// assert_eq!(bs, B("foo"));
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn from_path_buf(path: PathBuf) -> Result<Vec<u8>, PathBuf> {
Vec::from_os_string(path.into_os_string()).map_err(PathBuf::from)
}
/// Lossily create a new byte string from a file path.
///
- /// On Unix, this always succeeds, is zero cost and always returns a slice.
- /// On non-Unix systems, this does a UTF-8 check. If the given path is not
- /// valid UTF-8, then it is lossily decoded into valid UTF-8 (with invalid
- /// bytes replaced by the Unicode replacement codepoint).
+ /// When the underlying bytes of paths are accessible, then this is
+ /// zero cost and always returns a slice. Otherwise, a UTF-8 check is
+ /// performed and if the given path is not valid UTF-8, then it is lossily
+ /// decoded into valid UTF-8 (with invalid bytes replaced by the Unicode
+ /// replacement codepoint).
///
/// # Examples
///
@@ -275,6 +291,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(bs, B("foo"));
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn from_path_lossy<'a>(path: &'a Path) -> Cow<'a, [u8]> {
Vec::from_os_str_lossy(path.as_os_str())
}
@@ -363,12 +380,10 @@ pub trait ByteVec: Sealed {
/// ```
/// use bstr::ByteVec;
///
- /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let bytes = Vec::from("hello");
- /// let string = bytes.into_string()?;
+ /// let string = bytes.into_string().unwrap();
///
/// assert_eq!("hello", string);
- /// # Ok(()) }; example().unwrap()
/// ```
///
/// If this byte string is not valid UTF-8, then an error will be returned.
@@ -469,8 +484,9 @@ pub trait ByteVec: Sealed {
/// Converts this byte string into an OS string, in place.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns the original byte string if it is not valid UTF-8.
+ /// When OS strings can be constructed from arbitrary byte sequences, this
+ /// always succeeds and is zero cost. Otherwise, if this byte string is not
+ /// valid UTF-8, then an error (with the original byte string) is returned.
///
/// # Examples
///
@@ -485,14 +501,15 @@ pub trait ByteVec: Sealed {
/// let os_str = bs.into_os_string().expect("should be valid UTF-8");
/// assert_eq!(os_str, OsStr::new("foo"));
/// ```
+ #[cfg(feature = "std")]
#[inline]
- fn into_os_string(self) -> Result<OsString, Vec<u8>>
+ fn into_os_string(self) -> Result<OsString, FromUtf8Error>
where
Self: Sized,
{
#[cfg(unix)]
#[inline]
- fn imp(v: Vec<u8>) -> Result<OsString, Vec<u8>> {
+ fn imp(v: Vec<u8>) -> Result<OsString, FromUtf8Error> {
use std::os::unix::ffi::OsStringExt;
Ok(OsString::from_vec(v))
@@ -500,11 +517,8 @@ pub trait ByteVec: Sealed {
#[cfg(not(unix))]
#[inline]
- fn imp(v: Vec<u8>) -> Result<OsString, Vec<u8>> {
- match v.into_string() {
- Ok(s) => Ok(OsString::from(s)),
- Err(err) => Err(err.into_vec()),
- }
+ fn imp(v: Vec<u8>) -> Result<OsString, FromUtf8Error> {
+ v.into_string().map(OsString::from)
}
imp(self.into_vec())
@@ -512,13 +526,13 @@ pub trait ByteVec: Sealed {
/// Lossily converts this byte string into an OS string, in place.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this will perform a UTF-8 check and lossily convert this byte string
- /// into valid UTF-8 using the Unicode replacement codepoint.
+ /// When OS strings can be constructed from arbitrary byte sequences, this
+ /// is zero cost and always returns a slice. Otherwise, this will perform a
+ /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
+ /// the Unicode replacement codepoint.
///
- /// Note that this can prevent the correct roundtripping of file paths on
- /// non-Unix systems such as Windows, where file paths are an arbitrary
- /// sequence of 16-bit integers.
+ /// Note that this can prevent the correct roundtripping of file paths when
+ /// the representation of `OsString` is opaque.
///
/// # Examples
///
@@ -532,6 +546,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(os_str.to_string_lossy(), "foo\u{FFFD}bar");
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn into_os_string_lossy(self) -> OsString
where
Self: Sized,
@@ -555,8 +570,9 @@ pub trait ByteVec: Sealed {
/// Converts this byte string into an owned file path, in place.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns the original byte string if it is not valid UTF-8.
+ /// When paths can be constructed from arbitrary byte sequences, this
+ /// always succeeds and is zero cost. Otherwise, if this byte string is not
+ /// valid UTF-8, then an error (with the original byte string) is returned.
///
/// # Examples
///
@@ -569,8 +585,9 @@ pub trait ByteVec: Sealed {
/// let path = bs.into_path_buf().expect("should be valid UTF-8");
/// assert_eq!(path.as_os_str(), "foo");
/// ```
+ #[cfg(feature = "std")]
#[inline]
- fn into_path_buf(self) -> Result<PathBuf, Vec<u8>>
+ fn into_path_buf(self) -> Result<PathBuf, FromUtf8Error>
where
Self: Sized,
{
@@ -579,13 +596,13 @@ pub trait ByteVec: Sealed {
/// Lossily converts this byte string into an owned file path, in place.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this will perform a UTF-8 check and lossily convert this byte string
- /// into valid UTF-8 using the Unicode replacement codepoint.
+ /// When paths can be constructed from arbitrary byte sequences, this is
+ /// zero cost and always returns a slice. Otherwise, this will perform a
+ /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
+ /// the Unicode replacement codepoint.
///
- /// Note that this can prevent the correct roundtripping of file paths on
- /// non-Unix systems such as Windows, where file paths are an arbitrary
- /// sequence of 16-bit integers.
+ /// Note that this can prevent the correct roundtripping of file paths when
+ /// the representation of `PathBuf` is opaque.
///
/// # Examples
///
@@ -599,6 +616,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(path.to_string_lossy(), "foo\u{FFFD}bar");
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn into_path_buf_lossy(self) -> PathBuf
where
Self: Sized,
@@ -1029,6 +1047,7 @@ impl FromUtf8Error {
}
}
+#[cfg(feature = "std")]
impl error::Error for FromUtf8Error {
#[inline]
fn description(&self) -> &str {
@@ -1043,7 +1062,7 @@ impl fmt::Display for FromUtf8Error {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
use crate::ext_vec::ByteVec;
diff --git a/src/impls.rs b/src/impls.rs
index 85a27ba..c063cb6 100644
--- a/src/impls.rs
+++ b/src/impls.rs
@@ -18,7 +18,7 @@ macro_rules! impl_partial_eq {
};
}
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
macro_rules! impl_partial_eq_cow {
($lhs:ty, $rhs:ty) => {
impl<'a, 'b> PartialEq<$rhs> for $lhs {
@@ -59,17 +59,22 @@ macro_rules! impl_partial_ord {
};
}
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
mod bstring {
- use std::borrow::{Borrow, Cow, ToOwned};
- use std::cmp::Ordering;
- use std::fmt;
- use std::iter::FromIterator;
- use std::ops;
+ use core::{
+ cmp::Ordering, convert::TryFrom, fmt, iter::FromIterator, ops,
+ };
- use crate::bstr::BStr;
- use crate::bstring::BString;
- use crate::ext_vec::ByteVec;
+ use alloc::{
+ borrow::{Borrow, Cow, ToOwned},
+ string::String,
+ vec,
+ vec::Vec,
+ };
+
+ use crate::{
+ bstr::BStr, bstring::BString, ext_slice::ByteSlice, ext_vec::ByteVec,
+ };
impl fmt::Display for BString {
#[inline]
@@ -90,21 +95,21 @@ mod bstring {
#[inline]
fn deref(&self) -> &Vec<u8> {
- &self.bytes
+ self.as_vec()
}
}
impl ops::DerefMut for BString {
#[inline]
fn deref_mut(&mut self) -> &mut Vec<u8> {
- &mut self.bytes
+ self.as_vec_mut()
}
}
impl AsRef<[u8]> for BString {
#[inline]
fn as_ref(&self) -> &[u8] {
- &self.bytes
+ self.as_bytes()
}
}
@@ -118,7 +123,7 @@ mod bstring {
impl AsMut<[u8]> for BString {
#[inline]
fn as_mut(&mut self) -> &mut [u8] {
- &mut self.bytes
+ self.as_bytes_mut()
}
}
@@ -161,14 +166,14 @@ mod bstring {
impl From<Vec<u8>> for BString {
#[inline]
fn from(s: Vec<u8>) -> BString {
- BString { bytes: s }
+ BString::new(s)
}
}
impl From<BString> for Vec<u8> {
#[inline]
fn from(s: BString) -> Vec<u8> {
- s.bytes
+ s.into_vec()
}
}
@@ -200,6 +205,24 @@ mod bstring {
}
}
+ impl TryFrom<BString> for String {
+ type Error = crate::FromUtf8Error;
+
+ #[inline]
+ fn try_from(s: BString) -> Result<String, crate::FromUtf8Error> {
+ s.into_vec().into_string()
+ }
+ }
+
+ impl<'a> TryFrom<&'a BString> for &'a str {
+ type Error = crate::Utf8Error;
+
+ #[inline]
+ fn try_from(s: &'a BString) -> Result<&'a str, crate::Utf8Error> {
+ s.as_bytes().to_str()
+ }
+ }
+
impl FromIterator<char> for BString {
#[inline]
fn from_iter<T: IntoIterator<Item = char>>(iter: T) -> BString {
@@ -279,7 +302,7 @@ mod bstring {
impl PartialOrd for BString {
#[inline]
fn partial_cmp(&self, other: &BString) -> Option<Ordering> {
- PartialOrd::partial_cmp(&self.bytes, &other.bytes)
+ PartialOrd::partial_cmp(self.as_bytes(), other.as_bytes())
}
}
@@ -301,15 +324,12 @@ mod bstring {
}
mod bstr {
- #[cfg(feature = "std")]
- use std::borrow::Cow;
+ use core::{cmp::Ordering, convert::TryFrom, fmt, ops};
- use core::cmp::Ordering;
- use core::fmt;
- use core::ops;
+ #[cfg(feature = "alloc")]
+ use alloc::{borrow::Cow, boxed::Box, string::String, vec::Vec};
- use crate::bstr::BStr;
- use crate::ext_slice::ByteSlice;
+ use crate::{bstr::BStr, ext_slice::ByteSlice};
impl fmt::Display for BStr {
#[inline]
@@ -543,6 +563,13 @@ mod bstr {
}
}
+ impl AsRef<BStr> for BStr {
+ #[inline]
+ fn as_ref(&self) -> &BStr {
+ self
+ }
+ }
+
impl AsRef<BStr> for [u8] {
#[inline]
fn as_ref(&self) -> &BStr {
@@ -590,6 +617,13 @@ mod bstr {
}
}
+ impl<'a> From<&'a BStr> for &'a [u8] {
+ #[inline]
+ fn from(s: &'a BStr) -> &'a [u8] {
+ BStr::as_bytes(s)
+ }
+ }
+
impl<'a> From<&'a str> for &'a BStr {
#[inline]
fn from(s: &'a str) -> &'a BStr {
@@ -597,7 +631,7 @@ mod bstr {
}
}
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl<'a> From<&'a BStr> for Cow<'a, BStr> {
#[inline]
fn from(s: &'a BStr) -> Cow<'a, BStr> {
@@ -605,7 +639,7 @@ mod bstr {
}
}
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl From<Box<[u8]>> for Box<BStr> {
#[inline]
fn from(s: Box<[u8]>) -> Box<BStr> {
@@ -613,7 +647,7 @@ mod bstr {
}
}
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl From<Box<BStr>> for Box<[u8]> {
#[inline]
fn from(s: Box<BStr>) -> Box<[u8]> {
@@ -621,6 +655,33 @@ mod bstr {
}
}
+ impl<'a> TryFrom<&'a BStr> for &'a str {
+ type Error = crate::Utf8Error;
+
+ #[inline]
+ fn try_from(s: &'a BStr) -> Result<&'a str, crate::Utf8Error> {
+ s.as_bytes().to_str()
+ }
+ }
+
+ #[cfg(feature = "alloc")]
+ impl<'a> TryFrom<&'a BStr> for String {
+ type Error = crate::Utf8Error;
+
+ #[inline]
+ fn try_from(s: &'a BStr) -> Result<String, crate::Utf8Error> {
+ Ok(s.as_bytes().to_str()?.into())
+ }
+ }
+
+ #[cfg(feature = "alloc")]
+ impl Clone for Box<BStr> {
+ #[inline]
+ fn clone(&self) -> Self {
+ BStr::from_boxed_bytes(self.as_bytes().into())
+ }
+ }
+
impl Eq for BStr {}
impl PartialEq<BStr> for BStr {
@@ -635,19 +696,19 @@ mod bstr {
impl_partial_eq!(BStr, str);
impl_partial_eq!(BStr, &'a str);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq!(BStr, Vec<u8>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq!(&'a BStr, Vec<u8>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq!(BStr, String);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq!(&'a BStr, String);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq_cow!(&'a BStr, Cow<'a, BStr>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq_cow!(&'a BStr, Cow<'a, str>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq_cow!(&'a BStr, Cow<'a, [u8]>);
impl PartialOrd for BStr {
@@ -669,17 +730,17 @@ mod bstr {
impl_partial_ord!(BStr, str);
impl_partial_ord!(BStr, &'a str);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_ord!(BStr, Vec<u8>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_ord!(&'a BStr, Vec<u8>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_ord!(BStr, String);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_ord!(&'a BStr, String);
}
-#[cfg(feature = "serde1-nostd")]
+#[cfg(feature = "serde")]
mod bstr_serde {
use core::fmt;
@@ -737,17 +798,18 @@ mod bstr_serde {
}
}
-#[cfg(feature = "serde1")]
+#[cfg(all(feature = "serde", feature = "alloc"))]
mod bstring_serde {
- use std::cmp;
- use std::fmt;
+ use core::{cmp, fmt};
+
+ use alloc::{boxed::Box, string::String, vec::Vec};
use serde::{
de::Error, de::SeqAccess, de::Visitor, Deserialize, Deserializer,
Serialize, Serializer,
};
- use crate::bstring::BString;
+ use crate::{bstr::BStr, bstring::BString};
impl Serialize for BString {
#[inline]
@@ -823,10 +885,82 @@ mod bstring_serde {
deserializer.deserialize_byte_buf(BStringVisitor)
}
}
+
+ impl<'de> Deserialize<'de> for Box<BStr> {
+ #[inline]
+ fn deserialize<D>(deserializer: D) -> Result<Box<BStr>, D::Error>
+ where
+ D: Deserializer<'de>,
+ {
+ struct BoxedBStrVisitor;
+
+ impl<'de> Visitor<'de> for BoxedBStrVisitor {
+ type Value = Box<BStr>;
+
+ fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.write_str("a boxed byte string")
+ }
+
+ #[inline]
+ fn visit_seq<V: SeqAccess<'de>>(
+ self,
+ mut visitor: V,
+ ) -> Result<Box<BStr>, V::Error> {
+ let len = cmp::min(visitor.size_hint().unwrap_or(0), 256);
+ let mut bytes = Vec::with_capacity(len);
+ while let Some(v) = visitor.next_element()? {
+ bytes.push(v);
+ }
+ Ok(BStr::from_boxed_bytes(bytes.into_boxed_slice()))
+ }
+
+ #[inline]
+ fn visit_bytes<E: Error>(
+ self,
+ value: &[u8],
+ ) -> Result<Box<BStr>, E> {
+ Ok(BStr::from_boxed_bytes(
+ value.to_vec().into_boxed_slice(),
+ ))
+ }
+
+ #[inline]
+ fn visit_byte_buf<E: Error>(
+ self,
+ value: Vec<u8>,
+ ) -> Result<Box<BStr>, E> {
+ Ok(BStr::from_boxed_bytes(value.into_boxed_slice()))
+ }
+
+ #[inline]
+ fn visit_str<E: Error>(
+ self,
+ value: &str,
+ ) -> Result<Box<BStr>, E> {
+ Ok(BStr::from_boxed_bytes(
+ value.as_bytes().to_vec().into_boxed_slice(),
+ ))
+ }
+
+ #[inline]
+ fn visit_string<E: Error>(
+ self,
+ value: String,
+ ) -> Result<Box<BStr>, E> {
+ Ok(BStr::from_boxed_bytes(
+ value.into_bytes().into_boxed_slice(),
+ ))
+ }
+ }
+
+ deserializer.deserialize_byte_buf(BoxedBStrVisitor)
+ }
+ }
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod display {
+ #[cfg(not(miri))]
use crate::bstring::BString;
use crate::ByteSlice;
@@ -926,6 +1060,7 @@ mod display {
);
}
+ #[cfg(not(miri))]
quickcheck::quickcheck! {
fn total_length(bstr: BString) -> bool {
let size = bstr.chars().count();
@@ -934,7 +1069,7 @@ mod display {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "alloc"))]
mod bstring_arbitrary {
use crate::bstring::BString;
@@ -946,12 +1081,13 @@ mod bstring_arbitrary {
}
fn shrink(&self) -> Box<dyn Iterator<Item = BString>> {
- Box::new(self.bytes.shrink().map(BString::from))
+ Box::new(self.as_vec().shrink().map(BString::from))
}
}
}
#[test]
+#[cfg(feature = "std")]
fn test_debug() {
use crate::{ByteSlice, B};
@@ -973,10 +1109,12 @@ fn test_debug() {
// See: https://github.com/BurntSushi/bstr/issues/82
#[test]
+#[cfg(feature = "std")]
fn test_cows_regression() {
- use crate::ByteSlice;
use std::borrow::Cow;
+ use crate::ByteSlice;
+
let c1 = Cow::from(b"hello bstr".as_bstr());
let c2 = b"goodbye bstr".as_bstr();
assert_ne!(c1, c2);
diff --git a/src/io.rs b/src/io.rs
index ad6f3c1..a648145 100644
--- a/src/io.rs
+++ b/src/io.rs
@@ -7,12 +7,13 @@ facilities for conveniently and efficiently working with lines as byte strings.
More APIs may be added in the future.
*/
+use alloc::{vec, vec::Vec};
+
use std::io;
-use crate::ext_slice::ByteSlice;
-use crate::ext_vec::ByteVec;
+use crate::{ext_slice::ByteSlice, ext_vec::ByteVec};
-/// An extention trait for
+/// An extension trait for
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
/// which provides convenience APIs for dealing with byte strings.
pub trait BufReadExt: io::BufRead {
@@ -36,7 +37,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
///
/// let mut lines = vec![];
/// for result in cursor.byte_lines() {
@@ -79,7 +80,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
///
/// let mut records = vec![];
/// for result in cursor.byte_records(b'\x00') {
@@ -122,7 +123,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
///
/// let mut lines = vec![];
/// cursor.for_byte_line(|line| {
@@ -135,7 +136,7 @@ pub trait BufReadExt: io::BufRead {
/// assert_eq!(lines[2], "dolor".as_bytes());
/// # Ok(()) }; example().unwrap()
/// ```
- fn for_byte_line<F>(self, mut for_each_line: F) -> io::Result<()>
+ fn for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()>
where
Self: Sized,
F: FnMut(&[u8]) -> io::Result<bool>,
@@ -169,7 +170,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
///
/// let mut records = vec![];
/// cursor.for_byte_record(b'\x00', |record| {
@@ -183,7 +184,7 @@ pub trait BufReadExt: io::BufRead {
/// # Ok(()) }; example().unwrap()
/// ```
fn for_byte_record<F>(
- self,
+ &mut self,
terminator: u8,
mut for_each_record: F,
) -> io::Result<()>
@@ -223,7 +224,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
///
/// let mut lines = vec![];
/// cursor.for_byte_line_with_terminator(|line| {
@@ -237,7 +238,7 @@ pub trait BufReadExt: io::BufRead {
/// # Ok(()) }; example().unwrap()
/// ```
fn for_byte_line_with_terminator<F>(
- self,
+ &mut self,
for_each_line: F,
) -> io::Result<()>
where
@@ -269,11 +270,10 @@ pub trait BufReadExt: io::BufRead {
/// ```
/// use std::io;
///
- /// use bstr::B;
- /// use bstr::io::BufReadExt;
+ /// use bstr::{io::BufReadExt, B};
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
///
/// let mut records = vec![];
/// cursor.for_byte_record_with_terminator(b'\x00', |record| {
@@ -287,7 +287,7 @@ pub trait BufReadExt: io::BufRead {
/// # Ok(()) }; example().unwrap()
/// ```
fn for_byte_record_with_terminator<F>(
- mut self,
+ &mut self,
terminator: u8,
mut for_each_record: F,
) -> io::Result<()>
@@ -438,11 +438,12 @@ fn trim_record_slice(mut record: &[u8], terminator: u8) -> &[u8] {
record
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
- use super::BufReadExt;
use crate::bstring::BString;
+ use super::BufReadExt;
+
fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
let mut lines = vec![];
slice
diff --git a/src/lib.rs b/src/lib.rs
index 41142c9..3d334ac 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -34,7 +34,7 @@ additional string oriented methods. Operations such as iterating over
graphemes, searching for substrings, replacing substrings, trimming and case
conversion are examples of things not provided on the standard library `&[u8]`
APIs but are provided by this crate. For example, this code iterates over all
-of occurrences of a subtring:
+of occurrences of a substring:
```
use bstr::ByteSlice;
@@ -52,23 +52,27 @@ Here's another example showing how to do a search and replace (and also showing
use of the `B` function):
```
+# #[cfg(feature = "alloc")] {
use bstr::{B, ByteSlice};
let old = B("foo ☃☃☃ foo foo quux foo");
let new = old.replace("foo", "hello");
assert_eq!(new, B("hello ☃☃☃ hello hello quux hello"));
+# }
```
And here's an example that shows case conversion, even in the presence of
invalid UTF-8:
```
+# #[cfg(all(feature = "alloc", feature = "unicode"))] {
use bstr::{ByteSlice, ByteVec};
let mut lower = Vec::from("hello β");
lower[0] = b'\xFF';
// lowercase β is uppercased to Β
assert_eq!(lower.to_uppercase(), b"\xFFELLO \xCE\x92");
+# }
```
# Convenient debug representation
@@ -98,10 +102,8 @@ method converts any `&[u8]` to a `&BStr`.
# When should I use byte strings?
-This library reflects my hypothesis that UTF-8 by convention is a better trade
-off in some circumstances than guaranteed UTF-8. It's possible, perhaps even
-likely, that this is a niche concern for folks working closely with core text
-primitives.
+This library reflects my belief that UTF-8 by convention is a better trade
+off in some circumstances than guaranteed UTF-8.
The first time this idea hit me was in the implementation of Rust's regex
engine. In particular, very little of the internal implementation cares at all
@@ -134,24 +136,26 @@ incremental way by only parsing chunks at a time, but this is often complex to
do or impractical. For example, many regex engines only accept one contiguous
sequence of bytes at a time with no way to perform incremental matching.
-In summary, conventional UTF-8 byte strings provided by this library are
-definitely useful in some limited circumstances, but how useful they are more
-broadly isn't clear yet.
-
# `bstr` in public APIs
-Since this library is not yet `1.0`, you should not use it in the public API of
-your crates until it hits `1.0` (unless you're OK with with tracking breaking
-releases of `bstr`). It is expected that `bstr 1.0` will be released before
-2022.
+This library is past version `1` and is expected to remain at version `1` for
+the foreseeable future. Therefore, it is encouraged to put types from `bstr`
+(like `BStr` and `BString`) in your public API if that makes sense for your
+crate.
+
+With that said, in general, it should be possible to avoid putting anything
+in this crate into your public APIs. Namely, you should never need to use the
+`ByteSlice` or `ByteVec` traits as bounds on public APIs, since their only
+purpose is to extend the methods on the concrete types `[u8]` and `Vec<u8>`,
+respectively. Similarly, it should not be necessary to put either the `BStr` or
+`BString` types into public APIs. If you want to use them internally, then they
+can be converted to/from `[u8]`/`Vec<u8>` as needed. The conversions are free.
+
+So while it shouldn't ever be 100% necessary to make `bstr` a public
+dependency, there may be cases where it is convenient to do so. This is an
+explicitly supported use case of `bstr`, and as such, major version releases
+should be exceptionally rare.
-In general, it should be possible to avoid putting anything in this crate into
-your public APIs. Namely, you should never need to use the `ByteSlice` or
-`ByteVec` traits as bounds on public APIs, since their only purpose is to
-extend the methods on the concrete types `[u8]` and `Vec<u8>`, respectively.
-Similarly, it should not be necessary to put either the `BStr` or `BString`
-types into public APIs. If you want to use them internally, then they can
-be converted to/from `[u8]`/`Vec<u8>` as needed.
# Differences with standard strings
@@ -318,7 +322,8 @@ they can do:
by accessing their underlying 16-bit integer representation. Unfortunately,
this isn't zero cost (it introduces a second WTF-8 decoding step) and it's
not clear this is a good thing to do, since WTF-8 should ideally remain an
- internal implementation detail.
+ internal implementation detail. This is roughly the approach taken by the
+ [`os_str_bytes`](https://crates.io/crates/os_str_bytes) crate.
2. One could instead declare that they will not handle paths on Windows that
are not valid UTF-16, and return an error when one is encountered.
3. Like (2), but instead of returning an error, lossily decode the file path
@@ -365,19 +370,57 @@ UTF-8, and thus contain latent bugs on Unix where paths with invalid UTF-8 are
not terribly uncommon. If you instead use byte strings, then you're guaranteed
to write correct code for Unix, at the cost of getting a corner case wrong on
Windows.
+
+# Cargo features
+
+This crates comes with a few features that control standard library, serde
+and Unicode support.
+
+* `std` - **Enabled** by default. This provides APIs that require the standard
+ library, such as `Vec<u8>` and `PathBuf`. Enabling this feature also enables
+ the `alloc` feature and any other relevant `std` features for dependencies.
+* `alloc` - **Enabled** by default. This provides APIs that require allocations
+ via the `alloc` crate, such as `Vec<u8>`.
+* `unicode` - **Enabled** by default. This provides APIs that require sizable
+ Unicode data compiled into the binary. This includes, but is not limited to,
+ grapheme/word/sentence segmenters. When this is disabled, basic support such
+ as UTF-8 decoding is still included. Note that currently, enabling this
+ feature also requires enabling the `std` feature. It is expected that this
+ limitation will be lifted at some point.
+* `serde` - Enables implementations of serde traits for `BStr`, and also
+ `BString` when `alloc` is enabled.
*/
-#![cfg_attr(not(feature = "std"), no_std)]
+#![cfg_attr(not(any(feature = "std", test)), no_std)]
+#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+
+// Why do we do this? Well, in order for us to use once_cell's 'Lazy' type to
+// load DFAs, it requires enabling its 'std' feature. Yet, there is really
+// nothing about our 'unicode' feature that requires 'std'. We could declare
+// that 'unicode = [std, ...]', which would be fine, but once regex-automata
+// 0.3 is a thing, I believe we can drop once_cell altogether and thus drop
+// the need for 'std' to be enabled when 'unicode' is enabled. But if we make
+// 'unicode' also enable 'std', then it would be a breaking change to remove
+// 'std' from that list.
+//
+// So, for right now, we force folks to explicitly say they want 'std' if they
+// want 'unicode'. In the future, we should be able to relax this.
+#[cfg(all(feature = "unicode", not(feature = "std")))]
+compile_error!("enabling 'unicode' requires enabling 'std'");
+
+#[cfg(feature = "alloc")]
+extern crate alloc;
pub use crate::bstr::BStr;
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
pub use crate::bstring::BString;
+#[cfg(feature = "unicode")]
+pub use crate::ext_slice::Fields;
pub use crate::ext_slice::{
- ByteSlice, Bytes, Fields, FieldsWith, Find, FindReverse, Finder,
- FinderReverse, Lines, LinesWithTerminator, Split, SplitN, SplitNReverse,
- SplitReverse, B,
+ ByteSlice, Bytes, FieldsWith, Find, FindReverse, Finder, FinderReverse,
+ Lines, LinesWithTerminator, Split, SplitN, SplitNReverse, SplitReverse, B,
};
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
pub use crate::ext_vec::{concat, join, ByteVec, DrainBytes, FromUtf8Error};
#[cfg(feature = "unicode")]
pub use crate::unicode::{
@@ -391,26 +434,28 @@ pub use crate::utf8::{
mod ascii;
mod bstr;
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
mod bstring;
mod byteset;
mod ext_slice;
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
mod ext_vec;
mod impls;
#[cfg(feature = "std")]
pub mod io;
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests;
#[cfg(feature = "unicode")]
mod unicode;
mod utf8;
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod apitests {
- use crate::bstr::BStr;
- use crate::bstring::BString;
- use crate::ext_slice::{Finder, FinderReverse};
+ use crate::{
+ bstr::BStr,
+ bstring::BString,
+ ext_slice::{Finder, FinderReverse},
+ };
#[test]
fn oibits() {
diff --git a/src/tests.rs b/src/tests.rs
index f4179fd..03a4461 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -6,7 +6,7 @@
///
/// The first element in each tuple is the expected result of lossy decoding,
/// while the second element is the input given.
-pub const LOSSY_TESTS: &[(&str, &[u8])] = &[
+pub(crate) const LOSSY_TESTS: &[(&str, &[u8])] = &[
("a", b"a"),
("\u{FFFD}", b"\xFF"),
("\u{FFFD}\u{FFFD}", b"\xFF\xFF"),
diff --git a/src/unicode/data/GraphemeBreakTest.txt b/src/unicode/data/GraphemeBreakTest.txt
index fb4fec9..eff2fd3 100644
--- a/src/unicode/data/GraphemeBreakTest.txt
+++ b/src/unicode/data/GraphemeBreakTest.txt
@@ -1,6 +1,6 @@
-# GraphemeBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:12 GMT
-# © 2019 Unicode®, Inc.
+# GraphemeBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:32 GMT
+# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
diff --git a/src/unicode/data/SentenceBreakTest.txt b/src/unicode/data/SentenceBreakTest.txt
index 7c1c34a..61ea42c 100644
--- a/src/unicode/data/SentenceBreakTest.txt
+++ b/src/unicode/data/SentenceBreakTest.txt
@@ -1,6 +1,6 @@
-# SentenceBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:28 GMT
-# © 2019 Unicode®, Inc.
+# SentenceBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:40 GMT
+# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
diff --git a/src/unicode/data/WordBreakTest.txt b/src/unicode/data/WordBreakTest.txt
index facd892..1d1435b 100644
--- a/src/unicode/data/WordBreakTest.txt
+++ b/src/unicode/data/WordBreakTest.txt
@@ -1,6 +1,6 @@
-# WordBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:29 GMT
-# © 2019 Unicode®, Inc.
+# WordBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:40 GMT
+# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
diff --git a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
index 0efaaf2..31f99c1 100644
--- a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
index eb24025..3a51728 100644
--- a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_fwd.rs b/src/unicode/fsm/grapheme_break_fwd.rs
index b53b1d7..dea4a7e 100644
--- a/src/unicode/fsm/grapheme_break_fwd.rs
+++ b/src/unicode/fsm/grapheme_break_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name GRAPHEME_BREAK_FWD --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
index d42cd36..742d2a6 100644
--- a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
+++ b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
index c75ea5f..d1937f2 100644
--- a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
+++ b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_rev.rs b/src/unicode/fsm/grapheme_break_rev.rs
index 93e888c..2d2cd54 100644
--- a/src/unicode/fsm/grapheme_break_rev.rs
+++ b/src/unicode/fsm/grapheme_break_rev.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name GRAPHEME_BREAK_REV --reverse --longest --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_rev.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_rev.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/regional_indicator_rev.rs b/src/unicode/fsm/regional_indicator_rev.rs
index 2bf7e4c..db7a40f 100644
--- a/src/unicode/fsm/regional_indicator_rev.rs
+++ b/src/unicode/fsm/regional_indicator_rev.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name REGIONAL_INDICATOR_REV --reverse --classes --minimize --anchored --premultiply --state-size 1 src/unicode/fsm/ \p{gcb=Regional_Indicator}
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static REGIONAL_INDICATOR_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static REGIONAL_INDICATOR_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
index a1813d7..1abdae8 100644
--- a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
index 2763583..2f8aadd 100644
--- a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/sentence_break_fwd.rs b/src/unicode/fsm/sentence_break_fwd.rs
index cc937a4..97dd658 100644
--- a/src/unicode/fsm/sentence_break_fwd.rs
+++ b/src/unicode/fsm/sentence_break_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name SENTENCE_BREAK_FWD --minimize --sparse --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static SENTENCE_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("sentence_break_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static SENTENCE_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("sentence_break_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/simple_word_fwd.bigendian.dfa b/src/unicode/fsm/simple_word_fwd.bigendian.dfa
index adc64c1..888e465 100644
--- a/src/unicode/fsm/simple_word_fwd.bigendian.dfa
+++ b/src/unicode/fsm/simple_word_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/simple_word_fwd.littleendian.dfa b/src/unicode/fsm/simple_word_fwd.littleendian.dfa
index dd48386..a1d527c 100644
--- a/src/unicode/fsm/simple_word_fwd.littleendian.dfa
+++ b/src/unicode/fsm/simple_word_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/simple_word_fwd.rs b/src/unicode/fsm/simple_word_fwd.rs
index f1f3da5..32b69b6 100644
--- a/src/unicode/fsm/simple_word_fwd.rs
+++ b/src/unicode/fsm/simple_word_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name SIMPLE_WORD_FWD --sparse --minimize --state-size 2 src/unicode/fsm/ \w
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static SIMPLE_WORD_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("simple_word_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static SIMPLE_WORD_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("simple_word_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/whitespace_anchored_fwd.rs b/src/unicode/fsm/whitespace_anchored_fwd.rs
index 419b5d4..0780412 100644
--- a/src/unicode/fsm/whitespace_anchored_fwd.rs
+++ b/src/unicode/fsm/whitespace_anchored_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name WHITESPACE_ANCHORED_FWD --anchored --classes --premultiply --minimize --state-size 1 src/unicode/fsm/ \s+
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static WHITESPACE_ANCHORED_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static WHITESPACE_ANCHORED_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/whitespace_anchored_rev.rs b/src/unicode/fsm/whitespace_anchored_rev.rs
index 301b03c..3d0d7a6 100644
--- a/src/unicode/fsm/whitespace_anchored_rev.rs
+++ b/src/unicode/fsm/whitespace_anchored_rev.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse --anchored --classes --premultiply --minimize --state-size 2 src/unicode/fsm/ \s+
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = {
+pub static WHITESPACE_ANCHORED_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u16], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u16; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = {
+pub static WHITESPACE_ANCHORED_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u16], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u16; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/word_break_fwd.bigendian.dfa b/src/unicode/fsm/word_break_fwd.bigendian.dfa
index 1e75db6..efb9c81 100644
--- a/src/unicode/fsm/word_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/word_break_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/word_break_fwd.littleendian.dfa b/src/unicode/fsm/word_break_fwd.littleendian.dfa
index e3093a3..9a716d0 100644
--- a/src/unicode/fsm/word_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/word_break_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/word_break_fwd.rs b/src/unicode/fsm/word_break_fwd.rs
index fb041b7..dcb5f6b 100644
--- a/src/unicode/fsm/word_break_fwd.rs
+++ b/src/unicode/fsm/word_break_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name WORD_BREAK_FWD --sparse --minimize --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static WORD_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("word_break_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static WORD_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("word_break_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/grapheme.rs b/src/unicode/grapheme.rs
index ad31cf1..13b730c 100644
--- a/src/unicode/grapheme.rs
+++ b/src/unicode/grapheme.rs
@@ -1,10 +1,14 @@
use regex_automata::DFA;
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD;
-use crate::unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV;
-use crate::unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV;
-use crate::utf8;
+use crate::{
+ ext_slice::ByteSlice,
+ unicode::fsm::{
+ grapheme_break_fwd::GRAPHEME_BREAK_FWD,
+ grapheme_break_rev::GRAPHEME_BREAK_REV,
+ regional_indicator_rev::REGIONAL_INDICATOR_REV,
+ },
+ utf8,
+};
/// An iterator over grapheme clusters in a byte string.
///
@@ -125,7 +129,7 @@ pub struct GraphemeIndices<'a> {
impl<'a> GraphemeIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> {
- GraphemeIndices { bs: bs, forward_index: 0, reverse_index: bs.len() }
+ GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() }
}
/// View the underlying data as a subslice of the original data.
@@ -191,6 +195,22 @@ impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) {
if bs.is_empty() {
("", 0)
+ } else if bs.len() >= 2
+ && bs[0].is_ascii()
+ && bs[1].is_ascii()
+ && !bs[0].is_ascii_whitespace()
+ {
+ // FIXME: It is somewhat sad that we have to special case this, but it
+ // leads to a significant speed up in predominantly ASCII text. The
+ // issue here is that the DFA has a bit of overhead, and running it for
+ // every byte in mostly ASCII text results in a bit slowdown. We should
+ // re-litigate this once regex-automata 0.3 is out, but it might be
+ // hard to avoid the special case. A DFA is always going to at least
+ // require some memory access.
+
+ // Safe because all ASCII bytes are valid UTF-8.
+ let grapheme = unsafe { bs[..1].to_str_unchecked() };
+ (grapheme, 1)
} else if let Some(end) = GRAPHEME_BREAK_FWD.find(bs) {
// Safe because a match can only occur for valid UTF-8.
let grapheme = unsafe { bs[..end].to_str_unchecked() };
@@ -257,15 +277,17 @@ fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
+ #[cfg(not(miri))]
use ucd_parse::GraphemeClusterBreakTest;
+ use crate::{ext_slice::ByteSlice, tests::LOSSY_TESTS};
+
use super::*;
- use crate::ext_slice::ByteSlice;
- use crate::tests::LOSSY_TESTS;
#[test]
+ #[cfg(not(miri))]
fn forward_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.grapheme_clusters.concat();
@@ -288,6 +310,7 @@ mod tests {
}
#[test]
+ #[cfg(not(miri))]
fn reverse_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.grapheme_clusters.concat();
@@ -329,15 +352,18 @@ mod tests {
}
}
+ #[cfg(not(miri))]
fn uniescape(s: &str) -> String {
s.chars().flat_map(|c| c.escape_unicode()).collect::<String>()
}
+ #[cfg(not(miri))]
fn uniescape_vec(strs: &[String]) -> Vec<String> {
strs.iter().map(|s| uniescape(s)).collect()
}
/// Return all of the UCD for grapheme breaks.
+ #[cfg(not(miri))]
fn ucdtests() -> Vec<GraphemeClusterBreakTest> {
const TESTDATA: &'static str =
include_str!("data/GraphemeBreakTest.txt");
diff --git a/src/unicode/mod.rs b/src/unicode/mod.rs
index 60318f4..80638e8 100644
--- a/src/unicode/mod.rs
+++ b/src/unicode/mod.rs
@@ -1,8 +1,8 @@
-pub use self::grapheme::{decode_grapheme, GraphemeIndices, Graphemes};
-pub use self::sentence::{SentenceIndices, Sentences};
-pub use self::whitespace::{whitespace_len_fwd, whitespace_len_rev};
-pub use self::word::{
- WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks,
+pub use self::{
+ grapheme::{decode_grapheme, GraphemeIndices, Graphemes},
+ sentence::{SentenceIndices, Sentences},
+ whitespace::{whitespace_len_fwd, whitespace_len_rev},
+ word::{WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks},
};
mod fsm;
diff --git a/src/unicode/sentence.rs b/src/unicode/sentence.rs
index 063f342..ff29c7e 100644
--- a/src/unicode/sentence.rs
+++ b/src/unicode/sentence.rs
@@ -1,8 +1,9 @@
use regex_automata::DFA;
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD;
-use crate::utf8;
+use crate::{
+ ext_slice::ByteSlice,
+ unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8,
+};
/// An iterator over sentences in a byte string.
///
@@ -97,7 +98,7 @@ pub struct SentenceIndices<'a> {
impl<'a> SentenceIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> {
- SentenceIndices { bs: bs, forward_index: 0 }
+ SentenceIndices { bs, forward_index: 0 }
}
/// View the underlying data as a subslice of the original data.
@@ -156,13 +157,15 @@ fn decode_sentence(bs: &[u8]) -> (&str, usize) {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
+ #[cfg(not(miri))]
use ucd_parse::SentenceBreakTest;
use crate::ext_slice::ByteSlice;
#[test]
+ #[cfg(not(miri))]
fn forward_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.sentences.concat();
@@ -198,11 +201,13 @@ mod tests {
bytes.sentences().collect()
}
+ #[cfg(not(miri))]
fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
strs.iter().map(|s| s.as_ref().as_bytes()).collect()
}
/// Return all of the UCD for sentence breaks.
+ #[cfg(not(miri))]
fn ucdtests() -> Vec<SentenceBreakTest> {
const TESTDATA: &'static str =
include_str!("data/SentenceBreakTest.txt");
diff --git a/src/unicode/whitespace.rs b/src/unicode/whitespace.rs
index 949a83f..b5eff30 100644
--- a/src/unicode/whitespace.rs
+++ b/src/unicode/whitespace.rs
@@ -1,7 +1,9 @@
use regex_automata::DFA;
-use crate::unicode::fsm::whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD;
-use crate::unicode::fsm::whitespace_anchored_rev::WHITESPACE_ANCHORED_REV;
+use crate::unicode::fsm::{
+ whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD,
+ whitespace_anchored_rev::WHITESPACE_ANCHORED_REV,
+};
/// Return the first position of a non-whitespace character.
pub fn whitespace_len_fwd(slice: &[u8]) -> usize {
diff --git a/src/unicode/word.rs b/src/unicode/word.rs
index e0a5701..849f0c8 100644
--- a/src/unicode/word.rs
+++ b/src/unicode/word.rs
@@ -1,9 +1,12 @@
use regex_automata::DFA;
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD;
-use crate::unicode::fsm::word_break_fwd::WORD_BREAK_FWD;
-use crate::utf8;
+use crate::{
+ ext_slice::ByteSlice,
+ unicode::fsm::{
+ simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD,
+ },
+ utf8,
+};
/// An iterator over words in a byte string.
///
@@ -254,7 +257,7 @@ pub struct WordsWithBreakIndices<'a> {
impl<'a> WordsWithBreakIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> {
- WordsWithBreakIndices { bs: bs, forward_index: 0 }
+ WordsWithBreakIndices { bs, forward_index: 0 }
}
/// View the underlying data as a subslice of the original data.
@@ -316,13 +319,15 @@ fn decode_word(bs: &[u8]) -> (&str, usize) {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
+ #[cfg(not(miri))]
use ucd_parse::WordBreakTest;
use crate::ext_slice::ByteSlice;
#[test]
+ #[cfg(not(miri))]
fn forward_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.words.concat();
@@ -379,17 +384,26 @@ mod tests {
assert_eq!(vec!["1XY"], words(b"1XY"));
assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes()));
+
+ // Tests that Vithkuqi works, which was introduced in Unicode 14.
+ // This test fails prior to Unicode 14.
+ assert_eq!(
+ vec!["\u{10570}\u{10597}"],
+ words("\u{10570}\u{10597}".as_bytes())
+ );
}
fn words(bytes: &[u8]) -> Vec<&str> {
bytes.words_with_breaks().collect()
}
+ #[cfg(not(miri))]
fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
strs.iter().map(|s| s.as_ref().as_bytes()).collect()
}
/// Return all of the UCD for word breaks.
+ #[cfg(not(miri))]
fn ucdtests() -> Vec<WordBreakTest> {
const TESTDATA: &'static str = include_str!("data/WordBreakTest.txt");
diff --git a/src/utf8.rs b/src/utf8.rs
index 5c7de36..4b5bc20 100644
--- a/src/utf8.rs
+++ b/src/utf8.rs
@@ -1,13 +1,9 @@
-use core::char;
-use core::cmp;
-use core::fmt;
-use core::str;
+use core::{char, cmp, fmt, str};
+
#[cfg(feature = "std")]
use std::error;
-use crate::ascii;
-use crate::bstr::BStr;
-use crate::ext_slice::ByteSlice;
+use crate::{ascii, bstr::BStr, ext_slice::ByteSlice};
// The UTF-8 decoder provided here is based on the one presented here:
// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
@@ -75,7 +71,7 @@ const STATES_FORWARD: &'static [u8] = &[
///
/// When invalid UTF-8 byte sequences are found, they are substituted with the
/// Unicode replacement codepoint (`U+FFFD`) using the
-/// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html).
+/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
///
/// This iterator is created by the
/// [`chars`](trait.ByteSlice.html#method.chars) method provided by the
@@ -146,7 +142,7 @@ impl<'a> DoubleEndedIterator for Chars<'a> {
///
/// When invalid UTF-8 byte sequences are found, they are substituted with the
/// Unicode replacement codepoint (`U+FFFD`) using the
-/// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html).
+/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
///
/// Note that this is slightly different from the `CharIndices` iterator
/// provided by the standard library. Aside from working on possibly invalid
@@ -168,7 +164,7 @@ pub struct CharIndices<'a> {
impl<'a> CharIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> {
- CharIndices { bs: bs, forward_index: 0, reverse_index: bs.len() }
+ CharIndices { bs, forward_index: 0, reverse_index: bs.len() }
}
/// View the underlying data as a subslice of the original data.
@@ -392,7 +388,7 @@ impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {}
/// assert_eq!(err.error_len(), Some(3));
///
/// // In contrast to the above which contains a single invalid prefix,
-/// // consider the case of multiple individal bytes that are never valid
+/// // consider the case of multiple individual bytes that are never valid
/// // prefixes. Note how the value of error_len changes!
/// let s = b"foobar\xFF\xFFquux";
/// let err = s.to_str().unwrap_err();
@@ -406,7 +402,7 @@ impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {}
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
/// ```
-#[derive(Debug, Eq, PartialEq)]
+#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Utf8Error {
valid_up_to: usize,
error_len: Option<usize>,
@@ -854,13 +850,15 @@ fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
(b & 0b1100_0000) != 0b1000_0000
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
use std::char;
- use crate::ext_slice::{ByteSlice, B};
- use crate::tests::LOSSY_TESTS;
- use crate::utf8::{self, Utf8Error};
+ use crate::{
+ ext_slice::{ByteSlice, B},
+ tests::LOSSY_TESTS,
+ utf8::{self, Utf8Error},
+ };
fn utf8e(valid_up_to: usize) -> Utf8Error {
Utf8Error { valid_up_to, error_len: None }
@@ -871,6 +869,7 @@ mod tests {
}
#[test]
+ #[cfg(not(miri))]
fn validate_all_codepoints() {
for i in 0..(0x10FFFF + 1) {
let cp = match char::from_u32(i) {