Diffstat (limited to 'src')
-rw-r--r--  src/ascii.rs                                         |  23
-rw-r--r--  src/bstr.rs                                          |  32
-rw-r--r--  src/bstring.rs                                       |  46
-rw-r--r--  src/byteset/mod.rs                                   |   3
-rw-r--r--  src/byteset/scalar.rs                                |  46
-rw-r--r--  src/ext_slice.rs                                     | 666
-rw-r--r--  src/ext_vec.rs                                       | 121
-rw-r--r--  src/impls.rs                                         | 144
-rw-r--r--  src/io.rs                                            |  33
-rw-r--r--  src/lib.rs                                           | 111
-rw-r--r--  src/tests.rs                                         |   2
-rw-r--r--  src/unicode/data/GraphemeBreakTest.txt               |   6
-rw-r--r--  src/unicode/data/SentenceBreakTest.txt               |   6
-rw-r--r--  src/unicode/data/WordBreakTest.txt                   |   6
-rw-r--r--  src/unicode/fsm/grapheme_break_fwd.bigendian.dfa     | Bin 10589 -> 10781 bytes
-rw-r--r--  src/unicode/fsm/grapheme_break_fwd.littleendian.dfa  | Bin 10589 -> 10781 bytes
-rw-r--r--  src/unicode/fsm/grapheme_break_fwd.rs                |  26
-rw-r--r--  src/unicode/fsm/grapheme_break_rev.bigendian.dfa     | Bin 53905 -> 55271 bytes
-rw-r--r--  src/unicode/fsm/grapheme_break_rev.littleendian.dfa  | Bin 53905 -> 55271 bytes
-rw-r--r--  src/unicode/fsm/grapheme_break_rev.rs                |  26
-rw-r--r--  src/unicode/fsm/regional_indicator_rev.rs            |  26
-rw-r--r--  src/unicode/fsm/sentence_break_fwd.bigendian.dfa     | Bin 149903 -> 153619 bytes
-rw-r--r--  src/unicode/fsm/sentence_break_fwd.littleendian.dfa  | Bin 149903 -> 153619 bytes
-rw-r--r--  src/unicode/fsm/sentence_break_fwd.rs                |  26
-rw-r--r--  src/unicode/fsm/simple_word_fwd.bigendian.dfa        | Bin 8975 -> 9237 bytes
-rw-r--r--  src/unicode/fsm/simple_word_fwd.littleendian.dfa     | Bin 8975 -> 9237 bytes
-rw-r--r--  src/unicode/fsm/simple_word_fwd.rs                   |  26
-rw-r--r--  src/unicode/fsm/whitespace_anchored_fwd.rs           |  26
-rw-r--r--  src/unicode/fsm/whitespace_anchored_rev.rs           |  26
-rw-r--r--  src/unicode/fsm/word_break_fwd.bigendian.dfa         | Bin 229739 -> 236309 bytes
-rw-r--r--  src/unicode/fsm/word_break_fwd.littleendian.dfa      | Bin 229739 -> 236309 bytes
-rw-r--r--  src/unicode/fsm/word_break_fwd.rs                    |  26
-rw-r--r--  src/unicode/grapheme.rs                              |  44
-rw-r--r--  src/unicode/mod.rs                                   |  10
-rw-r--r--  src/unicode/sentence.rs                              |  15
-rw-r--r--  src/unicode/whitespace.rs                            |   6
-rw-r--r--  src/unicode/word.rs                                  |  26
-rw-r--r--  src/utf8.rs                                          |  29
38 files changed, 977 insertions, 606 deletions
diff --git a/src/ascii.rs b/src/ascii.rs
index bb2b679..259d41f 100644
--- a/src/ascii.rs
+++ b/src/ascii.rs
@@ -23,18 +23,18 @@ use core::mem;
// means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to
// _mm_movemask_epi8.
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const USIZE_BYTES: usize = mem::size_of::<usize>();
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const FALLBACK_LOOP_SIZE: usize = 2 * USIZE_BYTES;
// This is a mask where the most significant bit of each byte in the usize
// is set. We test this bit to determine whether a character is ASCII or not.
// Namely, a single byte is regarded as an ASCII codepoint if and only if its
// most significant bit is not set.
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const ASCII_MASK_U64: u64 = 0x8080808080808080;
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const ASCII_MASK: usize = ASCII_MASK_U64 as usize;
/// Returns the index of the first non ASCII byte in the given slice.
@@ -42,18 +42,18 @@ const ASCII_MASK: usize = ASCII_MASK_U64 as usize;
/// If slice only contains ASCII bytes, then the length of the slice is
/// returned.
pub fn first_non_ascii_byte(slice: &[u8]) -> usize {
- #[cfg(not(target_arch = "x86_64"))]
+ #[cfg(any(miri, not(target_arch = "x86_64")))]
{
first_non_ascii_byte_fallback(slice)
}
- #[cfg(target_arch = "x86_64")]
+ #[cfg(all(not(miri), target_arch = "x86_64"))]
{
first_non_ascii_byte_sse2(slice)
}
}
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize {
let align = USIZE_BYTES - 1;
let start_ptr = slice.as_ptr();
@@ -115,7 +115,7 @@ fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize {
}
}
-#[cfg(target_arch = "x86_64")]
+#[cfg(all(not(miri), target_arch = "x86_64"))]
fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize {
use core::arch::x86_64::*;
@@ -221,7 +221,7 @@ unsafe fn first_non_ascii_byte_slow(
/// bytes is not an ASCII byte.
///
/// The position returned is always in the inclusive range [0, 7].
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
fn first_non_ascii_byte_mask(mask: usize) -> usize {
#[cfg(target_endian = "little")]
{
@@ -245,7 +245,7 @@ unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 {
ptr.offset((amt as isize).wrapping_neg())
}
-#[cfg(any(test, not(target_arch = "x86_64")))]
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
unsafe fn read_unaligned_usize(ptr: *const u8) -> usize {
use core::ptr;
@@ -286,6 +286,7 @@ mod tests {
#[test]
#[cfg(target_arch = "x86_64")]
+ #[cfg(not(miri))]
fn positive_sse2_forward() {
for i in 0..517 {
let b = "a".repeat(i).into_bytes();
@@ -294,6 +295,7 @@ mod tests {
}
#[test]
+ #[cfg(not(miri))]
fn negative_fallback_forward() {
for i in 0..517 {
for align in 0..65 {
@@ -315,6 +317,7 @@ mod tests {
#[test]
#[cfg(target_arch = "x86_64")]
+ #[cfg(not(miri))]
fn negative_sse2_forward() {
for i in 0..517 {
for align in 0..65 {
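
The `miri` gating above follows a single pattern: the SSE2 entry point is compiled out whenever Miri is interpreting the code, and every call routes to the scalar fallback, while `test` keeps the fallback compiled on x86_64 so it can still be unit tested there. A minimal sketch of that pattern (not the crate's code; both bodies are stand-ins):

```
// Sketch of the cfg routing used in ascii.rs; the real crate's bodies differ.
pub fn first_non_ascii(slice: &[u8]) -> usize {
    #[cfg(any(miri, not(target_arch = "x86_64")))]
    {
        fallback(slice)
    }
    #[cfg(all(not(miri), target_arch = "x86_64"))]
    {
        sse2(slice)
    }
}

#[cfg(any(test, miri, not(target_arch = "x86_64")))]
fn fallback(slice: &[u8]) -> usize {
    // Stand-in body; the real fallback works a machine word at a time.
    slice.iter().position(|&b| !b.is_ascii()).unwrap_or(slice.len())
}

#[cfg(all(not(miri), target_arch = "x86_64"))]
fn sse2(slice: &[u8]) -> usize {
    // Stand-in body; the real routine uses SSE2 intrinsics.
    slice.iter().position(|&b| !b.is_ascii()).unwrap_or(slice.len())
}
```
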
diff --git a/src/bstr.rs b/src/bstr.rs
index 1e3c91b..5036f06 100644
--- a/src/bstr.rs
+++ b/src/bstr.rs
@@ -1,5 +1,8 @@
use core::mem;
+#[cfg(feature = "alloc")]
+use alloc::boxed::Box;
+
/// A wrapper for `&[u8]` that provides convenient string oriented trait impls.
///
/// If you need ownership or a growable byte string buffer, then use
@@ -33,8 +36,31 @@ pub struct BStr {
}
impl BStr {
+ /// Directly creates a `BStr` slice from anything that can be converted
+ /// to a byte slice.
+ ///
+ /// This is very similar to the [`B`](crate::B) function, except this
+ /// returns a `&BStr` instead of a `&[u8]`.
+ ///
+ /// This is a cost-free conversion.
+ ///
+ /// # Example
+ ///
+ /// You can create `BStr`'s from byte arrays, byte slices or even string
+ /// slices:
+ ///
+ /// ```
+ /// use bstr::BStr;
+ ///
+ /// let a = BStr::new(b"abc");
+ /// let b = BStr::new(&b"abc"[..]);
+ /// let c = BStr::new("abc");
+ ///
+ /// assert_eq!(a, b);
+ /// assert_eq!(a, c);
+ /// ```
#[inline]
- pub(crate) fn new<B: ?Sized + AsRef<[u8]>>(bytes: &B) -> &BStr {
+ pub fn new<'a, B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> &'a BStr {
BStr::from_bytes(bytes.as_ref())
}
@@ -56,13 +82,13 @@ impl BStr {
}
#[inline]
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
pub(crate) fn from_boxed_bytes(slice: Box<[u8]>) -> Box<BStr> {
unsafe { Box::from_raw(Box::into_raw(slice) as _) }
}
#[inline]
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
pub(crate) fn into_boxed_bytes(slice: Box<BStr>) -> Box<[u8]> {
unsafe { Box::from_raw(Box::into_raw(slice) as _) }
}
diff --git a/src/bstring.rs b/src/bstring.rs
index 30093ba..d144b1d 100644
--- a/src/bstring.rs
+++ b/src/bstring.rs
@@ -1,3 +1,5 @@
+use alloc::vec::Vec;
+
use crate::bstr::BStr;
/// A wrapper for `Vec<u8>` that provides convenient string oriented trait
@@ -38,16 +40,43 @@ use crate::bstr::BStr;
/// region of memory containing the bytes, a length and a capacity.
#[derive(Clone, Hash)]
pub struct BString {
- pub(crate) bytes: Vec<u8>,
+ bytes: Vec<u8>,
}
impl BString {
+ /// Constructs a new `BString` from the given [`Vec`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use bstr::BString;
+ ///
+ /// let mut b = BString::new(Vec::with_capacity(10));
+ /// ```
+ ///
+ /// This function is `const`:
+ ///
+ /// ```
+ /// use bstr::BString;
+ ///
+ /// const B: BString = BString::new(vec![]);
+ /// ```
+ #[inline]
+ pub const fn new(bytes: Vec<u8>) -> BString {
+ BString { bytes }
+ }
+
#[inline]
pub(crate) fn as_bytes(&self) -> &[u8] {
&self.bytes
}
#[inline]
+ pub(crate) fn as_bytes_mut(&mut self) -> &mut [u8] {
+ &mut self.bytes
+ }
+
+ #[inline]
pub(crate) fn as_bstr(&self) -> &BStr {
BStr::new(&self.bytes)
}
@@ -56,4 +85,19 @@ impl BString {
pub(crate) fn as_mut_bstr(&mut self) -> &mut BStr {
BStr::new_mut(&mut self.bytes)
}
+
+ #[inline]
+ pub(crate) fn as_vec(&self) -> &Vec<u8> {
+ &self.bytes
+ }
+
+ #[inline]
+ pub(crate) fn as_vec_mut(&mut self) -> &mut Vec<u8> {
+ &mut self.bytes
+ }
+
+ #[inline]
+ pub(crate) fn into_vec(self) -> Vec<u8> {
+ self.bytes
+ }
}
diff --git a/src/byteset/mod.rs b/src/byteset/mod.rs
index 043d309..c6c697c 100644
--- a/src/byteset/mod.rs
+++ b/src/byteset/mod.rs
@@ -1,4 +1,5 @@
use memchr::{memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3};
+
mod scalar;
#[inline]
@@ -79,7 +80,7 @@ pub(crate) fn rfind_not(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std", not(miri)))]
mod tests {
quickcheck::quickcheck! {
fn qc_byteset_forward_matches_naive(
diff --git a/src/byteset/scalar.rs b/src/byteset/scalar.rs
index 9bd34a8..28bff67 100644
--- a/src/byteset/scalar.rs
+++ b/src/byteset/scalar.rs
@@ -1,9 +1,8 @@
// This is adapted from `fallback.rs` from rust-memchr. It's modified to return
-// the 'inverse' query of memchr, e.g. finding the first byte not in the provided
-// set. This is simple for the 1-byte case.
+// the 'inverse' query of memchr, e.g. finding the first byte not in the
+// provided set. This is simple for the 1-byte case.
-use core::cmp;
-use core::usize;
+use core::{cmp, usize};
#[cfg(target_pointer_width = "32")]
const USIZE_BYTES: usize = 4;
@@ -29,10 +28,11 @@ pub fn inv_memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
let loop_size = cmp::min(LOOP_SIZE, haystack.len());
let align = USIZE_BYTES - 1;
let start_ptr = haystack.as_ptr();
- let end_ptr = haystack[haystack.len()..].as_ptr();
- let mut ptr = start_ptr;
unsafe {
+ let end_ptr = haystack.as_ptr().add(haystack.len());
+ let mut ptr = start_ptr;
+
if haystack.len() < USIZE_BYTES {
return forward_search(start_ptr, end_ptr, ptr, confirm);
}
@@ -68,10 +68,11 @@ pub fn inv_memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
let loop_size = cmp::min(LOOP_SIZE, haystack.len());
let align = USIZE_BYTES - 1;
let start_ptr = haystack.as_ptr();
- let end_ptr = haystack[haystack.len()..].as_ptr();
- let mut ptr = end_ptr;
unsafe {
+ let end_ptr = haystack.as_ptr().add(haystack.len());
+ let mut ptr = end_ptr;
+
if haystack.len() < USIZE_BYTES {
return reverse_search(start_ptr, end_ptr, ptr, confirm);
}
@@ -81,7 +82,7 @@ pub fn inv_memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
return reverse_search(start_ptr, end_ptr, ptr, confirm);
}
- ptr = (end_ptr as usize & !align) as *const u8;
+ ptr = ptr.sub(end_ptr as usize & align);
debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
@@ -174,9 +175,10 @@ pub(crate) fn reverse_search_bytes<F: Fn(u8) -> bool>(
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
use super::{inv_memchr, inv_memrchr};
+
// search string, search byte, inv_memchr result, inv_memrchr result.
// these are expanded into a much larger set of tests in build_tests
const TESTS: &[(&[u8], u8, usize, usize)] = &[
@@ -192,10 +194,15 @@ mod tests {
type TestCase = (Vec<u8>, u8, Option<(usize, usize)>);
fn build_tests() -> Vec<TestCase> {
+ #[cfg(not(miri))]
+ const MAX_PER: usize = 515;
+ #[cfg(miri)]
+ const MAX_PER: usize = 10;
+
let mut result = vec![];
for &(search, byte, fwd_pos, rev_pos) in TESTS {
result.push((search.to_vec(), byte, Some((fwd_pos, rev_pos))));
- for i in 1..515 {
+ for i in 1..MAX_PER {
// add a bunch of copies of the search byte to the end.
let mut suffixed: Vec<u8> = search.into();
suffixed.extend(std::iter::repeat(byte).take(i));
@@ -225,7 +232,7 @@ mod tests {
}
// build non-matching tests for several sizes
- for i in 0..515 {
+ for i in 0..MAX_PER {
result.push((
std::iter::repeat(b'\0').take(i).collect(),
b'\0',
@@ -239,6 +246,12 @@ mod tests {
#[test]
fn test_inv_memchr() {
use crate::{ByteSlice, B};
+
+ #[cfg(not(miri))]
+ const MAX_OFFSET: usize = 130;
+ #[cfg(miri)]
+ const MAX_OFFSET: usize = 13;
+
for (search, byte, matching) in build_tests() {
assert_eq!(
inv_memchr(byte, &search),
@@ -256,13 +269,14 @@ mod tests {
// better printing
B(&search).as_bstr(),
);
- // Test a rather large number off offsets for potential alignment issues
- for offset in 1..130 {
+            // Test a rather large number of offsets for potential alignment
+ // issues.
+ for offset in 1..MAX_OFFSET {
if offset >= search.len() {
break;
}
- // If this would cause us to shift the results off the end, skip
- // it so that we don't have to recompute them.
+ // If this would cause us to shift the results off the end,
+ // skip it so that we don't have to recompute them.
if let Some((f, r)) = matching {
if offset > f || offset > r {
break;
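
The reverse-search change above swaps an integer-to-pointer cast for plain pointer arithmetic. Both forms round the end pointer down to a word boundary; the new one keeps the result derived from the original pointer, which is what Miri's provenance checking expects. A small worked example of the equivalence (the address is hypothetical, not taken from the crate):

```
// Both expressions round an address down to a USIZE_BYTES boundary.
fn main() {
    const USIZE_BYTES: usize = core::mem::size_of::<usize>();
    const ALIGN: usize = USIZE_BYTES - 1;

    let end_addr: usize = 0x1003; // hypothetical end-of-haystack address
    let old = end_addr & !ALIGN; // old: `(end_ptr as usize & !align) as *const u8`
    let new = end_addr - (end_addr & ALIGN); // new: `ptr.sub(end_ptr as usize & align)`
    assert_eq!(old, new);
    assert_eq!(new % USIZE_BYTES, 0);
    println!("0x{:x} rounds down to 0x{:x}", end_addr, new);
}
```
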
diff --git a/src/ext_slice.rs b/src/ext_slice.rs
index 0cc73af..ec52a61 100644
--- a/src/ext_slice.rs
+++ b/src/ext_slice.rs
@@ -1,17 +1,16 @@
+use core::{iter, slice, str};
+
+#[cfg(all(feature = "alloc", feature = "unicode"))]
+use alloc::vec;
+#[cfg(feature = "alloc")]
+use alloc::{borrow::Cow, string::String, vec::Vec};
+
#[cfg(feature = "std")]
-use std::borrow::Cow;
-#[cfg(feature = "std")]
-use std::ffi::OsStr;
-#[cfg(feature = "std")]
-use std::path::Path;
+use std::{ffi::OsStr, path::Path};
-use core::{iter, ops, ptr, slice, str};
use memchr::{memchr, memmem, memrchr};
-use crate::ascii;
-use crate::bstr::BStr;
-use crate::byteset;
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
use crate::ext_vec::ByteVec;
#[cfg(feature = "unicode")]
use crate::unicode::{
@@ -19,7 +18,12 @@ use crate::unicode::{
SentenceIndices, Sentences, WordIndices, Words, WordsWithBreakIndices,
WordsWithBreaks,
};
-use crate::utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error};
+use crate::{
+ ascii,
+ bstr::BStr,
+ byteset,
+ utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error},
+};
/// A short-hand constructor for building a `&[u8]`.
///
@@ -149,11 +153,12 @@ pub trait ByteSlice: Sealed {
/// Create an immutable byte string from an OS string slice.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns `None` if the given OS string is not valid UTF-8. (For
- /// example, on Windows, file paths are allowed to be a sequence of
- /// arbitrary 16-bit integers. Not all such sequences can be transcoded to
- /// valid UTF-8.)
+ /// When the underlying bytes of OS strings are accessible, then this
+ /// always succeeds and is zero cost. Otherwise, this returns `None` if the
+ /// given OS string is not valid UTF-8. (For example, when the underlying
+ /// bytes are inaccessible on Windows, file paths are allowed to be a
+ /// sequence of arbitrary 16-bit integers. Not all such sequences can be
+ /// transcoded to valid UTF-8.)
///
/// # Examples
///
@@ -190,10 +195,12 @@ pub trait ByteSlice: Sealed {
/// Create an immutable byte string from a file path.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns `None` if the given path is not valid UTF-8. (For example,
- /// on Windows, file paths are allowed to be a sequence of arbitrary 16-bit
- /// integers. Not all such sequences can be transcoded to valid UTF-8.)
+ /// When the underlying bytes of paths are accessible, then this always
+ /// succeeds and is zero cost. Otherwise, this returns `None` if the given
+ /// path is not valid UTF-8. (For example, when the underlying bytes are
+ /// inaccessible on Windows, file paths are allowed to be a sequence of
+ /// arbitrary 16-bit integers. Not all such sequences can be transcoded to
+ /// valid UTF-8.)
///
/// # Examples
///
@@ -230,6 +237,7 @@ pub trait ByteSlice: Sealed {
/// Basic usage:
///
/// ```
+ /// # #[cfg(feature = "alloc")] {
/// use bstr::{B, ByteSlice, ByteVec};
///
/// # fn example() -> Result<(), bstr::Utf8Error> {
@@ -241,6 +249,7 @@ pub trait ByteSlice: Sealed {
/// let err = bstring.to_str().unwrap_err();
/// assert_eq!(8, err.valid_up_to());
/// # Ok(()) }; example().unwrap()
+ /// # }
/// ```
#[inline]
fn to_str(&self) -> Result<&str, Utf8Error> {
@@ -301,7 +310,7 @@ pub trait ByteSlice: Sealed {
/// [W3C's Encoding standard](https://www.w3.org/TR/encoding/).
/// For a more precise description of the maximal subpart strategy, see
/// the Unicode Standard, Chapter 3, Section 9. See also
- /// [Public Review Issue #121](http://www.unicode.org/review/pr-121.html).
+ /// [Public Review Issue #121](https://www.unicode.org/review/pr-121.html).
///
/// N.B. Rust's standard library also appears to use the same strategy,
/// but it does not appear to be an API guarantee.
@@ -341,7 +350,7 @@ pub trait ByteSlice: Sealed {
/// let bs = B(b"\x61\xF1\x80\x80\xE1\x80\xC2\x62");
/// assert_eq!("a\u{FFFD}\u{FFFD}\u{FFFD}b", bs.to_str_lossy());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn to_str_lossy(&self) -> Cow<'_, str> {
match utf8::validate(self.as_bytes()) {
@@ -398,7 +407,7 @@ pub trait ByteSlice: Sealed {
/// bstring.to_str_lossy_into(&mut dest);
/// assert_eq!("☃βツ\u{FFFD}", dest);
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn to_str_lossy_into(&self, dest: &mut String) {
let mut bytes = self.as_bytes();
@@ -428,12 +437,15 @@ pub trait ByteSlice: Sealed {
/// Create an OS string slice from this byte string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns a UTF-8 decoding error if this byte string is not valid
- /// UTF-8. (For example, on Windows, file paths are allowed to be a
- /// sequence of arbitrary 16-bit integers. There is no obvious mapping from
- /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of
- /// 16-bit integers.)
+ /// When OS strings can be constructed from arbitrary byte sequences, this
+ /// always succeeds and is zero cost. Otherwise, this returns a UTF-8
+ /// decoding error if this byte string is not valid UTF-8. (For example,
+ /// assuming the representation of `OsStr` is opaque on Windows, file paths
+ /// are allowed to be a sequence of arbitrary 16-bit integers. There is
+ /// no obvious mapping from an arbitrary sequence of 8-bit integers to an
+ /// arbitrary sequence of 16-bit integers. If the representation of `OsStr`
+    /// is ever opened up, then this will convert any sequence of bytes to an
+ /// `OsStr` without cost.)
///
/// # Examples
///
@@ -467,13 +479,13 @@ pub trait ByteSlice: Sealed {
/// Lossily create an OS string slice from this byte string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this will perform a UTF-8 check and lossily convert this byte string
- /// into valid UTF-8 using the Unicode replacement codepoint.
+ /// When OS strings can be constructed from arbitrary byte sequences, this
+ /// is zero cost and always returns a slice. Otherwise, this will perform a
+ /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
+ /// the Unicode replacement codepoint.
///
- /// Note that this can prevent the correct roundtripping of file paths on
- /// non-Unix systems such as Windows, where file paths are an arbitrary
- /// sequence of 16-bit integers.
+ /// Note that this can prevent the correct roundtripping of file paths when
+ /// the representation of `OsStr` is opaque.
///
/// # Examples
///
@@ -512,12 +524,15 @@ pub trait ByteSlice: Sealed {
/// Create a path slice from this byte string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns a UTF-8 decoding error if this byte string is not valid
- /// UTF-8. (For example, on Windows, file paths are allowed to be a
- /// sequence of arbitrary 16-bit integers. There is no obvious mapping from
- /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of
- /// 16-bit integers.)
+ /// When paths can be constructed from arbitrary byte sequences, this
+ /// always succeeds and is zero cost. Otherwise, this returns a UTF-8
+ /// decoding error if this byte string is not valid UTF-8. (For example,
+ /// assuming the representation of `Path` is opaque on Windows, file paths
+ /// are allowed to be a sequence of arbitrary 16-bit integers. There is
+ /// no obvious mapping from an arbitrary sequence of 8-bit integers to an
+ /// arbitrary sequence of 16-bit integers. If the representation of `Path`
+    /// is ever opened up, then this will convert any sequence of bytes to a
+    /// `Path` without cost.)
///
/// # Examples
///
@@ -537,13 +552,13 @@ pub trait ByteSlice: Sealed {
/// Lossily create a path slice from this byte string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this will perform a UTF-8 check and lossily convert this byte string
- /// into valid UTF-8 using the Unicode replacement codepoint.
+ /// When paths can be constructed from arbitrary byte sequences, this is
+ /// zero cost and always returns a slice. Otherwise, this will perform a
+ /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
+ /// the Unicode replacement codepoint.
///
- /// Note that this can prevent the correct roundtripping of file paths on
- /// non-Unix systems such as Windows, where file paths are an arbitrary
- /// sequence of 16-bit integers.
+ /// Note that this can prevent the correct roundtripping of file paths when
+ /// the representation of `Path` is opaque.
///
/// # Examples
///
@@ -584,15 +599,10 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(b"foo".repeatn(4), B("foofoofoofoo"));
/// assert_eq!(b"foo".repeatn(0), B(""));
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn repeatn(&self, n: usize) -> Vec<u8> {
- let bs = self.as_bytes();
- let mut dst = vec![0; bs.len() * n];
- for i in 0..n {
- dst[i * bs.len()..(i + 1) * bs.len()].copy_from_slice(bs);
- }
- dst
+ self.as_bytes().repeat(n)
}
/// Returns true if and only if this byte string contains the given needle.
@@ -759,10 +769,10 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(matches, vec![0]);
/// ```
#[inline]
- fn find_iter<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
- needle: &'a B,
- ) -> Find<'a> {
+ fn find_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ needle: &'n B,
+ ) -> Find<'h, 'n> {
Find::new(self.as_bytes(), needle.as_ref())
}
@@ -804,10 +814,10 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(matches, vec![0]);
/// ```
#[inline]
- fn rfind_iter<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
- needle: &'a B,
- ) -> FindReverse<'a> {
+ fn rfind_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ needle: &'n B,
+ ) -> FindReverse<'h, 'n> {
FindReverse::new(self.as_bytes(), needle.as_ref())
}
@@ -926,14 +936,17 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(b"foo bar baz".find_byteset(b"zr"), Some(6));
/// assert_eq!(b"foo baz bar".find_byteset(b"bzr"), Some(4));
/// assert_eq!(None, b"foo baz bar".find_byteset(b"\t\n"));
+ /// // The empty byteset never matches.
+ /// assert_eq!(None, b"abc".find_byteset(b""));
+ /// assert_eq!(None, b"".find_byteset(b""));
/// ```
#[inline]
fn find_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
byteset::find(self.as_bytes(), byteset.as_ref())
}
- /// Returns the index of the first occurrence of a byte that is not a member
- /// of the provided set.
+ /// Returns the index of the first occurrence of a byte that is not a
+ /// member of the provided set.
///
/// The `byteset` may be any type that can be cheaply converted into a
/// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
@@ -963,6 +976,10 @@ pub trait ByteSlice: Sealed {
/// assert_eq!(b"foo bar baz".find_not_byteset(b"fo "), Some(4));
/// assert_eq!(b"\t\tbaz bar".find_not_byteset(b" \t\r\n"), Some(2));
/// assert_eq!(b"foo\nbaz\tbar".find_not_byteset(b"\t\n"), Some(0));
+ /// // The negation of the empty byteset matches everything.
+ /// assert_eq!(Some(0), b"abc".find_not_byteset(b""));
+ /// // But an empty string never contains anything.
+ /// assert_eq!(None, b"".find_not_byteset(b""));
/// ```
#[inline]
fn find_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
@@ -1043,8 +1060,9 @@ pub trait ByteSlice: Sealed {
byteset::rfind_not(self.as_bytes(), byteset.as_ref())
}
- /// Returns an iterator over the fields in a byte string, separated by
- /// contiguous whitespace.
+ /// Returns an iterator over the fields in a byte string, separated
+ /// by contiguous whitespace (according to the Unicode property
+ /// `White_Space`).
///
/// # Example
///
@@ -1065,6 +1083,7 @@ pub trait ByteSlice: Sealed {
///
/// assert_eq!(0, B(" \n\t\u{2003}\n \t").fields().count());
/// ```
+ #[cfg(feature = "unicode")]
#[inline]
fn fields(&self) -> Fields<'_> {
Fields::new(self.as_bytes())
@@ -1191,10 +1210,10 @@ pub trait ByteSlice: Sealed {
/// It does *not* give you `["a", "b", "c"]`. For that behavior, use
/// [`fields`](#method.fields) instead.
#[inline]
- fn split_str<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
- splitter: &'a B,
- ) -> Split<'a> {
+ fn split_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ splitter: &'s B,
+ ) -> Split<'h, 's> {
Split::new(self.as_bytes(), splitter.as_ref())
}
@@ -1285,13 +1304,101 @@ pub trait ByteSlice: Sealed {
///
/// It does *not* give you `["a", "b", "c"]`.
#[inline]
- fn rsplit_str<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
- splitter: &'a B,
- ) -> SplitReverse<'a> {
+ fn rsplit_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ splitter: &'s B,
+ ) -> SplitReverse<'h, 's> {
SplitReverse::new(self.as_bytes(), splitter.as_ref())
}
+    /// Split this byte string at the first occurrence of `splitter`.
+ ///
+ /// If the `splitter` is found in the byte string, returns a tuple
+    /// containing the parts of the string before and after the first
+    /// occurrence of `splitter`, respectively. Otherwise, if there are no
+    /// occurrences of `splitter` in the byte string, returns `None`.
+ ///
+ /// The splitter may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// If you need to split on the *last* instance of a delimiter instead, see
+    /// the [`ByteSlice::rsplit_once_str`](#method.rsplit_once_str) method.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert_eq!(
+ /// B("foo,bar").split_once_str(","),
+ /// Some((B("foo"), B("bar"))),
+ /// );
+ /// assert_eq!(
+ /// B("foo,bar,baz").split_once_str(","),
+ /// Some((B("foo"), B("bar,baz"))),
+ /// );
+ /// assert_eq!(B("foo").split_once_str(","), None);
+ /// assert_eq!(B("foo,").split_once_str(b","), Some((B("foo"), B(""))));
+ /// assert_eq!(B(",foo").split_once_str(b","), Some((B(""), B("foo"))));
+ /// ```
+ #[inline]
+ fn split_once_str<'a, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ splitter: &B,
+ ) -> Option<(&'a [u8], &'a [u8])> {
+ let bytes = self.as_bytes();
+ let splitter = splitter.as_ref();
+ let start = Finder::new(splitter).find(bytes)?;
+ let end = start + splitter.len();
+ Some((&bytes[..start], &bytes[end..]))
+ }
+
+    /// Split this byte string at the last occurrence of `splitter`.
+ ///
+ /// If the `splitter` is found in the byte string, returns a tuple
+    /// containing the parts of the string before and after the last
+    /// occurrence of `splitter`, respectively. Otherwise, if there are no
+    /// occurrences of `splitter` in the byte string, returns `None`.
+ ///
+ /// The splitter may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// If you need to split on the *first* instance of a delimiter instead, see
+ /// the [`ByteSlice::split_once_str`](#method.split_once_str) method.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert_eq!(
+ /// B("foo,bar").rsplit_once_str(","),
+ /// Some((B("foo"), B("bar"))),
+ /// );
+ /// assert_eq!(
+ /// B("foo,bar,baz").rsplit_once_str(","),
+ /// Some((B("foo,bar"), B("baz"))),
+ /// );
+ /// assert_eq!(B("foo").rsplit_once_str(","), None);
+ /// assert_eq!(B("foo,").rsplit_once_str(b","), Some((B("foo"), B(""))));
+ /// assert_eq!(B(",foo").rsplit_once_str(b","), Some((B(""), B("foo"))));
+ /// ```
+ #[inline]
+ fn rsplit_once_str<'a, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ splitter: &B,
+ ) -> Option<(&'a [u8], &'a [u8])> {
+ let bytes = self.as_bytes();
+ let splitter = splitter.as_ref();
+ let start = FinderReverse::new(splitter).rfind(bytes)?;
+ let end = start + splitter.len();
+ Some((&bytes[..start], &bytes[end..]))
+ }
+
/// Returns an iterator of at most `limit` substrings of this byte string,
/// separated by the given byte string. If `limit` substrings are yielded,
/// then the last substring will contain the remainder of this byte string.
@@ -1328,11 +1435,11 @@ pub trait ByteSlice: Sealed {
/// assert!(x.is_empty());
/// ```
#[inline]
- fn splitn_str<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
+ fn splitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
limit: usize,
- splitter: &'a B,
- ) -> SplitN<'a> {
+ splitter: &'s B,
+ ) -> SplitN<'h, 's> {
SplitN::new(self.as_bytes(), splitter.as_ref(), limit)
}
@@ -1374,11 +1481,11 @@ pub trait ByteSlice: Sealed {
/// assert!(x.is_empty());
/// ```
#[inline]
- fn rsplitn_str<'a, B: ?Sized + AsRef<[u8]>>(
- &'a self,
+ fn rsplitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
limit: usize,
- splitter: &'a B,
- ) -> SplitNReverse<'a> {
+ splitter: &'s B,
+ ) -> SplitNReverse<'h, 's> {
SplitNReverse::new(self.as_bytes(), splitter.as_ref(), limit)
}
@@ -1416,7 +1523,7 @@ pub trait ByteSlice: Sealed {
/// let s = b"foo".replace("", "Z");
/// assert_eq!(s, "ZfZoZoZ".as_bytes());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn replace<N: AsRef<[u8]>, R: AsRef<[u8]>>(
&self,
@@ -1462,7 +1569,7 @@ pub trait ByteSlice: Sealed {
/// let s = b"foo".replacen("", "Z", 2);
/// assert_eq!(s, "ZfZoo".as_bytes());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn replacen<N: AsRef<[u8]>, R: AsRef<[u8]>>(
&self,
@@ -1520,7 +1627,7 @@ pub trait ByteSlice: Sealed {
/// s.replace_into("", "Z", &mut dest);
/// assert_eq!(dest, "ZfZoZoZ".as_bytes());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn replace_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
&self,
@@ -1584,7 +1691,7 @@ pub trait ByteSlice: Sealed {
/// s.replacen_into("", "Z", 2, &mut dest);
/// assert_eq!(dest, "ZfZoo".as_bytes());
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn replacen_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
&self,
@@ -1800,6 +1907,7 @@ pub trait ByteSlice: Sealed {
/// not necessarily correspond to the length of the `&str` returned!
///
/// ```
+ /// # #[cfg(all(feature = "alloc"))] {
/// use bstr::{ByteSlice, ByteVec};
///
/// let mut bytes = vec![];
@@ -1813,6 +1921,7 @@ pub trait ByteSlice: Sealed {
/// graphemes,
/// vec![(0, 5, "à̖"), (5, 6, "\u{FFFD}"), (6, 14, "🇺🇸")]
/// );
+ /// # }
/// ```
#[cfg(feature = "unicode")]
#[inline]
@@ -2277,7 +2386,7 @@ pub trait ByteSlice: Sealed {
/// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
/// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), s.to_lowercase().as_bytes());
/// ```
- #[cfg(all(feature = "std", feature = "unicode"))]
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_lowercase(&self) -> Vec<u8> {
let mut buf = vec![];
@@ -2339,7 +2448,7 @@ pub trait ByteSlice: Sealed {
/// s.to_lowercase_into(&mut buf);
/// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), buf.as_bytes());
/// ```
- #[cfg(all(feature = "std", feature = "unicode"))]
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_lowercase_into(&self, buf: &mut Vec<u8>) {
// TODO: This is the best we can do given what std exposes I think.
@@ -2394,7 +2503,7 @@ pub trait ByteSlice: Sealed {
/// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
/// assert_eq!(s.to_ascii_lowercase(), B(b"foo\xFFbar\xE2\x98baz"));
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn to_ascii_lowercase(&self) -> Vec<u8> {
self.as_bytes().to_ascii_lowercase()
@@ -2424,11 +2533,13 @@ pub trait ByteSlice: Sealed {
/// Invalid UTF-8 remains as is:
///
/// ```
+ /// # #[cfg(feature = "alloc")] {
/// use bstr::{B, ByteSlice, ByteVec};
///
/// let mut s = <Vec<u8>>::from_slice(b"FOO\xFFBAR\xE2\x98BAZ");
/// s.make_ascii_lowercase();
/// assert_eq!(s, B(b"foo\xFFbar\xE2\x98baz"));
+ /// # }
/// ```
#[inline]
fn make_ascii_lowercase(&mut self) {
@@ -2480,7 +2591,7 @@ pub trait ByteSlice: Sealed {
/// let s = B(b"foo\xFFbar\xE2\x98baz");
/// assert_eq!(s.to_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
/// ```
- #[cfg(all(feature = "std", feature = "unicode"))]
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_uppercase(&self) -> Vec<u8> {
let mut buf = vec![];
@@ -2542,7 +2653,7 @@ pub trait ByteSlice: Sealed {
/// s.to_uppercase_into(&mut buf);
/// assert_eq!(buf, B(b"FOO\xFFBAR\xE2\x98BAZ"));
/// ```
- #[cfg(all(feature = "std", feature = "unicode"))]
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_uppercase_into(&self, buf: &mut Vec<u8>) {
// TODO: This is the best we can do given what std exposes I think.
@@ -2594,7 +2705,7 @@ pub trait ByteSlice: Sealed {
/// let s = B(b"foo\xFFbar\xE2\x98baz");
/// assert_eq!(s.to_ascii_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
/// ```
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
#[inline]
fn to_ascii_uppercase(&self) -> Vec<u8> {
self.as_bytes().to_ascii_uppercase()
@@ -2624,11 +2735,13 @@ pub trait ByteSlice: Sealed {
/// Invalid UTF-8 remains as is:
///
/// ```
+ /// # #[cfg(feature = "alloc")] {
/// use bstr::{B, ByteSlice, ByteVec};
///
/// let mut s = <Vec<u8>>::from_slice(b"foo\xFFbar\xE2\x98baz");
/// s.make_ascii_uppercase();
/// assert_eq!(s, B(b"FOO\xFFBAR\xE2\x98BAZ"));
+ /// # }
/// ```
#[inline]
fn make_ascii_uppercase(&mut self) {
@@ -2900,72 +3013,6 @@ pub trait ByteSlice: Sealed {
Some(index)
}
}
-
- /// Copies elements from one part of the slice to another part of itself,
- /// where the parts may be overlapping.
- ///
- /// `src` is the range within this byte string to copy from, while `dest`
- /// is the starting index of the range within this byte string to copy to.
- /// The length indicated by `src` must be less than or equal to the number
- /// of bytes from `dest` to the end of the byte string.
- ///
- /// # Panics
- ///
- /// Panics if either range is out of bounds, or if `src` is too big to fit
- /// into `dest`, or if the end of `src` is before the start.
- ///
- /// # Examples
- ///
- /// Copying four bytes within a byte string:
- ///
- /// ```
- /// use bstr::{B, ByteSlice};
- ///
- /// let mut buf = *b"Hello, World!";
- /// let s = &mut buf;
- /// s.copy_within_str(1..5, 8);
- /// assert_eq!(s, B("Hello, Wello!"));
- /// ```
- #[inline]
- fn copy_within_str<R>(&mut self, src: R, dest: usize)
- where
- R: ops::RangeBounds<usize>,
- {
- // TODO: Deprecate this once slice::copy_within stabilizes.
- let src_start = match src.start_bound() {
- ops::Bound::Included(&n) => n,
- ops::Bound::Excluded(&n) => {
- n.checked_add(1).expect("attempted to index slice beyond max")
- }
- ops::Bound::Unbounded => 0,
- };
- let src_end = match src.end_bound() {
- ops::Bound::Included(&n) => {
- n.checked_add(1).expect("attempted to index slice beyond max")
- }
- ops::Bound::Excluded(&n) => n,
- ops::Bound::Unbounded => self.as_bytes().len(),
- };
- assert!(src_start <= src_end, "src end is before src start");
- assert!(src_end <= self.as_bytes().len(), "src is out of bounds");
- let count = src_end - src_start;
- assert!(
- dest <= self.as_bytes().len() - count,
- "dest is out of bounds",
- );
-
- // SAFETY: This is safe because we use ptr::copy to handle overlapping
- // copies, and is also safe because we've checked all the bounds above.
- // Finally, we are only dealing with u8 data, which is Copy, which
- // means we can copy without worrying about ownership/destructors.
- unsafe {
- ptr::copy(
- self.as_bytes().get_unchecked(src_start),
- self.as_bytes_mut().get_unchecked_mut(dest),
- count,
- );
- }
- }
}
/// A single substring searcher fixed to a particular needle.
@@ -3138,22 +3185,22 @@ impl<'a> FinderReverse<'a> {
///
/// Matches are reported by the byte offset at which they begin.
///
-/// `'a` is the shorter of two lifetimes: the byte string being searched or the
-/// byte string being looked for.
+/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
+/// needle.
#[derive(Debug)]
-pub struct Find<'a> {
- it: memmem::FindIter<'a, 'a>,
- haystack: &'a [u8],
- needle: &'a [u8],
+pub struct Find<'h, 'n> {
+ it: memmem::FindIter<'h, 'n>,
+ haystack: &'h [u8],
+ needle: &'n [u8],
}
-impl<'a> Find<'a> {
- fn new(haystack: &'a [u8], needle: &'a [u8]) -> Find<'a> {
+impl<'h, 'n> Find<'h, 'n> {
+ fn new(haystack: &'h [u8], needle: &'n [u8]) -> Find<'h, 'n> {
Find { it: memmem::find_iter(haystack, needle), haystack, needle }
}
}
-impl<'a> Iterator for Find<'a> {
+impl<'h, 'n> Iterator for Find<'h, 'n> {
type Item = usize;
#[inline]
@@ -3166,17 +3213,17 @@ impl<'a> Iterator for Find<'a> {
///
/// Matches are reported by the byte offset at which they begin.
///
-/// `'a` is the shorter of two lifetimes: the byte string being searched or the
-/// byte string being looked for.
+/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
+/// needle.
#[derive(Debug)]
-pub struct FindReverse<'a> {
- it: memmem::FindRevIter<'a, 'a>,
- haystack: &'a [u8],
- needle: &'a [u8],
+pub struct FindReverse<'h, 'n> {
+ it: memmem::FindRevIter<'h, 'n>,
+ haystack: &'h [u8],
+ needle: &'n [u8],
}
-impl<'a> FindReverse<'a> {
- fn new(haystack: &'a [u8], needle: &'a [u8]) -> FindReverse<'a> {
+impl<'h, 'n> FindReverse<'h, 'n> {
+ fn new(haystack: &'h [u8], needle: &'n [u8]) -> FindReverse<'h, 'n> {
FindReverse {
it: memmem::rfind_iter(haystack, needle),
haystack,
@@ -3184,16 +3231,16 @@ impl<'a> FindReverse<'a> {
}
}
- fn haystack(&self) -> &'a [u8] {
+ fn haystack(&self) -> &'h [u8] {
self.haystack
}
- fn needle(&self) -> &[u8] {
+ fn needle(&self) -> &'n [u8] {
self.needle
}
}
-impl<'a> Iterator for FindReverse<'a> {
+impl<'h, 'n> Iterator for FindReverse<'h, 'n> {
type Item = usize;
#[inline]
@@ -3215,7 +3262,7 @@ impl<'a> Bytes<'a> {
/// This has the same lifetime as the original slice,
/// and so the iterator can continue to be used while this exists.
#[inline]
- pub fn as_slice(&self) -> &'a [u8] {
+ pub fn as_bytes(&self) -> &'a [u8] {
self.it.as_slice()
}
}
@@ -3252,21 +3299,27 @@ impl<'a> iter::FusedIterator for Bytes<'a> {}
/// An iterator over the fields in a byte string, separated by whitespace.
///
+/// Whitespace for this iterator is defined by the Unicode property
+/// `White_Space`.
+///
/// This iterator splits on contiguous runs of whitespace, such that the fields
/// in `foo\t\t\n \nbar` are `foo` and `bar`.
///
/// `'a` is the lifetime of the byte string being split.
+#[cfg(feature = "unicode")]
#[derive(Debug)]
pub struct Fields<'a> {
it: FieldsWith<'a, fn(char) -> bool>,
}
+#[cfg(feature = "unicode")]
impl<'a> Fields<'a> {
fn new(bytes: &'a [u8]) -> Fields<'a> {
Fields { it: bytes.fields_with(|ch| ch.is_whitespace()) }
}
}
+#[cfg(feature = "unicode")]
impl<'a> Iterator for Fields<'a> {
type Item = &'a [u8];
@@ -3328,10 +3381,11 @@ impl<'a, F: FnMut(char) -> bool> Iterator for FieldsWith<'a, F> {
/// An iterator over substrings in a byte string, split by a separator.
///
-/// `'a` is the lifetime of the byte string being split.
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
#[derive(Debug)]
-pub struct Split<'a> {
- finder: Find<'a>,
+pub struct Split<'h, 's> {
+ finder: Find<'h, 's>,
/// The end position of the previous match of our splitter. The element
/// we yield corresponds to the substring starting at `last` up to the
/// beginning of the next match of the splitter.
@@ -3342,18 +3396,18 @@ pub struct Split<'a> {
done: bool,
}
-impl<'a> Split<'a> {
- fn new(haystack: &'a [u8], splitter: &'a [u8]) -> Split<'a> {
+impl<'h, 's> Split<'h, 's> {
+ fn new(haystack: &'h [u8], splitter: &'s [u8]) -> Split<'h, 's> {
let finder = haystack.find_iter(splitter);
Split { finder, last: 0, done: false }
}
}
-impl<'a> Iterator for Split<'a> {
- type Item = &'a [u8];
+impl<'h, 's> Iterator for Split<'h, 's> {
+ type Item = &'h [u8];
#[inline]
- fn next(&mut self) -> Option<&'a [u8]> {
+ fn next(&mut self) -> Option<&'h [u8]> {
let haystack = self.finder.haystack;
match self.finder.next() {
Some(start) => {
@@ -3383,11 +3437,11 @@ impl<'a> Iterator for Split<'a> {
/// An iterator over substrings in a byte string, split by a separator, in
/// reverse.
///
-/// `'a` is the lifetime of the byte string being split, while `F` is the type
-/// of the predicate, i.e., `FnMut(char) -> bool`.
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
#[derive(Debug)]
-pub struct SplitReverse<'a> {
- finder: FindReverse<'a>,
+pub struct SplitReverse<'h, 's> {
+ finder: FindReverse<'h, 's>,
/// The end position of the previous match of our splitter. The element
/// we yield corresponds to the substring starting at `last` up to the
/// beginning of the next match of the splitter.
@@ -3398,18 +3452,18 @@ pub struct SplitReverse<'a> {
done: bool,
}
-impl<'a> SplitReverse<'a> {
- fn new(haystack: &'a [u8], splitter: &'a [u8]) -> SplitReverse<'a> {
+impl<'h, 's> SplitReverse<'h, 's> {
+ fn new(haystack: &'h [u8], splitter: &'s [u8]) -> SplitReverse<'h, 's> {
let finder = haystack.rfind_iter(splitter);
SplitReverse { finder, last: haystack.len(), done: false }
}
}
-impl<'a> Iterator for SplitReverse<'a> {
- type Item = &'a [u8];
+impl<'h, 's> Iterator for SplitReverse<'h, 's> {
+ type Item = &'h [u8];
#[inline]
- fn next(&mut self) -> Option<&'a [u8]> {
+ fn next(&mut self) -> Option<&'h [u8]> {
let haystack = self.finder.haystack();
match self.finder.next() {
Some(start) => {
@@ -3440,31 +3494,31 @@ impl<'a> Iterator for SplitReverse<'a> {
/// An iterator over at most `n` substrings in a byte string, split by a
/// separator.
///
-/// `'a` is the lifetime of the byte string being split, while `F` is the type
-/// of the predicate, i.e., `FnMut(char) -> bool`.
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
#[derive(Debug)]
-pub struct SplitN<'a> {
- split: Split<'a>,
+pub struct SplitN<'h, 's> {
+ split: Split<'h, 's>,
limit: usize,
count: usize,
}
-impl<'a> SplitN<'a> {
+impl<'h, 's> SplitN<'h, 's> {
fn new(
- haystack: &'a [u8],
- splitter: &'a [u8],
+ haystack: &'h [u8],
+ splitter: &'s [u8],
limit: usize,
- ) -> SplitN<'a> {
+ ) -> SplitN<'h, 's> {
let split = haystack.split_str(splitter);
SplitN { split, limit, count: 0 }
}
}
-impl<'a> Iterator for SplitN<'a> {
- type Item = &'a [u8];
+impl<'h, 's> Iterator for SplitN<'h, 's> {
+ type Item = &'h [u8];
#[inline]
- fn next(&mut self) -> Option<&'a [u8]> {
+ fn next(&mut self) -> Option<&'h [u8]> {
self.count += 1;
if self.count > self.limit || self.split.done {
None
@@ -3479,31 +3533,31 @@ impl<'a> Iterator for SplitN<'a> {
/// An iterator over at most `n` substrings in a byte string, split by a
/// separator, in reverse.
///
-/// `'a` is the lifetime of the byte string being split, while `F` is the type
-/// of the predicate, i.e., `FnMut(char) -> bool`.
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
#[derive(Debug)]
-pub struct SplitNReverse<'a> {
- split: SplitReverse<'a>,
+pub struct SplitNReverse<'h, 's> {
+ split: SplitReverse<'h, 's>,
limit: usize,
count: usize,
}
-impl<'a> SplitNReverse<'a> {
+impl<'h, 's> SplitNReverse<'h, 's> {
fn new(
- haystack: &'a [u8],
- splitter: &'a [u8],
+ haystack: &'h [u8],
+ splitter: &'s [u8],
limit: usize,
- ) -> SplitNReverse<'a> {
+ ) -> SplitNReverse<'h, 's> {
let split = haystack.rsplit_str(splitter);
SplitNReverse { split, limit, count: 0 }
}
}
-impl<'a> Iterator for SplitNReverse<'a> {
- type Item = &'a [u8];
+impl<'h, 's> Iterator for SplitNReverse<'h, 's> {
+ type Item = &'h [u8];
#[inline]
- fn next(&mut self) -> Option<&'a [u8]> {
+ fn next(&mut self) -> Option<&'h [u8]> {
self.count += 1;
if self.count > self.limit || self.split.done {
None
@@ -3521,6 +3575,7 @@ impl<'a> Iterator for SplitNReverse<'a> {
/// `\n`.
///
/// `'a` is the lifetime of the byte string being iterated over.
+#[derive(Clone, Debug)]
pub struct Lines<'a> {
it: LinesWithTerminator<'a>,
}
@@ -3529,6 +3584,28 @@ impl<'a> Lines<'a> {
fn new(bytes: &'a [u8]) -> Lines<'a> {
Lines { it: LinesWithTerminator::new(bytes) }
}
+
+ /// Return a copy of the rest of the underlying bytes without affecting the
+ /// iterator itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = b"\
+ /// foo
+ /// bar\r
+ /// baz";
+ /// let mut lines = s.lines();
+ /// assert_eq!(lines.next(), Some(B("foo")));
+ /// assert_eq!(lines.as_bytes(), B("bar\r\nbaz"));
+ /// ```
+ pub fn as_bytes(&self) -> &'a [u8] {
+ self.it.bytes
+ }
}
impl<'a> Iterator for Lines<'a> {
@@ -3536,17 +3613,19 @@ impl<'a> Iterator for Lines<'a> {
#[inline]
fn next(&mut self) -> Option<&'a [u8]> {
- let mut line = self.it.next()?;
- if line.last_byte() == Some(b'\n') {
- line = &line[..line.len() - 1];
- if line.last_byte() == Some(b'\r') {
- line = &line[..line.len() - 1];
- }
- }
- Some(line)
+ Some(trim_last_terminator(self.it.next()?))
+ }
+}
+
+impl<'a> DoubleEndedIterator for Lines<'a> {
+ #[inline]
+ fn next_back(&mut self) -> Option<Self::Item> {
+ Some(trim_last_terminator(self.it.next_back()?))
}
}
+impl<'a> iter::FusedIterator for Lines<'a> {}
+
/// An iterator over all lines in a byte string, including their terminators.
///
/// For this iterator, the only line terminator recognized is `\n`. (Since
@@ -3560,6 +3639,7 @@ impl<'a> Iterator for Lines<'a> {
/// the original byte string.
///
/// `'a` is the lifetime of the byte string being iterated over.
+#[derive(Clone, Debug)]
pub struct LinesWithTerminator<'a> {
bytes: &'a [u8],
}
@@ -3568,6 +3648,28 @@ impl<'a> LinesWithTerminator<'a> {
fn new(bytes: &'a [u8]) -> LinesWithTerminator<'a> {
LinesWithTerminator { bytes }
}
+
+ /// Return a copy of the rest of the underlying bytes without affecting the
+ /// iterator itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = b"\
+ /// foo
+ /// bar\r
+ /// baz";
+ /// let mut lines = s.lines_with_terminator();
+ /// assert_eq!(lines.next(), Some(B("foo\n")));
+ /// assert_eq!(lines.as_bytes(), B("bar\r\nbaz"));
+ /// ```
+ pub fn as_bytes(&self) -> &'a [u8] {
+ self.bytes
+ }
}
impl<'a> Iterator for LinesWithTerminator<'a> {
@@ -3591,10 +3693,43 @@ impl<'a> Iterator for LinesWithTerminator<'a> {
}
}
-#[cfg(test)]
+impl<'a> DoubleEndedIterator for LinesWithTerminator<'a> {
+ #[inline]
+ fn next_back(&mut self) -> Option<Self::Item> {
+ let end = self.bytes.len().checked_sub(1)?;
+ match self.bytes[..end].rfind_byte(b'\n') {
+ None => {
+ let line = self.bytes;
+ self.bytes = b"";
+ Some(line)
+ }
+ Some(end) => {
+ let line = &self.bytes[end + 1..];
+ self.bytes = &self.bytes[..end + 1];
+ Some(line)
+ }
+ }
+ }
+}
+
+impl<'a> iter::FusedIterator for LinesWithTerminator<'a> {}
+
+fn trim_last_terminator(mut s: &[u8]) -> &[u8] {
+ if s.last_byte() == Some(b'\n') {
+ s = &s[..s.len() - 1];
+ if s.last_byte() == Some(b'\r') {
+ s = &s[..s.len() - 1];
+ }
+ }
+ s
+}
+
+#[cfg(all(test, feature = "std"))]
mod tests {
- use crate::ext_slice::{ByteSlice, B};
- use crate::tests::LOSSY_TESTS;
+ use crate::{
+ ext_slice::{ByteSlice, Lines, LinesWithTerminator, B},
+ tests::LOSSY_TESTS,
+ };
#[test]
fn to_str_lossy() {
@@ -3622,34 +3757,55 @@ mod tests {
}
#[test]
- #[should_panic]
- fn copy_within_fail1() {
- let mut buf = *b"foobar";
- let s = &mut buf;
- s.copy_within_str(0..2, 5);
- }
+ fn lines_iteration() {
+ macro_rules! t {
+ ($it:expr, $forward:expr) => {
+ let mut res: Vec<&[u8]> = Vec::from($forward);
+ assert_eq!($it.collect::<Vec<_>>(), res);
+ res.reverse();
+ assert_eq!($it.rev().collect::<Vec<_>>(), res);
+ };
+ }
- #[test]
- #[should_panic]
- fn copy_within_fail2() {
- let mut buf = *b"foobar";
- let s = &mut buf;
- s.copy_within_str(3..2, 0);
- }
+ t!(Lines::new(b""), []);
+ t!(LinesWithTerminator::new(b""), []);
- #[test]
- #[should_panic]
- fn copy_within_fail3() {
- let mut buf = *b"foobar";
- let s = &mut buf;
- s.copy_within_str(5..7, 0);
- }
+ t!(Lines::new(b"\n"), [B("")]);
+ t!(Lines::new(b"\r\n"), [B("")]);
+ t!(LinesWithTerminator::new(b"\n"), [B("\n")]);
- #[test]
- #[should_panic]
- fn copy_within_fail4() {
- let mut buf = *b"foobar";
- let s = &mut buf;
- s.copy_within_str(0..1, 6);
+ t!(Lines::new(b"a"), [B("a")]);
+ t!(LinesWithTerminator::new(b"a"), [B("a")]);
+
+ t!(Lines::new(b"abc"), [B("abc")]);
+ t!(LinesWithTerminator::new(b"abc"), [B("abc")]);
+
+ t!(Lines::new(b"abc\n"), [B("abc")]);
+ t!(Lines::new(b"abc\r\n"), [B("abc")]);
+ t!(LinesWithTerminator::new(b"abc\n"), [B("abc\n")]);
+
+ t!(Lines::new(b"abc\n\n"), [B("abc"), B("")]);
+ t!(LinesWithTerminator::new(b"abc\n\n"), [B("abc\n"), B("\n")]);
+
+ t!(Lines::new(b"abc\n\ndef"), [B("abc"), B(""), B("def")]);
+ t!(
+ LinesWithTerminator::new(b"abc\n\ndef"),
+ [B("abc\n"), B("\n"), B("def")]
+ );
+
+ t!(Lines::new(b"abc\n\ndef\n"), [B("abc"), B(""), B("def")]);
+ t!(
+ LinesWithTerminator::new(b"abc\n\ndef\n"),
+ [B("abc\n"), B("\n"), B("def\n")]
+ );
+
+ t!(Lines::new(b"\na\nb\n"), [B(""), B("a"), B("b")]);
+ t!(
+ LinesWithTerminator::new(b"\na\nb\n"),
+ [B("\n"), B("a\n"), B("b\n")]
+ );
+
+ t!(Lines::new(b"\n\n\n"), [B(""), B(""), B("")]);
+ t!(LinesWithTerminator::new(b"\n\n\n"), [B("\n"), B("\n"), B("\n")]);
}
}
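
One theme of the `ext_slice.rs` changes is splitting the old unified `'a` lifetime into `'h` (haystack) and `'n`/`'s` (needle or splitter). A small usage sketch, not taken from the patch, of what that buys: the subslices yielded by `split_str` now borrow only the haystack, so they may outlive a temporary splitter, which would not compile under the old unified lifetime:

```
use bstr::ByteSlice;

// Returns subslices of `haystack` even though the splitter is local: the
// items of `Split<'h, 's>` carry only the haystack lifetime `'h`.
fn fields_of(haystack: &[u8]) -> Vec<&[u8]> {
    let splitter = String::from(","); // short-lived splitter
    haystack.split_str(&splitter).collect()
}

fn main() {
    assert_eq!(fields_of(b"a,b,c"), vec![&b"a"[..], &b"b"[..], &b"c"[..]]);
}
```
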
diff --git a/src/ext_vec.rs b/src/ext_vec.rs
index 5beb0e1..5effdd0 100644
--- a/src/ext_vec.rs
+++ b/src/ext_vec.rs
@@ -1,16 +1,21 @@
-use std::borrow::Cow;
-use std::error;
-use std::ffi::{OsStr, OsString};
-use std::fmt;
-use std::iter;
-use std::ops;
-use std::path::{Path, PathBuf};
-use std::ptr;
-use std::str;
-use std::vec;
-
-use crate::ext_slice::ByteSlice;
-use crate::utf8::{self, Utf8Error};
+use core::fmt;
+use core::iter;
+use core::ops;
+use core::ptr;
+
+use alloc::{borrow::Cow, string::String, vec, vec::Vec};
+
+#[cfg(feature = "std")]
+use std::{
+ error,
+ ffi::{OsStr, OsString},
+ path::{Path, PathBuf},
+};
+
+use crate::{
+ ext_slice::ByteSlice,
+ utf8::{self, Utf8Error},
+};
/// Concatenate the elements given by the iterator together into a single
/// `Vec<u8>`.
@@ -154,8 +159,9 @@ pub trait ByteVec: Sealed {
/// Create a new byte string from an owned OS string.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns the original OS string if it is not valid UTF-8.
+ /// When the underlying bytes of OS strings are accessible, then this
+ /// always succeeds and is zero cost. Otherwise, this returns the given
+ /// `OsString` if it is not valid UTF-8.
///
/// # Examples
///
@@ -171,6 +177,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(bs, B("foo"));
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn from_os_string(os_str: OsString) -> Result<Vec<u8>, OsString> {
#[cfg(unix)]
#[inline]
@@ -191,10 +198,11 @@ pub trait ByteVec: Sealed {
/// Lossily create a new byte string from an OS string slice.
///
- /// On Unix, this always succeeds, is zero cost and always returns a slice.
- /// On non-Unix systems, this does a UTF-8 check. If the given OS string
- /// slice is not valid UTF-8, then it is lossily decoded into valid UTF-8
- /// (with invalid bytes replaced by the Unicode replacement codepoint).
+ /// When the underlying bytes of OS strings are accessible, then this is
+ /// zero cost and always returns a slice. Otherwise, a UTF-8 check is
+ /// performed and if the given OS string is not valid UTF-8, then it is
+ /// lossily decoded into valid UTF-8 (with invalid bytes replaced by the
+ /// Unicode replacement codepoint).
///
/// # Examples
///
@@ -210,6 +218,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(bs, B("foo"));
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn from_os_str_lossy<'a>(os_str: &'a OsStr) -> Cow<'a, [u8]> {
#[cfg(unix)]
#[inline]
@@ -233,8 +242,9 @@ pub trait ByteVec: Sealed {
/// Create a new byte string from an owned file path.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns the original path if it is not valid UTF-8.
+ /// When the underlying bytes of paths are accessible, then this always
+ /// succeeds and is zero cost. Otherwise, this returns the given `PathBuf`
+ /// if it is not valid UTF-8.
///
/// # Examples
///
@@ -250,16 +260,18 @@ pub trait ByteVec: Sealed {
/// assert_eq!(bs, B("foo"));
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn from_path_buf(path: PathBuf) -> Result<Vec<u8>, PathBuf> {
Vec::from_os_string(path.into_os_string()).map_err(PathBuf::from)
}
/// Lossily create a new byte string from a file path.
///
- /// On Unix, this always succeeds, is zero cost and always returns a slice.
- /// On non-Unix systems, this does a UTF-8 check. If the given path is not
- /// valid UTF-8, then it is lossily decoded into valid UTF-8 (with invalid
- /// bytes replaced by the Unicode replacement codepoint).
+ /// When the underlying bytes of paths are accessible, then this is
+ /// zero cost and always returns a slice. Otherwise, a UTF-8 check is
+ /// performed and if the given path is not valid UTF-8, then it is lossily
+ /// decoded into valid UTF-8 (with invalid bytes replaced by the Unicode
+ /// replacement codepoint).
///
/// # Examples
///
@@ -275,6 +287,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(bs, B("foo"));
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn from_path_lossy<'a>(path: &'a Path) -> Cow<'a, [u8]> {
Vec::from_os_str_lossy(path.as_os_str())
}
@@ -363,12 +376,10 @@ pub trait ByteVec: Sealed {
/// ```
/// use bstr::ByteVec;
///
- /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let bytes = Vec::from("hello");
- /// let string = bytes.into_string()?;
+ /// let string = bytes.into_string().unwrap();
///
/// assert_eq!("hello", string);
- /// # Ok(()) }; example().unwrap()
/// ```
///
/// If this byte string is not valid UTF-8, then an error will be returned.
@@ -469,8 +480,9 @@ pub trait ByteVec: Sealed {
/// Converts this byte string into an OS string, in place.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns the original byte string if it is not valid UTF-8.
+ /// When OS strings can be constructed from arbitrary byte sequences, this
+ /// always succeeds and is zero cost. Otherwise, if this byte string is not
+ /// valid UTF-8, then an error (with the original byte string) is returned.
///
/// # Examples
///
@@ -485,14 +497,15 @@ pub trait ByteVec: Sealed {
/// let os_str = bs.into_os_string().expect("should be valid UTF-8");
/// assert_eq!(os_str, OsStr::new("foo"));
/// ```
+ #[cfg(feature = "std")]
#[inline]
- fn into_os_string(self) -> Result<OsString, Vec<u8>>
+ fn into_os_string(self) -> Result<OsString, FromUtf8Error>
where
Self: Sized,
{
#[cfg(unix)]
#[inline]
- fn imp(v: Vec<u8>) -> Result<OsString, Vec<u8>> {
+ fn imp(v: Vec<u8>) -> Result<OsString, FromUtf8Error> {
use std::os::unix::ffi::OsStringExt;
Ok(OsString::from_vec(v))
@@ -500,11 +513,8 @@ pub trait ByteVec: Sealed {
#[cfg(not(unix))]
#[inline]
- fn imp(v: Vec<u8>) -> Result<OsString, Vec<u8>> {
- match v.into_string() {
- Ok(s) => Ok(OsString::from(s)),
- Err(err) => Err(err.into_vec()),
- }
+ fn imp(v: Vec<u8>) -> Result<OsString, FromUtf8Error> {
+ v.into_string().map(OsString::from)
}
imp(self.into_vec())
@@ -512,13 +522,13 @@ pub trait ByteVec: Sealed {
/// Lossily converts this byte string into an OS string, in place.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this will perform a UTF-8 check and lossily convert this byte string
- /// into valid UTF-8 using the Unicode replacement codepoint.
+ /// When OS strings can be constructed from arbitrary byte sequences, this
+ /// is zero cost and always returns a slice. Otherwise, this will perform a
+ /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
+ /// the Unicode replacement codepoint.
///
- /// Note that this can prevent the correct roundtripping of file paths on
- /// non-Unix systems such as Windows, where file paths are an arbitrary
- /// sequence of 16-bit integers.
+ /// Note that this can prevent the correct roundtripping of file paths when
+ /// the representation of `OsString` is opaque.
///
/// # Examples
///
@@ -532,6 +542,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(os_str.to_string_lossy(), "foo\u{FFFD}bar");
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn into_os_string_lossy(self) -> OsString
where
Self: Sized,
@@ -555,8 +566,9 @@ pub trait ByteVec: Sealed {
/// Converts this byte string into an owned file path, in place.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this returns the original byte string if it is not valid UTF-8.
+ /// When paths can be constructed from arbitrary byte sequences, this
+ /// always succeeds and is zero cost. Otherwise, if this byte string is not
+ /// valid UTF-8, then an error (with the original byte string) is returned.
///
/// # Examples
///
@@ -569,8 +581,9 @@ pub trait ByteVec: Sealed {
/// let path = bs.into_path_buf().expect("should be valid UTF-8");
/// assert_eq!(path.as_os_str(), "foo");
/// ```
+ #[cfg(feature = "std")]
#[inline]
- fn into_path_buf(self) -> Result<PathBuf, Vec<u8>>
+ fn into_path_buf(self) -> Result<PathBuf, FromUtf8Error>
where
Self: Sized,
{
@@ -579,13 +592,13 @@ pub trait ByteVec: Sealed {
/// Lossily converts this byte string into an owned file path, in place.
///
- /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
- /// this will perform a UTF-8 check and lossily convert this byte string
- /// into valid UTF-8 using the Unicode replacement codepoint.
+ /// When paths can be constructed from arbitrary byte sequences, this is
+ /// zero cost and always succeeds. Otherwise, this will perform a
+ /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
+ /// the Unicode replacement codepoint.
///
- /// Note that this can prevent the correct roundtripping of file paths on
- /// non-Unix systems such as Windows, where file paths are an arbitrary
- /// sequence of 16-bit integers.
+ /// Note that this can prevent the correct roundtripping of file paths when
+ /// the representation of `PathBuf` is opaque.
///
/// # Examples
///
@@ -599,6 +612,7 @@ pub trait ByteVec: Sealed {
/// assert_eq!(path.to_string_lossy(), "foo\u{FFFD}bar");
/// ```
#[inline]
+ #[cfg(feature = "std")]
fn into_path_buf_lossy(self) -> PathBuf
where
Self: Sized,
@@ -1029,6 +1043,7 @@ impl FromUtf8Error {
}
}
+#[cfg(feature = "std")]
impl error::Error for FromUtf8Error {
#[inline]
fn description(&self) -> &str {
@@ -1043,7 +1058,7 @@ impl fmt::Display for FromUtf8Error {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
use crate::ext_vec::ByteVec;
diff --git a/src/impls.rs b/src/impls.rs
index 85a27ba..669aee6 100644
--- a/src/impls.rs
+++ b/src/impls.rs
@@ -18,7 +18,7 @@ macro_rules! impl_partial_eq {
};
}
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
macro_rules! impl_partial_eq_cow {
($lhs:ty, $rhs:ty) => {
impl<'a, 'b> PartialEq<$rhs> for $lhs {
@@ -59,17 +59,22 @@ macro_rules! impl_partial_ord {
};
}
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
mod bstring {
- use std::borrow::{Borrow, Cow, ToOwned};
- use std::cmp::Ordering;
- use std::fmt;
- use std::iter::FromIterator;
- use std::ops;
+ use core::{
+ cmp::Ordering, convert::TryFrom, fmt, iter::FromIterator, ops,
+ };
- use crate::bstr::BStr;
- use crate::bstring::BString;
- use crate::ext_vec::ByteVec;
+ use alloc::{
+ borrow::{Borrow, Cow, ToOwned},
+ string::String,
+ vec,
+ vec::Vec,
+ };
+
+ use crate::{
+ bstr::BStr, bstring::BString, ext_slice::ByteSlice, ext_vec::ByteVec,
+ };
impl fmt::Display for BString {
#[inline]
@@ -90,21 +95,21 @@ mod bstring {
#[inline]
fn deref(&self) -> &Vec<u8> {
- &self.bytes
+ self.as_vec()
}
}
impl ops::DerefMut for BString {
#[inline]
fn deref_mut(&mut self) -> &mut Vec<u8> {
- &mut self.bytes
+ self.as_vec_mut()
}
}
impl AsRef<[u8]> for BString {
#[inline]
fn as_ref(&self) -> &[u8] {
- &self.bytes
+ self.as_bytes()
}
}
@@ -118,7 +123,7 @@ mod bstring {
impl AsMut<[u8]> for BString {
#[inline]
fn as_mut(&mut self) -> &mut [u8] {
- &mut self.bytes
+ self.as_bytes_mut()
}
}
@@ -161,14 +166,14 @@ mod bstring {
impl From<Vec<u8>> for BString {
#[inline]
fn from(s: Vec<u8>) -> BString {
- BString { bytes: s }
+ BString::new(s)
}
}
impl From<BString> for Vec<u8> {
#[inline]
fn from(s: BString) -> Vec<u8> {
- s.bytes
+ s.into_vec()
}
}
@@ -200,6 +205,24 @@ mod bstring {
}
}
+ impl TryFrom<BString> for String {
+ type Error = crate::FromUtf8Error;
+
+ #[inline]
+ fn try_from(s: BString) -> Result<String, crate::FromUtf8Error> {
+ s.into_vec().into_string()
+ }
+ }
+
+ impl<'a> TryFrom<&'a BString> for &'a str {
+ type Error = crate::Utf8Error;
+
+ #[inline]
+ fn try_from(s: &'a BString) -> Result<&'a str, crate::Utf8Error> {
+ s.as_bytes().to_str()
+ }
+ }
+
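A minimal sketch of how the new fallible `BString` conversions might be used, assuming the `alloc` feature is enabled:

```
use core::convert::TryFrom;

use bstr::BString;

let bs = BString::from("foo");
// The borrowing conversion requires valid UTF-8 but does not consume `bs`.
let s = <&str>::try_from(&bs).expect("valid UTF-8");
assert_eq!(s, "foo");

// The owned conversion consumes the BString; invalid UTF-8 yields an error
// that still owns the original bytes.
assert!(String::try_from(bs).is_ok());
assert!(String::try_from(BString::from(&b"\xFFoo"[..])).is_err());
```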
impl FromIterator<char> for BString {
#[inline]
fn from_iter<T: IntoIterator<Item = char>>(iter: T) -> BString {
@@ -279,7 +302,7 @@ mod bstring {
impl PartialOrd for BString {
#[inline]
fn partial_cmp(&self, other: &BString) -> Option<Ordering> {
- PartialOrd::partial_cmp(&self.bytes, &other.bytes)
+ PartialOrd::partial_cmp(self.as_bytes(), other.as_bytes())
}
}
@@ -301,15 +324,12 @@ mod bstring {
}
mod bstr {
- #[cfg(feature = "std")]
- use std::borrow::Cow;
+ use core::{cmp::Ordering, convert::TryFrom, fmt, ops};
- use core::cmp::Ordering;
- use core::fmt;
- use core::ops;
+ #[cfg(feature = "alloc")]
+ use alloc::{borrow::Cow, boxed::Box, string::String, vec::Vec};
- use crate::bstr::BStr;
- use crate::ext_slice::ByteSlice;
+ use crate::{bstr::BStr, ext_slice::ByteSlice};
impl fmt::Display for BStr {
#[inline]
@@ -590,6 +610,13 @@ mod bstr {
}
}
+ impl<'a> From<&'a BStr> for &'a [u8] {
+ #[inline]
+ fn from(s: &'a BStr) -> &'a [u8] {
+ BStr::as_bytes(s)
+ }
+ }
+
impl<'a> From<&'a str> for &'a BStr {
#[inline]
fn from(s: &'a str) -> &'a BStr {
@@ -597,7 +624,7 @@ mod bstr {
}
}
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl<'a> From<&'a BStr> for Cow<'a, BStr> {
#[inline]
fn from(s: &'a BStr) -> Cow<'a, BStr> {
@@ -605,7 +632,7 @@ mod bstr {
}
}
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl From<Box<[u8]>> for Box<BStr> {
#[inline]
fn from(s: Box<[u8]>) -> Box<BStr> {
@@ -613,7 +640,7 @@ mod bstr {
}
}
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl From<Box<BStr>> for Box<[u8]> {
#[inline]
fn from(s: Box<BStr>) -> Box<[u8]> {
@@ -621,6 +648,25 @@ mod bstr {
}
}
+ impl<'a> TryFrom<&'a BStr> for &'a str {
+ type Error = crate::Utf8Error;
+
+ #[inline]
+ fn try_from(s: &'a BStr) -> Result<&'a str, crate::Utf8Error> {
+ s.as_bytes().to_str()
+ }
+ }
+
+ #[cfg(feature = "alloc")]
+ impl<'a> TryFrom<&'a BStr> for String {
+ type Error = crate::Utf8Error;
+
+ #[inline]
+ fn try_from(s: &'a BStr) -> Result<String, crate::Utf8Error> {
+ Ok(s.as_bytes().to_str()?.into())
+ }
+ }
+
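Likewise, a sketch of the new `&BStr` conversions; the final line assumes the `alloc` feature:

```
use core::convert::TryFrom;

use bstr::{BStr, ByteSlice};

let bs: &BStr = b"foo".as_bstr();

// Infallible: &BStr -> &[u8].
let bytes = <&[u8]>::from(bs);
assert_eq!(bytes, b"foo");

// Fallible: the UTF-8 conversions mirror `to_str`.
assert_eq!(<&str>::try_from(bs).unwrap(), "foo");
assert!(<&str>::try_from(b"\xFFoo".as_bstr()).is_err());
assert_eq!(String::try_from(bs).unwrap(), "foo");
```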
impl Eq for BStr {}
impl PartialEq<BStr> for BStr {
@@ -635,19 +681,19 @@ mod bstr {
impl_partial_eq!(BStr, str);
impl_partial_eq!(BStr, &'a str);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq!(BStr, Vec<u8>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq!(&'a BStr, Vec<u8>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq!(BStr, String);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq!(&'a BStr, String);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq_cow!(&'a BStr, Cow<'a, BStr>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq_cow!(&'a BStr, Cow<'a, str>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_eq_cow!(&'a BStr, Cow<'a, [u8]>);
impl PartialOrd for BStr {
@@ -669,17 +715,17 @@ mod bstr {
impl_partial_ord!(BStr, str);
impl_partial_ord!(BStr, &'a str);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_ord!(BStr, Vec<u8>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_ord!(&'a BStr, Vec<u8>);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_ord!(BStr, String);
- #[cfg(feature = "std")]
+ #[cfg(feature = "alloc")]
impl_partial_ord!(&'a BStr, String);
}
-#[cfg(feature = "serde1-nostd")]
+#[cfg(feature = "serde")]
mod bstr_serde {
use core::fmt;
@@ -737,10 +783,11 @@ mod bstr_serde {
}
}
-#[cfg(feature = "serde1")]
+#[cfg(all(feature = "serde", feature = "alloc"))]
mod bstring_serde {
- use std::cmp;
- use std::fmt;
+ use core::{cmp, fmt};
+
+ use alloc::{string::String, vec::Vec};
use serde::{
de::Error, de::SeqAccess, de::Visitor, Deserialize, Deserializer,
@@ -825,8 +872,9 @@ mod bstring_serde {
}
}
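As an illustrative sketch only: exercising the renamed `serde` feature, assuming `serde` and `alloc` are enabled and using `serde_json` purely as an example consumer (it is not a dependency of this crate):

```
use bstr::BString;

let bs = BString::from("foo");
// BString serializes as a byte sequence, which serde_json renders as an
// array of numbers rather than a string.
let json = serde_json::to_string(&bs).unwrap();
assert_eq!(json, "[102,111,111]");

// Deserialization accepts the same representation.
let back: BString = serde_json::from_str(&json).unwrap();
assert_eq!(back, bs);
```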
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod display {
+ #[cfg(not(miri))]
use crate::bstring::BString;
use crate::ByteSlice;
@@ -926,6 +974,7 @@ mod display {
);
}
+ #[cfg(not(miri))]
quickcheck::quickcheck! {
fn total_length(bstr: BString) -> bool {
let size = bstr.chars().count();
@@ -934,7 +983,7 @@ mod display {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "alloc"))]
mod bstring_arbitrary {
use crate::bstring::BString;
@@ -946,12 +995,13 @@ mod bstring_arbitrary {
}
fn shrink(&self) -> Box<dyn Iterator<Item = BString>> {
- Box::new(self.bytes.shrink().map(BString::from))
+ Box::new(self.as_vec().shrink().map(BString::from))
}
}
}
#[test]
+#[cfg(feature = "std")]
fn test_debug() {
use crate::{ByteSlice, B};
@@ -973,10 +1023,12 @@ fn test_debug() {
// See: https://github.com/BurntSushi/bstr/issues/82
#[test]
+#[cfg(feature = "std")]
fn test_cows_regression() {
- use crate::ByteSlice;
use std::borrow::Cow;
+ use crate::ByteSlice;
+
let c1 = Cow::from(b"hello bstr".as_bstr());
let c2 = b"goodbye bstr".as_bstr();
assert_ne!(c1, c2);
diff --git a/src/io.rs b/src/io.rs
index ad6f3c1..1386bf3 100644
--- a/src/io.rs
+++ b/src/io.rs
@@ -7,10 +7,11 @@ facilities for conveniently and efficiently working with lines as byte strings.
More APIs may be added in the future.
*/
+use alloc::{vec, vec::Vec};
+
use std::io;
-use crate::ext_slice::ByteSlice;
-use crate::ext_vec::ByteVec;
+use crate::{ext_slice::ByteSlice, ext_vec::ByteVec};
/// An extension trait for
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
@@ -36,7 +37,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
///
/// let mut lines = vec![];
/// for result in cursor.byte_lines() {
@@ -79,7 +80,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
///
/// let mut records = vec![];
/// for result in cursor.byte_records(b'\x00') {
@@ -122,7 +123,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
///
/// let mut lines = vec![];
/// cursor.for_byte_line(|line| {
@@ -135,7 +136,7 @@ pub trait BufReadExt: io::BufRead {
/// assert_eq!(lines[2], "dolor".as_bytes());
/// # Ok(()) }; example().unwrap()
/// ```
- fn for_byte_line<F>(self, mut for_each_line: F) -> io::Result<()>
+ fn for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()>
where
Self: Sized,
F: FnMut(&[u8]) -> io::Result<bool>,
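Because the receiver is now `&mut self` rather than `self`, the reader is no longer consumed by these methods; a minimal sketch (assuming the default `std` feature):

```
use std::io;

use bstr::io::BufReadExt;

let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");

let mut count = 0;
cursor
    .for_byte_line(|_line| {
        count += 1;
        Ok(true)
    })
    .unwrap();
assert_eq!(count, 3);

// The reader was only mutably borrowed, so it can be rewound and reused.
cursor.set_position(0);
let lines: Vec<Vec<u8>> =
    cursor.byte_lines().collect::<Result<_, _>>().unwrap();
assert_eq!(lines.len(), 3);
```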
@@ -169,7 +170,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
///
/// let mut records = vec![];
/// cursor.for_byte_record(b'\x00', |record| {
@@ -183,7 +184,7 @@ pub trait BufReadExt: io::BufRead {
/// # Ok(()) }; example().unwrap()
/// ```
fn for_byte_record<F>(
- self,
+ &mut self,
terminator: u8,
mut for_each_record: F,
) -> io::Result<()>
@@ -223,7 +224,7 @@ pub trait BufReadExt: io::BufRead {
/// use bstr::io::BufReadExt;
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
///
/// let mut lines = vec![];
/// cursor.for_byte_line_with_terminator(|line| {
@@ -237,7 +238,7 @@ pub trait BufReadExt: io::BufRead {
/// # Ok(()) }; example().unwrap()
/// ```
fn for_byte_line_with_terminator<F>(
- self,
+ &mut self,
for_each_line: F,
) -> io::Result<()>
where
@@ -269,11 +270,10 @@ pub trait BufReadExt: io::BufRead {
/// ```
/// use std::io;
///
- /// use bstr::B;
- /// use bstr::io::BufReadExt;
+ /// use bstr::{io::BufReadExt, B};
///
/// # fn example() -> Result<(), io::Error> {
- /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
+ /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
///
/// let mut records = vec![];
/// cursor.for_byte_record_with_terminator(b'\x00', |record| {
@@ -287,7 +287,7 @@ pub trait BufReadExt: io::BufRead {
/// # Ok(()) }; example().unwrap()
/// ```
fn for_byte_record_with_terminator<F>(
- mut self,
+ &mut self,
terminator: u8,
mut for_each_record: F,
) -> io::Result<()>
@@ -438,11 +438,12 @@ fn trim_record_slice(mut record: &[u8], terminator: u8) -> &[u8] {
record
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
- use super::BufReadExt;
use crate::bstring::BString;
+ use super::BufReadExt;
+
fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
let mut lines = vec![];
slice
diff --git a/src/lib.rs b/src/lib.rs
index 41142c9..09e17b0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -52,23 +52,27 @@ Here's another example showing how to do a search and replace (and also showing
use of the `B` function):
```
+# #[cfg(feature = "alloc")] {
use bstr::{B, ByteSlice};
let old = B("foo ☃☃☃ foo foo quux foo");
let new = old.replace("foo", "hello");
assert_eq!(new, B("hello ☃☃☃ hello hello quux hello"));
+# }
```
And here's an example that shows case conversion, even in the presence of
invalid UTF-8:
```
+# #[cfg(all(feature = "alloc", feature = "unicode"))] {
use bstr::{ByteSlice, ByteVec};
let mut lower = Vec::from("hello β");
lower[0] = b'\xFF';
// lowercase β is uppercased to Β
assert_eq!(lower.to_uppercase(), b"\xFFELLO \xCE\x92");
+# }
```
# Convenient debug representation
@@ -98,10 +102,8 @@ method converts any `&[u8]` to a `&BStr`.
# When should I use byte strings?
-This library reflects my hypothesis that UTF-8 by convention is a better trade
-off in some circumstances than guaranteed UTF-8. It's possible, perhaps even
-likely, that this is a niche concern for folks working closely with core text
-primitives.
+This library reflects my belief that UTF-8 by convention is a better trade
+off in some circumstances than guaranteed UTF-8.
The first time this idea hit me was in the implementation of Rust's regex
engine. In particular, very little of the internal implementation cares at all
@@ -134,24 +136,26 @@ incremental way by only parsing chunks at a time, but this is often complex to
do or impractical. For example, many regex engines only accept one contiguous
sequence of bytes at a time with no way to perform incremental matching.
-In summary, conventional UTF-8 byte strings provided by this library are
-definitely useful in some limited circumstances, but how useful they are more
-broadly isn't clear yet.
-
# `bstr` in public APIs
-Since this library is not yet `1.0`, you should not use it in the public API of
-your crates until it hits `1.0` (unless you're OK with with tracking breaking
-releases of `bstr`). It is expected that `bstr 1.0` will be released before
-2022.
+This library is past version `1` and is expected to remain at version `1` for
+the foreseeable future. Therefore, it is encouraged to put types from `bstr`
+(like `BStr` and `BString`) in your public API if that makes sense for your
+crate.
+
+With that said, in general, it should be possible to avoid putting anything
+in this crate into your public APIs. Namely, you should never need to use the
+`ByteSlice` or `ByteVec` traits as bounds on public APIs, since their only
+purpose is to extend the methods on the concrete types `[u8]` and `Vec<u8>`,
+respectively. Similarly, it should not be necessary to put either the `BStr` or
+`BString` types into public APIs. If you want to use them internally, then they
+can be converted to/from `[u8]`/`Vec<u8>` as needed. The conversions are free.
+
+So while it shouldn't ever be 100% necessary to make `bstr` a public
+dependency, there may be cases where it is convenient to do so. This is an
+explicitly supported use case of `bstr`, and as such, major version releases
+should be exceptionally rare.
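For instance, a minimal sketch of the zero-cost conversions mentioned above (assuming default features):

```
use bstr::{BStr, BString, ByteSlice};

// Borrowed: &[u8] <-> &BStr is free in both directions.
let bytes: &[u8] = b"foo bar";
let bstr: &BStr = bytes.as_bstr();
let roundtrip: &[u8] = bstr.as_bytes();
assert_eq!(bytes, roundtrip);

// Owned: Vec<u8> <-> BString is also free in both directions.
let owned: BString = Vec::from("foo bar").into();
let vec: Vec<u8> = owned.into();
assert_eq!(vec, b"foo bar");
```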
-In general, it should be possible to avoid putting anything in this crate into
-your public APIs. Namely, you should never need to use the `ByteSlice` or
-`ByteVec` traits as bounds on public APIs, since their only purpose is to
-extend the methods on the concrete types `[u8]` and `Vec<u8>`, respectively.
-Similarly, it should not be necessary to put either the `BStr` or `BString`
-types into public APIs. If you want to use them internally, then they can
-be converted to/from `[u8]`/`Vec<u8>` as needed.
# Differences with standard strings
@@ -318,7 +322,8 @@ they can do:
by accessing their underlying 16-bit integer representation. Unfortunately,
this isn't zero cost (it introduces a second WTF-8 decoding step) and it's
not clear this is a good thing to do, since WTF-8 should ideally remain an
- internal implementation detail.
+ internal implementation detail. This is roughly the approach taken by the
+ [`os_str_bytes`](https://crates.io/crates/os_str_bytes) crate.
2. One could instead declare that they will not handle paths on Windows that
are not valid UTF-16, and return an error when one is encountered.
3. Like (2), but instead of returning an error, lossily decode the file path
@@ -365,19 +370,57 @@ UTF-8, and thus contain latent bugs on Unix where paths with invalid UTF-8 are
not terribly uncommon. If you instead use byte strings, then you're guaranteed
to write correct code for Unix, at the cost of getting a corner case wrong on
Windows.
+
+# Cargo features
+
+This crate comes with a few features that control standard library, serde
+and Unicode support.
+
+* `std` - **Enabled** by default. This provides APIs that require the standard
+  library, such as `OsString` and `PathBuf`. Enabling this feature also enables
+ the `alloc` feature and any other relevant `std` features for dependencies.
+* `alloc` - **Enabled** by default. This provides APIs that require allocations
+ via the `alloc` crate, such as `Vec<u8>`.
+* `unicode` - **Enabled** by default. This provides APIs that require sizable
+ Unicode data compiled into the binary. This includes, but is not limited to,
+ grapheme/word/sentence segmenters. When this is disabled, basic support such
+ as UTF-8 decoding is still included. Note that currently, enabling this
+ feature also requires enabling the `std` feature. It is expected that this
+ limitation will be lifted at some point.
+* `serde` - Enables implementations of serde traits for `BStr`, and also
+ `BString` when `alloc` is enabled.
*/
-#![cfg_attr(not(feature = "std"), no_std)]
+#![cfg_attr(not(any(feature = "std", test)), no_std)]
+#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+
+// Why do we do this? Well, in order for us to use once_cell's 'Lazy' type to
+// load DFAs, it requires enabling its 'std' feature. Yet, there is really
+// nothing about our 'unicode' feature that requires 'std'. We could declare
+// that 'unicode = [std, ...]', which would be fine, but once regex-automata
+// 0.3 is a thing, I believe we can drop once_cell altogether and thus drop
+// the need for 'std' to be enabled when 'unicode' is enabled. But if we make
+// 'unicode' also enable 'std', then it would be a breaking change to remove
+// 'std' from that list.
+//
+// So, for right now, we force folks to explicitly say they want 'std' if they
+// want 'unicode'. In the future, we should be able to relax this.
+#[cfg(all(feature = "unicode", not(feature = "std")))]
+compile_error!("enabling 'unicode' requires enabling 'std'");
+
+#[cfg(feature = "alloc")]
+extern crate alloc;
pub use crate::bstr::BStr;
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
pub use crate::bstring::BString;
+#[cfg(feature = "unicode")]
+pub use crate::ext_slice::Fields;
pub use crate::ext_slice::{
- ByteSlice, Bytes, Fields, FieldsWith, Find, FindReverse, Finder,
- FinderReverse, Lines, LinesWithTerminator, Split, SplitN, SplitNReverse,
- SplitReverse, B,
+ ByteSlice, Bytes, FieldsWith, Find, FindReverse, Finder, FinderReverse,
+ Lines, LinesWithTerminator, Split, SplitN, SplitNReverse, SplitReverse, B,
};
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
pub use crate::ext_vec::{concat, join, ByteVec, DrainBytes, FromUtf8Error};
#[cfg(feature = "unicode")]
pub use crate::unicode::{
@@ -391,26 +434,28 @@ pub use crate::utf8::{
mod ascii;
mod bstr;
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
mod bstring;
mod byteset;
mod ext_slice;
-#[cfg(feature = "std")]
+#[cfg(feature = "alloc")]
mod ext_vec;
mod impls;
#[cfg(feature = "std")]
pub mod io;
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests;
#[cfg(feature = "unicode")]
mod unicode;
mod utf8;
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod apitests {
- use crate::bstr::BStr;
- use crate::bstring::BString;
- use crate::ext_slice::{Finder, FinderReverse};
+ use crate::{
+ bstr::BStr,
+ bstring::BString,
+ ext_slice::{Finder, FinderReverse},
+ };
#[test]
fn oibits() {
diff --git a/src/tests.rs b/src/tests.rs
index f4179fd..03a4461 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -6,7 +6,7 @@
///
/// The first element in each tuple is the expected result of lossy decoding,
/// while the second element is the input given.
-pub const LOSSY_TESTS: &[(&str, &[u8])] = &[
+pub(crate) const LOSSY_TESTS: &[(&str, &[u8])] = &[
("a", b"a"),
("\u{FFFD}", b"\xFF"),
("\u{FFFD}\u{FFFD}", b"\xFF\xFF"),
diff --git a/src/unicode/data/GraphemeBreakTest.txt b/src/unicode/data/GraphemeBreakTest.txt
index fb4fec9..eff2fd3 100644
--- a/src/unicode/data/GraphemeBreakTest.txt
+++ b/src/unicode/data/GraphemeBreakTest.txt
@@ -1,6 +1,6 @@
-# GraphemeBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:12 GMT
-# © 2019 Unicode®, Inc.
+# GraphemeBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:32 GMT
+# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
diff --git a/src/unicode/data/SentenceBreakTest.txt b/src/unicode/data/SentenceBreakTest.txt
index 7c1c34a..61ea42c 100644
--- a/src/unicode/data/SentenceBreakTest.txt
+++ b/src/unicode/data/SentenceBreakTest.txt
@@ -1,6 +1,6 @@
-# SentenceBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:28 GMT
-# © 2019 Unicode®, Inc.
+# SentenceBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:40 GMT
+# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
diff --git a/src/unicode/data/WordBreakTest.txt b/src/unicode/data/WordBreakTest.txt
index facd892..1d1435b 100644
--- a/src/unicode/data/WordBreakTest.txt
+++ b/src/unicode/data/WordBreakTest.txt
@@ -1,6 +1,6 @@
-# WordBreakTest-12.1.0.txt
-# Date: 2019-03-10, 10:53:29 GMT
-# © 2019 Unicode®, Inc.
+# WordBreakTest-14.0.0.txt
+# Date: 2021-03-08, 06:22:40 GMT
+# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
diff --git a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
index 0efaaf2..31f99c1 100644
--- a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
index eb24025..3a51728 100644
--- a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_fwd.rs b/src/unicode/fsm/grapheme_break_fwd.rs
index b53b1d7..dea4a7e 100644
--- a/src/unicode/fsm/grapheme_break_fwd.rs
+++ b/src/unicode/fsm/grapheme_break_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name GRAPHEME_BREAK_FWD --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
index d42cd36..742d2a6 100644
--- a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
+++ b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
index c75ea5f..d1937f2 100644
--- a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
+++ b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/grapheme_break_rev.rs b/src/unicode/fsm/grapheme_break_rev.rs
index 93e888c..2d2cd54 100644
--- a/src/unicode/fsm/grapheme_break_rev.rs
+++ b/src/unicode/fsm/grapheme_break_rev.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name GRAPHEME_BREAK_REV --reverse --longest --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_rev.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static GRAPHEME_BREAK_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("grapheme_break_rev.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/regional_indicator_rev.rs b/src/unicode/fsm/regional_indicator_rev.rs
index 2bf7e4c..db7a40f 100644
--- a/src/unicode/fsm/regional_indicator_rev.rs
+++ b/src/unicode/fsm/regional_indicator_rev.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name REGIONAL_INDICATOR_REV --reverse --classes --minimize --anchored --premultiply --state-size 1 src/unicode/fsm/ \p{gcb=Regional_Indicator}
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static REGIONAL_INDICATOR_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static REGIONAL_INDICATOR_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
index a1813d7..1abdae8 100644
--- a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
index 2763583..2f8aadd 100644
--- a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/sentence_break_fwd.rs b/src/unicode/fsm/sentence_break_fwd.rs
index cc937a4..97dd658 100644
--- a/src/unicode/fsm/sentence_break_fwd.rs
+++ b/src/unicode/fsm/sentence_break_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name SENTENCE_BREAK_FWD --minimize --sparse --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static SENTENCE_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("sentence_break_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static SENTENCE_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("sentence_break_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/simple_word_fwd.bigendian.dfa b/src/unicode/fsm/simple_word_fwd.bigendian.dfa
index adc64c1..888e465 100644
--- a/src/unicode/fsm/simple_word_fwd.bigendian.dfa
+++ b/src/unicode/fsm/simple_word_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/simple_word_fwd.littleendian.dfa b/src/unicode/fsm/simple_word_fwd.littleendian.dfa
index dd48386..a1d527c 100644
--- a/src/unicode/fsm/simple_word_fwd.littleendian.dfa
+++ b/src/unicode/fsm/simple_word_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/simple_word_fwd.rs b/src/unicode/fsm/simple_word_fwd.rs
index f1f3da5..32b69b6 100644
--- a/src/unicode/fsm/simple_word_fwd.rs
+++ b/src/unicode/fsm/simple_word_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name SIMPLE_WORD_FWD --sparse --minimize --state-size 2 src/unicode/fsm/ \w
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static SIMPLE_WORD_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("simple_word_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = {
+pub static SIMPLE_WORD_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("simple_word_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/whitespace_anchored_fwd.rs b/src/unicode/fsm/whitespace_anchored_fwd.rs
index 419b5d4..0780412 100644
--- a/src/unicode/fsm/whitespace_anchored_fwd.rs
+++ b/src/unicode/fsm/whitespace_anchored_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name WHITESPACE_ANCHORED_FWD --anchored --classes --premultiply --minimize --state-size 1 src/unicode/fsm/ \s+
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static WHITESPACE_ANCHORED_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = {
+pub static WHITESPACE_ANCHORED_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u8], u8>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/whitespace_anchored_rev.rs b/src/unicode/fsm/whitespace_anchored_rev.rs
index 301b03c..3d0d7a6 100644
--- a/src/unicode/fsm/whitespace_anchored_rev.rs
+++ b/src/unicode/fsm/whitespace_anchored_rev.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse --anchored --classes --premultiply --minimize --state-size 2 src/unicode/fsm/ \s+
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = {
+pub static WHITESPACE_ANCHORED_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u16], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u16; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = {
+pub static WHITESPACE_ANCHORED_REV: ::once_cell::sync::Lazy<
+ ::regex_automata::DenseDFA<&'static [u16], u16>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u16; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/fsm/word_break_fwd.bigendian.dfa b/src/unicode/fsm/word_break_fwd.bigendian.dfa
index 1e75db6..efb9c81 100644
--- a/src/unicode/fsm/word_break_fwd.bigendian.dfa
+++ b/src/unicode/fsm/word_break_fwd.bigendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/word_break_fwd.littleendian.dfa b/src/unicode/fsm/word_break_fwd.littleendian.dfa
index e3093a3..9a716d0 100644
--- a/src/unicode/fsm/word_break_fwd.littleendian.dfa
+++ b/src/unicode/fsm/word_break_fwd.littleendian.dfa
Binary files differ
diff --git a/src/unicode/fsm/word_break_fwd.rs b/src/unicode/fsm/word_break_fwd.rs
index fb041b7..dcb5f6b 100644
--- a/src/unicode/fsm/word_break_fwd.rs
+++ b/src/unicode/fsm/word_break_fwd.rs
@@ -2,11 +2,12 @@
//
// ucd-generate dfa --name WORD_BREAK_FWD --sparse --minimize --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)]
//
-// ucd-generate 0.2.9 is available on crates.io.
+// ucd-generate 0.2.12 is available on crates.io.
#[cfg(target_endian = "big")]
-lazy_static::lazy_static! {
- pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static WORD_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -18,15 +19,13 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("word_break_fwd.bigendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
#[cfg(target_endian = "little")]
-lazy_static::lazy_static! {
- pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = {
+pub static WORD_BREAK_FWD: ::once_cell::sync::Lazy<
+ ::regex_automata::SparseDFA<&'static [u8], u32>,
+> = ::once_cell::sync::Lazy::new(|| {
#[repr(C)]
struct Aligned<B: ?Sized> {
_align: [u8; 0],
@@ -38,8 +37,5 @@ lazy_static::lazy_static! {
bytes: *include_bytes!("word_break_fwd.littleendian.dfa"),
};
- unsafe {
- ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes)
- }
- };
-}
+ unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) }
+});
diff --git a/src/unicode/grapheme.rs b/src/unicode/grapheme.rs
index ad31cf1..13b730c 100644
--- a/src/unicode/grapheme.rs
+++ b/src/unicode/grapheme.rs
@@ -1,10 +1,14 @@
use regex_automata::DFA;
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD;
-use crate::unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV;
-use crate::unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV;
-use crate::utf8;
+use crate::{
+ ext_slice::ByteSlice,
+ unicode::fsm::{
+ grapheme_break_fwd::GRAPHEME_BREAK_FWD,
+ grapheme_break_rev::GRAPHEME_BREAK_REV,
+ regional_indicator_rev::REGIONAL_INDICATOR_REV,
+ },
+ utf8,
+};
/// An iterator over grapheme clusters in a byte string.
///
@@ -125,7 +129,7 @@ pub struct GraphemeIndices<'a> {
impl<'a> GraphemeIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> {
- GraphemeIndices { bs: bs, forward_index: 0, reverse_index: bs.len() }
+ GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() }
}
/// View the underlying data as a subslice of the original data.
@@ -191,6 +195,22 @@ impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) {
if bs.is_empty() {
("", 0)
+ } else if bs.len() >= 2
+ && bs[0].is_ascii()
+ && bs[1].is_ascii()
+ && !bs[0].is_ascii_whitespace()
+ {
+ // FIXME: It is somewhat sad that we have to special case this, but it
+ // leads to a significant speed up in predominantly ASCII text. The
+ // issue here is that the DFA has a bit of overhead, and running it for
+ // every byte in mostly ASCII text results in a bit of a slowdown. We should
+ // re-litigate this once regex-automata 0.3 is out, but it might be
+ // hard to avoid the special case. A DFA is always going to at least
+ // require some memory access.
+
+ // Safe because all ASCII bytes are valid UTF-8.
+ let grapheme = unsafe { bs[..1].to_str_unchecked() };
+ (grapheme, 1)
} else if let Some(end) = GRAPHEME_BREAK_FWD.find(bs) {
// Safe because a match can only occur for valid UTF-8.
let grapheme = unsafe { bs[..end].to_str_unchecked() };
@@ -257,15 +277,17 @@ fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
+ #[cfg(not(miri))]
use ucd_parse::GraphemeClusterBreakTest;
+ use crate::{ext_slice::ByteSlice, tests::LOSSY_TESTS};
+
use super::*;
- use crate::ext_slice::ByteSlice;
- use crate::tests::LOSSY_TESTS;
#[test]
+ #[cfg(not(miri))]
fn forward_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.grapheme_clusters.concat();
@@ -288,6 +310,7 @@ mod tests {
}
#[test]
+ #[cfg(not(miri))]
fn reverse_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.grapheme_clusters.concat();
@@ -329,15 +352,18 @@ mod tests {
}
}
+ #[cfg(not(miri))]
fn uniescape(s: &str) -> String {
s.chars().flat_map(|c| c.escape_unicode()).collect::<String>()
}
+ #[cfg(not(miri))]
fn uniescape_vec(strs: &[String]) -> Vec<String> {
strs.iter().map(|s| uniescape(s)).collect()
}
/// Return all of the UCD for grapheme breaks.
+ #[cfg(not(miri))]
fn ucdtests() -> Vec<GraphemeClusterBreakTest> {
const TESTDATA: &'static str =
include_str!("data/GraphemeBreakTest.txt");
diff --git a/src/unicode/mod.rs b/src/unicode/mod.rs
index 60318f4..80638e8 100644
--- a/src/unicode/mod.rs
+++ b/src/unicode/mod.rs
@@ -1,8 +1,8 @@
-pub use self::grapheme::{decode_grapheme, GraphemeIndices, Graphemes};
-pub use self::sentence::{SentenceIndices, Sentences};
-pub use self::whitespace::{whitespace_len_fwd, whitespace_len_rev};
-pub use self::word::{
- WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks,
+pub use self::{
+ grapheme::{decode_grapheme, GraphemeIndices, Graphemes},
+ sentence::{SentenceIndices, Sentences},
+ whitespace::{whitespace_len_fwd, whitespace_len_rev},
+ word::{WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks},
};
mod fsm;
diff --git a/src/unicode/sentence.rs b/src/unicode/sentence.rs
index 063f342..ff29c7e 100644
--- a/src/unicode/sentence.rs
+++ b/src/unicode/sentence.rs
@@ -1,8 +1,9 @@
use regex_automata::DFA;
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD;
-use crate::utf8;
+use crate::{
+ ext_slice::ByteSlice,
+ unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8,
+};
/// An iterator over sentences in a byte string.
///
@@ -97,7 +98,7 @@ pub struct SentenceIndices<'a> {
impl<'a> SentenceIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> {
- SentenceIndices { bs: bs, forward_index: 0 }
+ SentenceIndices { bs, forward_index: 0 }
}
/// View the underlying data as a subslice of the original data.
@@ -156,13 +157,15 @@ fn decode_sentence(bs: &[u8]) -> (&str, usize) {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
+ #[cfg(not(miri))]
use ucd_parse::SentenceBreakTest;
use crate::ext_slice::ByteSlice;
#[test]
+ #[cfg(not(miri))]
fn forward_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.sentences.concat();
@@ -198,11 +201,13 @@ mod tests {
bytes.sentences().collect()
}
+ #[cfg(not(miri))]
fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
strs.iter().map(|s| s.as_ref().as_bytes()).collect()
}
/// Return all of the UCD for sentence breaks.
+ #[cfg(not(miri))]
fn ucdtests() -> Vec<SentenceBreakTest> {
const TESTDATA: &'static str =
include_str!("data/SentenceBreakTest.txt");
diff --git a/src/unicode/whitespace.rs b/src/unicode/whitespace.rs
index 949a83f..b5eff30 100644
--- a/src/unicode/whitespace.rs
+++ b/src/unicode/whitespace.rs
@@ -1,7 +1,9 @@
use regex_automata::DFA;
-use crate::unicode::fsm::whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD;
-use crate::unicode::fsm::whitespace_anchored_rev::WHITESPACE_ANCHORED_REV;
+use crate::unicode::fsm::{
+ whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD,
+ whitespace_anchored_rev::WHITESPACE_ANCHORED_REV,
+};
/// Return the first position of a non-whitespace character.
pub fn whitespace_len_fwd(slice: &[u8]) -> usize {
diff --git a/src/unicode/word.rs b/src/unicode/word.rs
index e0a5701..849f0c8 100644
--- a/src/unicode/word.rs
+++ b/src/unicode/word.rs
@@ -1,9 +1,12 @@
use regex_automata::DFA;
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD;
-use crate::unicode::fsm::word_break_fwd::WORD_BREAK_FWD;
-use crate::utf8;
+use crate::{
+ ext_slice::ByteSlice,
+ unicode::fsm::{
+ simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD,
+ },
+ utf8,
+};
/// An iterator over words in a byte string.
///
@@ -254,7 +257,7 @@ pub struct WordsWithBreakIndices<'a> {
impl<'a> WordsWithBreakIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> {
- WordsWithBreakIndices { bs: bs, forward_index: 0 }
+ WordsWithBreakIndices { bs, forward_index: 0 }
}
/// View the underlying data as a subslice of the original data.
@@ -316,13 +319,15 @@ fn decode_word(bs: &[u8]) -> (&str, usize) {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
+ #[cfg(not(miri))]
use ucd_parse::WordBreakTest;
use crate::ext_slice::ByteSlice;
#[test]
+ #[cfg(not(miri))]
fn forward_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.words.concat();
@@ -379,17 +384,26 @@ mod tests {
assert_eq!(vec!["1XY"], words(b"1XY"));
assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes()));
+
+ // Tests that Vithkuqi works, which was introduced in Unicode 14.
+ // This test fails prior to Unicode 14.
+ assert_eq!(
+ vec!["\u{10570}\u{10597}"],
+ words("\u{10570}\u{10597}".as_bytes())
+ );
}
fn words(bytes: &[u8]) -> Vec<&str> {
bytes.words_with_breaks().collect()
}
+ #[cfg(not(miri))]
fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
strs.iter().map(|s| s.as_ref().as_bytes()).collect()
}
/// Return all of the UCD for word breaks.
+ #[cfg(not(miri))]
fn ucdtests() -> Vec<WordBreakTest> {
const TESTDATA: &'static str = include_str!("data/WordBreakTest.txt");
diff --git a/src/utf8.rs b/src/utf8.rs
index 5c7de36..bc9bc52 100644
--- a/src/utf8.rs
+++ b/src/utf8.rs
@@ -1,13 +1,9 @@
-use core::char;
-use core::cmp;
-use core::fmt;
-use core::str;
+use core::{char, cmp, fmt, str};
+
#[cfg(feature = "std")]
use std::error;
-use crate::ascii;
-use crate::bstr::BStr;
-use crate::ext_slice::ByteSlice;
+use crate::{ascii, bstr::BStr, ext_slice::ByteSlice};
// The UTF-8 decoder provided here is based on the one presented here:
// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
@@ -75,7 +71,7 @@ const STATES_FORWARD: &'static [u8] = &[
///
/// When invalid UTF-8 byte sequences are found, they are substituted with the
/// Unicode replacement codepoint (`U+FFFD`) using the
-/// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html).
+/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
///
/// This iterator is created by the
/// [`chars`](trait.ByteSlice.html#method.chars) method provided by the
@@ -146,7 +142,7 @@ impl<'a> DoubleEndedIterator for Chars<'a> {
///
/// When invalid UTF-8 byte sequences are found, they are substituted with the
/// Unicode replacement codepoint (`U+FFFD`) using the
-/// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html).
+/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
///
/// Note that this is slightly different from the `CharIndices` iterator
/// provided by the standard library. Aside from working on possibly invalid
@@ -168,7 +164,7 @@ pub struct CharIndices<'a> {
impl<'a> CharIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> {
- CharIndices { bs: bs, forward_index: 0, reverse_index: bs.len() }
+ CharIndices { bs, forward_index: 0, reverse_index: bs.len() }
}
/// View the underlying data as a subslice of the original data.
@@ -406,7 +402,7 @@ impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {}
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
/// ```
-#[derive(Debug, Eq, PartialEq)]
+#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Utf8Error {
valid_up_to: usize,
error_len: Option<usize>,
@@ -854,13 +850,15 @@ fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
(b & 0b1100_0000) != 0b1000_0000
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
use std::char;
- use crate::ext_slice::{ByteSlice, B};
- use crate::tests::LOSSY_TESTS;
- use crate::utf8::{self, Utf8Error};
+ use crate::{
+ ext_slice::{ByteSlice, B},
+ tests::LOSSY_TESTS,
+ utf8::{self, Utf8Error},
+ };
fn utf8e(valid_up_to: usize) -> Utf8Error {
Utf8Error { valid_up_to, error_len: None }
@@ -871,6 +869,7 @@ mod tests {
}
#[test]
+ #[cfg(not(miri))]
fn validate_all_codepoints() {
for i in 0..(0x10FFFF + 1) {
let cp = match char::from_u32(i) {