aboutsummaryrefslogtreecommitdiff
path: root/src/unicode/grapheme.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/unicode/grapheme.rs')
-rw-r--r--src/unicode/grapheme.rs44
1 files changed, 35 insertions, 9 deletions
diff --git a/src/unicode/grapheme.rs b/src/unicode/grapheme.rs
index ad31cf1..13b730c 100644
--- a/src/unicode/grapheme.rs
+++ b/src/unicode/grapheme.rs
@@ -1,10 +1,14 @@
use regex_automata::DFA;
-use crate::ext_slice::ByteSlice;
-use crate::unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD;
-use crate::unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV;
-use crate::unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV;
-use crate::utf8;
+use crate::{
+ ext_slice::ByteSlice,
+ unicode::fsm::{
+ grapheme_break_fwd::GRAPHEME_BREAK_FWD,
+ grapheme_break_rev::GRAPHEME_BREAK_REV,
+ regional_indicator_rev::REGIONAL_INDICATOR_REV,
+ },
+ utf8,
+};
/// An iterator over grapheme clusters in a byte string.
///
@@ -125,7 +129,7 @@ pub struct GraphemeIndices<'a> {
impl<'a> GraphemeIndices<'a> {
pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> {
- GraphemeIndices { bs: bs, forward_index: 0, reverse_index: bs.len() }
+ GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() }
}
/// View the underlying data as a subslice of the original data.
@@ -191,6 +195,22 @@ impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) {
if bs.is_empty() {
("", 0)
+ } else if bs.len() >= 2
+ && bs[0].is_ascii()
+ && bs[1].is_ascii()
+ && !bs[0].is_ascii_whitespace()
+ {
+ // FIXME: It is somewhat sad that we have to special case this, but it
+ // leads to a significant speed up in predominantly ASCII text. The
+ // issue here is that the DFA has a bit of overhead, and running it for
+ // every byte in mostly ASCII text results in a bit slowdown. We should
+ // re-litigate this once regex-automata 0.3 is out, but it might be
+ // hard to avoid the special case. A DFA is always going to at least
+ // require some memory access.
+
+ // Safe because all ASCII bytes are valid UTF-8.
+ let grapheme = unsafe { bs[..1].to_str_unchecked() };
+ (grapheme, 1)
} else if let Some(end) = GRAPHEME_BREAK_FWD.find(bs) {
// Safe because a match can only occur for valid UTF-8.
let grapheme = unsafe { bs[..end].to_str_unchecked() };
@@ -257,15 +277,17 @@ fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize {
}
}
-#[cfg(test)]
+#[cfg(all(test, feature = "std"))]
mod tests {
+ #[cfg(not(miri))]
use ucd_parse::GraphemeClusterBreakTest;
+ use crate::{ext_slice::ByteSlice, tests::LOSSY_TESTS};
+
use super::*;
- use crate::ext_slice::ByteSlice;
- use crate::tests::LOSSY_TESTS;
#[test]
+ #[cfg(not(miri))]
fn forward_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.grapheme_clusters.concat();
@@ -288,6 +310,7 @@ mod tests {
}
#[test]
+ #[cfg(not(miri))]
fn reverse_ucd() {
for (i, test) in ucdtests().into_iter().enumerate() {
let given = test.grapheme_clusters.concat();
@@ -329,15 +352,18 @@ mod tests {
}
}
+ #[cfg(not(miri))]
fn uniescape(s: &str) -> String {
s.chars().flat_map(|c| c.escape_unicode()).collect::<String>()
}
+ #[cfg(not(miri))]
fn uniescape_vec(strs: &[String]) -> Vec<String> {
strs.iter().map(|s| uniescape(s)).collect()
}
/// Return all of the UCD for grapheme breaks.
+ #[cfg(not(miri))]
fn ucdtests() -> Vec<GraphemeClusterBreakTest> {
const TESTDATA: &'static str =
include_str!("data/GraphemeBreakTest.txt");