aboutsummaryrefslogtreecommitdiff
path: root/src/grapheme.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/grapheme.rs')
-rw-r--r--src/grapheme.rs146
1 files changed, 95 insertions, 51 deletions
diff --git a/src/grapheme.rs b/src/grapheme.rs
index 190b86e..eb554c9 100644
--- a/src/grapheme.rs
+++ b/src/grapheme.rs
@@ -49,7 +49,9 @@ impl<'a> Iterator for GraphemeIndices<'a> {
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
- self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
+ self.iter
+ .next()
+ .map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
@@ -61,7 +63,9 @@ impl<'a> Iterator for GraphemeIndices<'a> {
impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
- self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
+ self.iter
+ .next_back()
+ .map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
}
@@ -126,7 +130,11 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
if end == self.cursor.cur_cursor() {
return None;
}
- let prev = self.cursor_back.prev_boundary(self.string, 0).unwrap().unwrap();
+ let prev = self
+ .cursor_back
+ .prev_boundary(self.string, 0)
+ .unwrap()
+ .unwrap();
Some(&self.string[prev..end])
}
}
@@ -143,7 +151,10 @@ pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
#[inline]
pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
- GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) }
+ GraphemeIndices {
+ start_offset: s.as_ptr() as usize,
+ iter: new_graphemes(s, is_extended),
+ }
}
// maybe unify with PairResult?
@@ -165,6 +176,9 @@ enum GraphemeState {
}
/// Cursor-based segmenter for grapheme clusters.
+///
+/// This allows working with ropes and other data structures where the string is not contiguous
+/// or fully known at initialization time.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
// Current cursor position.
@@ -212,7 +226,7 @@ pub enum GraphemeIncomplete {
/// current chunk, so the chunk after that is requested. This will only be
/// returned if the chunk ends before the `len` parameter provided on
/// creation of the cursor.
- NextChunk, // requesting chunk following the one given
+ NextChunk, // requesting chunk following the one given
/// An error returned when the chunk given does not contain the cursor position.
InvalidOffset,
@@ -221,42 +235,42 @@ pub enum GraphemeIncomplete {
// An enum describing the result from lookup of a pair of categories.
#[derive(PartialEq, Eq)]
enum PairResult {
- NotBreak, // definitely not a break
- Break, // definitely a break
- Extended, // a break iff not in extended mode
- Regional, // a break if preceded by an even number of RIS
- Emoji, // a break if preceded by emoji base and (Extend)*
+ NotBreak, // definitely not a break
+ Break, // definitely a break
+ Extended, // a break iff not in extended mode
+ Regional, // a break if preceded by an even number of RIS
+ Emoji, // a break unless preceded by emoji base and (Extend)*
}
#[inline]
fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
- use crate::tables::grapheme::GraphemeCat::*;
use self::PairResult::*;
+ use crate::tables::grapheme::GraphemeCat::*;
match (before, after) {
- (GC_CR, GC_LF) => NotBreak, // GB3
- (GC_Control, _) => Break, // GB4
- (GC_CR, _) => Break, // GB4
- (GC_LF, _) => Break, // GB4
- (_, GC_Control) => Break, // GB5
- (_, GC_CR) => Break, // GB5
- (_, GC_LF) => Break, // GB5
- (GC_L, GC_L) => NotBreak, // GB6
- (GC_L, GC_V) => NotBreak, // GB6
- (GC_L, GC_LV) => NotBreak, // GB6
- (GC_L, GC_LVT) => NotBreak, // GB6
- (GC_LV, GC_V) => NotBreak, // GB7
- (GC_LV, GC_T) => NotBreak, // GB7
- (GC_V, GC_V) => NotBreak, // GB7
- (GC_V, GC_T) => NotBreak, // GB7
- (GC_LVT, GC_T) => NotBreak, // GB8
- (GC_T, GC_T) => NotBreak, // GB8
- (_, GC_Extend) => NotBreak, // GB9
- (_, GC_ZWJ) => NotBreak, // GB9
- (_, GC_SpacingMark) => Extended, // GB9a
- (GC_Prepend, _) => Extended, // GB9b
- (GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
- (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
- (_, _) => Break, // GB999
+ (GC_CR, GC_LF) => NotBreak, // GB3
+ (GC_Control, _) => Break, // GB4
+ (GC_CR, _) => Break, // GB4
+ (GC_LF, _) => Break, // GB4
+ (_, GC_Control) => Break, // GB5
+ (_, GC_CR) => Break, // GB5
+ (_, GC_LF) => Break, // GB5
+ (GC_L, GC_L) => NotBreak, // GB6
+ (GC_L, GC_V) => NotBreak, // GB6
+ (GC_L, GC_LV) => NotBreak, // GB6
+ (GC_L, GC_LVT) => NotBreak, // GB6
+ (GC_LV, GC_V) => NotBreak, // GB7
+ (GC_LV, GC_T) => NotBreak, // GB7
+ (GC_V, GC_V) => NotBreak, // GB7
+ (GC_V, GC_T) => NotBreak, // GB7
+ (GC_LVT, GC_T) => NotBreak, // GB8
+ (GC_T, GC_T) => NotBreak, // GB8
+ (_, GC_Extend) => NotBreak, // GB9
+ (_, GC_ZWJ) => NotBreak, // GB9
+ (_, GC_SpacingMark) => Extended, // GB9a
+ (GC_Prepend, _) => Extended, // GB9b
+ (GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
+ (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
+ (_, _) => Break, // GB999
}
}
@@ -394,17 +408,19 @@ impl GraphemeCursor {
if self.is_extended && chunk_start + chunk.len() == self.offset {
let ch = chunk.chars().rev().next().unwrap();
if self.grapheme_category(ch) == gr::GC_Prepend {
- self.decide(false); // GB9b
+ self.decide(false); // GB9b
return;
}
}
match self.state {
GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
- _ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
- let ch = chunk.chars().rev().next().unwrap();
- self.cat_before = Some(self.grapheme_category(ch));
- },
+ _ => {
+ if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
+ let ch = chunk.chars().rev().next().unwrap();
+ self.cat_before = Some(self.grapheme_category(ch));
+ }
+ }
}
}
@@ -512,17 +528,21 @@ impl GraphemeCursor {
/// cursor.set_cursor(12);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
/// ```
- pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete> {
+ pub fn is_boundary(
+ &mut self,
+ chunk: &str,
+ chunk_start: usize,
+ ) -> Result<bool, GraphemeIncomplete> {
use crate::tables::grapheme as gr;
if self.state == GraphemeState::Break {
- return Ok(true)
+ return Ok(true);
}
if self.state == GraphemeState::NotBreak {
- return Ok(false)
+ return Ok(false);
}
if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
- return Err(GraphemeIncomplete::InvalidOffset)
+ return Err(GraphemeIncomplete::InvalidOffset);
}
}
if let Some(pre_context_offset) = self.pre_context_offset {
@@ -603,7 +623,11 @@ impl GraphemeCursor {
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
/// ```
- pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
+ pub fn next_boundary(
+ &mut self,
+ chunk: &str,
+ chunk_start: usize,
+ ) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == self.len {
return Ok(None);
}
@@ -678,7 +702,11 @@ impl GraphemeCursor {
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
/// ```
- pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
+ pub fn prev_boundary(
+ &mut self,
+ chunk: &str,
+ chunk_start: usize,
+ ) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == 0 {
return Ok(None);
}
@@ -699,7 +727,11 @@ impl GraphemeCursor {
self.cat_after = self.cat_before.take();
self.state = GraphemeState::Unknown;
if let Some(ris_count) = self.ris_count {
- self.ris_count = if ris_count > 0 { Some(ris_count - 1) } else { None };
+ self.ris_count = if ris_count > 0 {
+ Some(ris_count - 1)
+ } else {
+ None
+ };
}
if let Some(prev_ch) = iter.next() {
ch = prev_ch;
@@ -726,7 +758,10 @@ impl GraphemeCursor {
fn test_grapheme_cursor_ris_precontext() {
let s = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
let mut c = GraphemeCursor::new(8, s.len(), true);
- assert_eq!(c.is_boundary(&s[4..], 4), Err(GraphemeIncomplete::PreContext(4)));
+ assert_eq!(
+ c.is_boundary(&s[4..], 4),
+ Err(GraphemeIncomplete::PreContext(4))
+ );
c.provide_context(&s[..4], 0);
assert_eq!(c.is_boundary(&s[4..], 4), Ok(true));
}
@@ -735,7 +770,10 @@ fn test_grapheme_cursor_ris_precontext() {
fn test_grapheme_cursor_chunk_start_require_precontext() {
let s = "\r\n";
let mut c = GraphemeCursor::new(1, s.len(), true);
- assert_eq!(c.is_boundary(&s[1..], 1), Err(GraphemeIncomplete::PreContext(1)));
+ assert_eq!(
+ c.is_boundary(&s[1..], 1),
+ Err(GraphemeIncomplete::PreContext(1))
+ );
c.provide_context(&s[..1], 0);
assert_eq!(c.is_boundary(&s[1..], 1), Ok(false));
}
@@ -744,7 +782,10 @@ fn test_grapheme_cursor_chunk_start_require_precontext() {
fn test_grapheme_cursor_prev_boundary() {
let s = "abcd";
let mut c = GraphemeCursor::new(3, s.len(), true);
- assert_eq!(c.prev_boundary(&s[2..], 2), Err(GraphemeIncomplete::PrevChunk));
+ assert_eq!(
+ c.prev_boundary(&s[2..], 2),
+ Err(GraphemeIncomplete::PrevChunk)
+ );
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(2)));
}
@@ -752,6 +793,9 @@ fn test_grapheme_cursor_prev_boundary() {
fn test_grapheme_cursor_prev_boundary_chunk_start() {
let s = "abcd";
let mut c = GraphemeCursor::new(2, s.len(), true);
- assert_eq!(c.prev_boundary(&s[2..], 2), Err(GraphemeIncomplete::PrevChunk));
+ assert_eq!(
+ c.prev_boundary(&s[2..], 2),
+ Err(GraphemeIncomplete::PrevChunk)
+ );
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1)));
}