aboutsummaryrefslogtreecommitdiff
path: root/src/windows/wtf8/code_points.rs
blob: b265db332412f9e6207bb650837d938e9e85c9d3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
use std::iter::Peekable;
use std::mem;

use crate::util::is_continuation;
use crate::util::BYTE_SHIFT;
use crate::util::CONT_MASK;

use super::EncodingError;
use super::Result;

/// Iterator adapter that reads bytes from `I` and yields decoded code points
/// as `Result<u32>` items.
pub(in super::super) struct CodePoints<I: Iterator<Item = u8>> {
    /// Underlying byte source; peekable so that a byte can be inspected
    /// before deciding whether to consume it.
    iter: Peekable<I>,
    /// Set while decoding a three-byte sequence that falls in the first half
    /// of the surrogate range, and cleared at the start of every call to
    /// `next` and whenever a non-continuation byte is encountered.
    surrogate: bool,
}

impl<I> CodePoints<I>
where
    I: Iterator<Item = u8>,
{
    /// Builds a decoder over the bytes produced by `string`.
    ///
    /// The surrogate-tracking flag starts out cleared.
    pub(in super::super) fn new<S>(string: S) -> Self
    where
        S: IntoIterator<IntoIter = I, Item = I::Item>,
    {
        let iter = string.into_iter().peekable();
        Self {
            iter,
            surrogate: false,
        }
    }

    /// Folds the next continuation byte into `code_point`.
    ///
    /// The upcoming byte is only peeked at first. If it is a continuation
    /// byte, its masked bits are shifted into `code_point` and the byte is
    /// consumed.
    ///
    /// # Errors
    ///
    /// - `EncodingError::End` when the input has no more bytes.
    /// - `EncodingError::Byte` when the upcoming byte is not a continuation
    ///   byte. In that case the byte is left in the iterator and the
    ///   surrogate flag is cleared.
    fn consume_next(&mut self, code_point: &mut u32) -> Result<()> {
        let byte = match self.iter.peek() {
            Some(&byte) => byte,
            None => return Err(EncodingError::End()),
        };

        if is_continuation(byte) {
            let payload = u32::from(byte & CONT_MASK);
            *code_point = (*code_point << BYTE_SHIFT) | payload;

            // The byte was only peeked so far; advance past it now.
            let consumed = self.iter.next();
            debug_assert_eq!(Some(byte), consumed);
            Ok(())
        } else {
            self.surrogate = false;
            // The offending byte is deliberately left unconsumed, which
            // would allow a future lossy decoder to resume from it.
            Err(EncodingError::Byte(byte))
        }
    }

    /// Returns the size hint of the underlying peekable byte iterator.
    ///
    /// The bounds count remaining bytes, not code points.
    pub(super) fn inner_size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}

impl<I> Iterator for CodePoints<I>
where
    I: Iterator<Item = u8>,
{
    type Item = Result<u32>;

    /// Decodes and returns the next code point, or `None` once the byte
    /// source is exhausted.
    ///
    /// Malformed input is reported as `Some(Err(..))`; see
    /// `decode_multibyte` for the errors raised by multi-byte sequences.
    fn next(&mut self) -> Option<Self::Item> {
        let lead = self.iter.next()?;

        // Remember whether the previous item set the surrogate flag, and
        // clear the flag for the item being decoded now.
        let after_surrogate = mem::replace(&mut self.surrogate, false);

        let decoded = if lead.is_ascii() {
            Ok(u32::from(lead))
        } else if lead < 0xC2 {
            // Non-ASCII bytes below 0xC2 are never accepted as the first
            // byte of a sequence.
            Err(EncodingError::Byte(lead))
        } else {
            self.decode_multibyte(lead, after_surrogate)
        };
        Some(decoded)
    }
}

impl<I> CodePoints<I>
where
    I: Iterator<Item = u8>,
{
    /// Decodes the multi-byte sequence that starts with `lead`, which the
    /// caller has already consumed and checked to be at least `0xC2`.
    ///
    /// `after_surrogate` is the value the surrogate flag held before this
    /// sequence started.
    ///
    /// # Errors
    ///
    /// - Any error from `consume_next` (missing or non-continuation byte) is
    ///   returned immediately.
    /// - `EncodingError::CodePoint` is returned, after all continuation
    ///   bytes have been consumed, when the sequence is well-formed
    ///   byte-wise but fails one of the range checks below.
    fn decode_multibyte(
        &mut self,
        lead: u8,
        after_surrogate: bool,
    ) -> Result<u32> {
        let mut code_point = u32::from(lead);
        let mut invalid = false;

        if lead >= 0xE0 {
            // Three- and four-byte sequences keep the low four bits of the
            // lead byte and always have at least two continuation bytes.
            code_point &= 0x0F;
            self.consume_next(&mut code_point)?;

            if lead >= 0xF0 {
                // Only partial values in 0x10..=0x10F are accepted at this
                // point; anything else is flagged once decoding finishes.
                invalid = code_point.wrapping_sub(0x10) >= 0x100;
                self.consume_next(&mut code_point)?;
            } else if (code_point & 0xFE0) == 0x360 {
                // Bit pattern chosen to detect surrogate code points.
                let first_half = (code_point & 0x10) == 0;
                if first_half {
                    self.surrogate = true;
                } else {
                    // Decoding a broken surrogate pair would be lossy.
                    invalid = after_surrogate;
                }
            }

            // Partial values this small are rejected as well.
            invalid |= code_point < 0x20;
        } else {
            // Two-byte sequences keep the low five bits of the lead byte.
            code_point &= 0x1F;
        }

        // Every multi-byte form ends with one final continuation byte.
        self.consume_next(&mut code_point)?;

        if invalid {
            Err(EncodingError::CodePoint(code_point))
        } else {
            Ok(code_point)
        }
    }
}