diff options
Diffstat (limited to 'src/lexer/tokenizer.rs')
-rw-r--r-- | src/lexer/tokenizer.rs | 330 |
1 file changed, 330 insertions, 0 deletions
diff --git a/src/lexer/tokenizer.rs b/src/lexer/tokenizer.rs new file mode 100644 index 0000000..c5e84a0 --- /dev/null +++ b/src/lexer/tokenizer.rs @@ -0,0 +1,330 @@ +use crate::lexer::lexer_impl::Lexer; +use crate::lexer::lexer_impl::LexerError; +use crate::lexer::loc::Loc; +use crate::lexer::parser_language::ParserLanguage; +use crate::lexer::str_lit::StrLit; +use crate::lexer::str_lit::StrLitDecodeError; +use crate::lexer::token::Token; +use crate::lexer::token::TokenWithLocation; + +#[derive(Debug, thiserror::Error)] +pub enum TokenizerError { + #[error(transparent)] + LexerError(#[from] LexerError), + #[error(transparent)] + StrLitDecodeError(#[from] StrLitDecodeError), + #[error("Internal tokenizer error")] + InternalError, + // TODO: too broad + #[error("Incorrect input")] + IncorrectInput, + #[error("Not allowed in this context: {0}")] + NotAllowedInThisContext(&'static str), + #[error("Unexpected end of input")] + UnexpectedEof, + #[error("Expecting string literal")] + ExpectStrLit, + #[error("Expecting int literal")] + ExpectIntLit, + #[error("Expecting float literal")] + ExpectFloatLit, + #[error("Expecting identifier")] + ExpectIdent, + #[error("Expecting identifier `{}`", .0)] + ExpectNamedIdent(String), + #[error("While parsing {}, expecting char `{}`", .1, .0)] + ExpectChar(char, &'static str), + #[error("Expecting any char of: {}", .0.iter().map(|c| format!("`{}`", c)).collect::<Vec<_>>().join(", "))] + ExpectAnyChar(Vec<char>), +} + +pub type TokenizerResult<R> = Result<R, TokenizerError>; + +#[derive(Clone)] +pub struct Tokenizer<'a> { + lexer: Lexer<'a>, + next_token: Option<TokenWithLocation>, + last_token_loc: Option<Loc>, +} + +impl<'a> Tokenizer<'a> { + pub fn new(input: &'a str, comment_style: ParserLanguage) -> Tokenizer<'a> { + Tokenizer { + lexer: Lexer::new(input, comment_style), + next_token: None, + last_token_loc: None, + } + } + + pub fn loc(&self) -> Loc { + // After lookahead return the location of the next token + self.next_token 
+ .as_ref() + .map(|t| t.loc.clone()) + // After token consumed return the location of that token + .or(self.last_token_loc.clone()) + // Otherwise return the position of lexer + .unwrap_or(self.lexer.loc) + } + + pub fn lookahead_loc(&mut self) -> Loc { + drop(self.lookahead()); + // TODO: does not handle EOF properly + self.loc() + } + + fn lookahead(&mut self) -> TokenizerResult<Option<&Token>> { + Ok(match self.next_token { + Some(ref token) => Some(&token.token), + None => { + self.next_token = self.lexer.next_token()?; + self.last_token_loc = self.next_token.as_ref().map(|t| t.loc.clone()); + match self.next_token { + Some(ref token) => Some(&token.token), + None => None, + } + } + }) + } + + pub fn lookahead_some(&mut self) -> TokenizerResult<&Token> { + match self.lookahead()? { + Some(token) => Ok(token), + None => Err(TokenizerError::UnexpectedEof), + } + } + + fn next(&mut self) -> TokenizerResult<Option<Token>> { + self.lookahead()?; + Ok(self + .next_token + .take() + .map(|TokenWithLocation { token, .. }| token)) + } + + pub fn next_some(&mut self) -> TokenizerResult<Token> { + match self.next()? { + Some(token) => Ok(token), + None => Err(TokenizerError::UnexpectedEof), + } + } + + /// Can be called only after lookahead, otherwise it's error + pub fn advance(&mut self) -> TokenizerResult<Token> { + self.next_token + .take() + .map(|TokenWithLocation { token, .. 
}| token) + .ok_or(TokenizerError::InternalError) + } + + /// No more tokens + pub fn syntax_eof(&mut self) -> TokenizerResult<bool> { + Ok(self.lookahead()?.is_none()) + } + + pub fn next_token_if_map<P, R>(&mut self, p: P) -> TokenizerResult<Option<R>> + where + P: FnOnce(&Token) -> Option<R>, + { + self.lookahead()?; + let v = match self.next_token { + Some(ref token) => match p(&token.token) { + Some(v) => v, + None => return Ok(None), + }, + _ => return Ok(None), + }; + self.next_token = None; + Ok(Some(v)) + } + + pub fn next_token_check_map<P, R, E>(&mut self, p: P) -> Result<R, E> + where + P: FnOnce(&Token) -> Result<R, E>, + E: From<TokenizerError>, + { + self.lookahead()?; + let r = match self.next_token { + Some(ref token) => p(&token.token)?, + None => return Err(TokenizerError::UnexpectedEof.into()), + }; + self.next_token = None; + Ok(r) + } + + fn next_token_if<P>(&mut self, p: P) -> TokenizerResult<Option<Token>> + where + P: FnOnce(&Token) -> bool, + { + self.next_token_if_map(|token| if p(token) { Some(token.clone()) } else { None }) + } + + pub fn next_ident_if_in(&mut self, idents: &[&str]) -> TokenizerResult<Option<String>> { + let v = match self.lookahead()? { + Some(&Token::Ident(ref next)) => { + if idents.into_iter().find(|&i| i == next).is_some() { + next.clone() + } else { + return Ok(None); + } + } + _ => return Ok(None), + }; + self.advance()?; + Ok(Some(v)) + } + + pub fn next_ident_if_eq(&mut self, word: &str) -> TokenizerResult<bool> { + Ok(self.next_ident_if_in(&[word])? != None) + } + + pub fn next_ident_expect_eq(&mut self, word: &str) -> TokenizerResult<()> { + if self.next_ident_if_eq(word)? { + Ok(()) + } else { + Err(TokenizerError::ExpectNamedIdent(word.to_owned())) + } + } + + pub fn next_ident_if_eq_error(&mut self, word: &'static str) -> TokenizerResult<()> { + if self.clone().next_ident_if_eq(word)? { + // TODO: which context? 
+ return Err(TokenizerError::NotAllowedInThisContext(word)); + } + Ok(()) + } + + pub fn next_symbol_if_eq(&mut self, symbol: char) -> TokenizerResult<bool> { + Ok(self.next_token_if(|token| match token { + &Token::Symbol(c) if c == symbol => true, + _ => false, + })? != None) + } + + pub fn next_symbol_expect_eq( + &mut self, + symbol: char, + desc: &'static str, + ) -> TokenizerResult<()> { + if self.lookahead_is_symbol(symbol)? { + self.advance()?; + Ok(()) + } else { + Err(TokenizerError::ExpectChar(symbol, desc)) + } + } + + pub fn next_symbol_expect_eq_oneof(&mut self, symbols: &[char]) -> TokenizerResult<char> { + for symbol in symbols { + if let Ok(()) = self.next_symbol_expect_eq(*symbol, "ignored") { + return Ok(*symbol); + } + } + Err(TokenizerError::ExpectAnyChar(symbols.to_owned())) + } + + pub fn lookahead_is_str_lit(&mut self) -> TokenizerResult<bool> { + Ok(match self.lookahead()? { + Some(&Token::StrLit(..)) => true, + _ => false, + }) + } + + pub fn lookahead_is_int_lit(&mut self) -> TokenizerResult<bool> { + Ok(match self.lookahead()? { + Some(&Token::IntLit(..)) => true, + _ => false, + }) + } + + pub fn lookahead_is_json_number(&mut self) -> TokenizerResult<bool> { + Ok(match self.lookahead()? { + Some(&Token::JsonNumber(..)) => true, + _ => false, + }) + } + + pub fn lookahead_if_symbol(&mut self) -> TokenizerResult<Option<char>> { + Ok(match self.lookahead()? { + Some(&Token::Symbol(c)) => Some(c), + _ => None, + }) + } + + pub fn lookahead_is_symbol(&mut self, symbol: char) -> TokenizerResult<bool> { + Ok(self.lookahead_if_symbol()? == Some(symbol)) + } + + pub fn lookahead_is_ident(&mut self, ident: &str) -> TokenizerResult<bool> { + Ok(match self.lookahead()? 
{ + Some(Token::Ident(i)) => i == ident, + _ => false, + }) + } + + pub fn next_ident(&mut self) -> TokenizerResult<String> { + self.next_token_check_map(|token| match token { + &Token::Ident(ref ident) => Ok(ident.clone()), + _ => Err(TokenizerError::ExpectIdent), + }) + } + + pub fn next_str_lit(&mut self) -> TokenizerResult<StrLit> { + self.next_token_check_map(|token| match token { + &Token::StrLit(ref str_lit) => Ok(str_lit.clone()), + _ => Err(TokenizerError::ExpectStrLit), + }) + } + + pub fn next_int_lit(&mut self) -> TokenizerResult<u64> { + self.next_token_check_map(|token| match token { + &Token::IntLit(v) => Ok(v), + _ => Err(TokenizerError::ExpectIntLit), + }) + } + + pub fn next_float_lit(&mut self) -> TokenizerResult<f64> { + self.next_token_check_map(|token| match token { + &Token::FloatLit(v) => Ok(v), + _ => Err(TokenizerError::ExpectFloatLit), + }) + } +} + +#[cfg(test)] +mod test { + + use super::*; + + fn tokenize<P, R>(input: &str, what: P) -> R + where + P: FnOnce(&mut Tokenizer) -> TokenizerResult<R>, + { + let mut tokenizer = Tokenizer::new(input, ParserLanguage::Proto); + let r = what(&mut tokenizer).expect(&format!("parse failed at {}", tokenizer.loc())); + let eof = tokenizer + .syntax_eof() + .expect(&format!("check eof failed at {}", tokenizer.loc())); + assert!(eof, "{}", tokenizer.loc()); + r + } + + #[test] + fn test_ident() { + let msg = r#" aabb_c "#; + let mess = tokenize(msg, |p| p.next_ident().map(|s| s.to_owned())); + assert_eq!("aabb_c", mess); + } + + #[test] + fn test_str_lit() { + let msg = r#" "a\nb" "#; + let mess = tokenize(msg, |p| p.next_str_lit()); + assert_eq!( + StrLit { + escaped: r#"a\nb"#.to_owned() + }, + mess + ); + } +} |