Diffstat (limited to 'src/lexer/tokenizer.rs')
-rw-r--r--  src/lexer/tokenizer.rs  330
1 file changed, 330 insertions, 0 deletions
diff --git a/src/lexer/tokenizer.rs b/src/lexer/tokenizer.rs
new file mode 100644
index 0000000..c5e84a0
--- /dev/null
+++ b/src/lexer/tokenizer.rs
@@ -0,0 +1,330 @@
+use crate::lexer::lexer_impl::Lexer;
+use crate::lexer::lexer_impl::LexerError;
+use crate::lexer::loc::Loc;
+use crate::lexer::parser_language::ParserLanguage;
+use crate::lexer::str_lit::StrLit;
+use crate::lexer::str_lit::StrLitDecodeError;
+use crate::lexer::token::Token;
+use crate::lexer::token::TokenWithLocation;
+
+#[derive(Debug, thiserror::Error)]
+pub enum TokenizerError {
+ #[error(transparent)]
+ LexerError(#[from] LexerError),
+ #[error(transparent)]
+ StrLitDecodeError(#[from] StrLitDecodeError),
+ #[error("Internal tokenizer error")]
+ InternalError,
+ // TODO: too broad
+ #[error("Incorrect input")]
+ IncorrectInput,
+ #[error("Not allowed in this context: {0}")]
+ NotAllowedInThisContext(&'static str),
+ #[error("Unexpected end of input")]
+ UnexpectedEof,
+ #[error("Expecting string literal")]
+ ExpectStrLit,
+ #[error("Expecting int literal")]
+ ExpectIntLit,
+ #[error("Expecting float literal")]
+ ExpectFloatLit,
+ #[error("Expecting identifier")]
+ ExpectIdent,
+ #[error("Expecting identifier `{}`", .0)]
+ ExpectNamedIdent(String),
+ #[error("While parsing {}, expecting char `{}`", .1, .0)]
+ ExpectChar(char, &'static str),
+ #[error("Expecting any char of: {}", .0.iter().map(|c| format!("`{}`", c)).collect::<Vec<_>>().join(", "))]
+ ExpectAnyChar(Vec<char>),
+}
+
+pub type TokenizerResult<R> = Result<R, TokenizerError>;
+
+#[derive(Clone)]
+pub struct Tokenizer<'a> {
+ lexer: Lexer<'a>,
+ next_token: Option<TokenWithLocation>,
+ last_token_loc: Option<Loc>,
+}
+
+impl<'a> Tokenizer<'a> {
+ pub fn new(input: &'a str, comment_style: ParserLanguage) -> Tokenizer<'a> {
+ Tokenizer {
+ lexer: Lexer::new(input, comment_style),
+ next_token: None,
+ last_token_loc: None,
+ }
+ }
+
+ pub fn loc(&self) -> Loc {
+        // After a lookahead, return the location of the buffered next token
+ self.next_token
+ .as_ref()
+ .map(|t| t.loc.clone())
+            // After a token has been consumed, return the location of that token
+ .or(self.last_token_loc.clone())
+            // Otherwise, return the current position of the lexer
+ .unwrap_or(self.lexer.loc)
+ }
+
+ pub fn lookahead_loc(&mut self) -> Loc {
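+        // Force a lookahead so that `loc` reports the position of the next token.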
+ drop(self.lookahead());
+ // TODO: does not handle EOF properly
+ self.loc()
+ }
+
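+    /// Peek the next token, reading it from the lexer into the one-token buffer if needed.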
+ fn lookahead(&mut self) -> TokenizerResult<Option<&Token>> {
+ Ok(match self.next_token {
+ Some(ref token) => Some(&token.token),
+ None => {
+ self.next_token = self.lexer.next_token()?;
+ self.last_token_loc = self.next_token.as_ref().map(|t| t.loc.clone());
+ match self.next_token {
+ Some(ref token) => Some(&token.token),
+ None => None,
+ }
+ }
+ })
+ }
+
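+    /// Peek the next token; end of input is an error.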
+ pub fn lookahead_some(&mut self) -> TokenizerResult<&Token> {
+ match self.lookahead()? {
+ Some(token) => Ok(token),
+ None => Err(TokenizerError::UnexpectedEof),
+ }
+ }
+
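+    /// Consume and return the next token, or `None` at end of input.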
+ fn next(&mut self) -> TokenizerResult<Option<Token>> {
+ self.lookahead()?;
+ Ok(self
+ .next_token
+ .take()
+ .map(|TokenWithLocation { token, .. }| token))
+ }
+
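+    /// Consume and return the next token; end of input is an error.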
+ pub fn next_some(&mut self) -> TokenizerResult<Token> {
+ match self.next()? {
+ Some(token) => Ok(token),
+ None => Err(TokenizerError::UnexpectedEof),
+ }
+ }
+
+    /// Can be called only after a successful lookahead; otherwise it is an error.
+ pub fn advance(&mut self) -> TokenizerResult<Token> {
+ self.next_token
+ .take()
+ .map(|TokenWithLocation { token, .. }| token)
+ .ok_or(TokenizerError::InternalError)
+ }
+
+    /// Returns `true` when there are no more tokens in the input.
+ pub fn syntax_eof(&mut self) -> TokenizerResult<bool> {
+ Ok(self.lookahead()?.is_none())
+ }
+
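+    /// Consume the next token only if `p` maps it to `Some`.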
+ pub fn next_token_if_map<P, R>(&mut self, p: P) -> TokenizerResult<Option<R>>
+ where
+ P: FnOnce(&Token) -> Option<R>,
+ {
+ self.lookahead()?;
+ let v = match self.next_token {
+ Some(ref token) => match p(&token.token) {
+ Some(v) => v,
+ None => return Ok(None),
+ },
+ _ => return Ok(None),
+ };
+ self.next_token = None;
+ Ok(Some(v))
+ }
+
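+    /// Consume the next token and map it with `p`; end of input is an error.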
+ pub fn next_token_check_map<P, R, E>(&mut self, p: P) -> Result<R, E>
+ where
+ P: FnOnce(&Token) -> Result<R, E>,
+ E: From<TokenizerError>,
+ {
+ self.lookahead()?;
+ let r = match self.next_token {
+ Some(ref token) => p(&token.token)?,
+ None => return Err(TokenizerError::UnexpectedEof.into()),
+ };
+ self.next_token = None;
+ Ok(r)
+ }
+
+ fn next_token_if<P>(&mut self, p: P) -> TokenizerResult<Option<Token>>
+ where
+ P: FnOnce(&Token) -> bool,
+ {
+ self.next_token_if_map(|token| if p(token) { Some(token.clone()) } else { None })
+ }
+
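+    /// Consume the next identifier if it is one of `idents`.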
+ pub fn next_ident_if_in(&mut self, idents: &[&str]) -> TokenizerResult<Option<String>> {
+ let v = match self.lookahead()? {
+ Some(&Token::Ident(ref next)) => {
+                if idents.iter().any(|&i| i == next) {
+ next.clone()
+ } else {
+ return Ok(None);
+ }
+ }
+ _ => return Ok(None),
+ };
+ self.advance()?;
+ Ok(Some(v))
+ }
+
+ pub fn next_ident_if_eq(&mut self, word: &str) -> TokenizerResult<bool> {
+        Ok(self.next_ident_if_in(&[word])?.is_some())
+ }
+
+ pub fn next_ident_expect_eq(&mut self, word: &str) -> TokenizerResult<()> {
+ if self.next_ident_if_eq(word)? {
+ Ok(())
+ } else {
+ Err(TokenizerError::ExpectNamedIdent(word.to_owned()))
+ }
+ }
+
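+    /// Error if the next token is the identifier `word`; the check is done
+    /// on a clone of the tokenizer, so no token is consumed.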
+ pub fn next_ident_if_eq_error(&mut self, word: &'static str) -> TokenizerResult<()> {
+ if self.clone().next_ident_if_eq(word)? {
+ // TODO: which context?
+ return Err(TokenizerError::NotAllowedInThisContext(word));
+ }
+ Ok(())
+ }
+
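+    /// Consume the next token if it is the symbol `symbol`.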
+    pub fn next_symbol_if_eq(&mut self, symbol: char) -> TokenizerResult<bool> {
+        Ok(self
+            .next_token_if(|token| match token {
+                &Token::Symbol(c) if c == symbol => true,
+                _ => false,
+            })?
+            .is_some())
+    }
+
+ pub fn next_symbol_expect_eq(
+ &mut self,
+ symbol: char,
+ desc: &'static str,
+ ) -> TokenizerResult<()> {
+ if self.lookahead_is_symbol(symbol)? {
+ self.advance()?;
+ Ok(())
+ } else {
+ Err(TokenizerError::ExpectChar(symbol, desc))
+ }
+ }
+
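+    /// Consume the next symbol if it is one of `symbols`, otherwise error.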
+ pub fn next_symbol_expect_eq_oneof(&mut self, symbols: &[char]) -> TokenizerResult<char> {
+ for symbol in symbols {
+ if let Ok(()) = self.next_symbol_expect_eq(*symbol, "ignored") {
+ return Ok(*symbol);
+ }
+ }
+ Err(TokenizerError::ExpectAnyChar(symbols.to_owned()))
+ }
+
+ pub fn lookahead_is_str_lit(&mut self) -> TokenizerResult<bool> {
+ Ok(match self.lookahead()? {
+ Some(&Token::StrLit(..)) => true,
+ _ => false,
+ })
+ }
+
+ pub fn lookahead_is_int_lit(&mut self) -> TokenizerResult<bool> {
+ Ok(match self.lookahead()? {
+ Some(&Token::IntLit(..)) => true,
+ _ => false,
+ })
+ }
+
+ pub fn lookahead_is_json_number(&mut self) -> TokenizerResult<bool> {
+ Ok(match self.lookahead()? {
+ Some(&Token::JsonNumber(..)) => true,
+ _ => false,
+ })
+ }
+
+ pub fn lookahead_if_symbol(&mut self) -> TokenizerResult<Option<char>> {
+ Ok(match self.lookahead()? {
+ Some(&Token::Symbol(c)) => Some(c),
+ _ => None,
+ })
+ }
+
+ pub fn lookahead_is_symbol(&mut self, symbol: char) -> TokenizerResult<bool> {
+ Ok(self.lookahead_if_symbol()? == Some(symbol))
+ }
+
+ pub fn lookahead_is_ident(&mut self, ident: &str) -> TokenizerResult<bool> {
+ Ok(match self.lookahead()? {
+ Some(Token::Ident(i)) => i == ident,
+ _ => false,
+ })
+ }
+
+ pub fn next_ident(&mut self) -> TokenizerResult<String> {
+ self.next_token_check_map(|token| match token {
+ &Token::Ident(ref ident) => Ok(ident.clone()),
+ _ => Err(TokenizerError::ExpectIdent),
+ })
+ }
+
+ pub fn next_str_lit(&mut self) -> TokenizerResult<StrLit> {
+ self.next_token_check_map(|token| match token {
+ &Token::StrLit(ref str_lit) => Ok(str_lit.clone()),
+ _ => Err(TokenizerError::ExpectStrLit),
+ })
+ }
+
+ pub fn next_int_lit(&mut self) -> TokenizerResult<u64> {
+ self.next_token_check_map(|token| match token {
+ &Token::IntLit(v) => Ok(v),
+ _ => Err(TokenizerError::ExpectIntLit),
+ })
+ }
+
+ pub fn next_float_lit(&mut self) -> TokenizerResult<f64> {
+ self.next_token_check_map(|token| match token {
+ &Token::FloatLit(v) => Ok(v),
+ _ => Err(TokenizerError::ExpectFloatLit),
+ })
+ }
+}
+
+#[cfg(test)]
+mod test {
+
+ use super::*;
+
+ fn tokenize<P, R>(input: &str, what: P) -> R
+ where
+ P: FnOnce(&mut Tokenizer) -> TokenizerResult<R>,
+ {
+ let mut tokenizer = Tokenizer::new(input, ParserLanguage::Proto);
+ let r = what(&mut tokenizer).expect(&format!("parse failed at {}", tokenizer.loc()));
+ let eof = tokenizer
+ .syntax_eof()
+ .expect(&format!("check eof failed at {}", tokenizer.loc()));
+ assert!(eof, "{}", tokenizer.loc());
+ r
+ }
+
+ #[test]
+ fn test_ident() {
+ let msg = r#" aabb_c "#;
+        let mess = tokenize(msg, |p| p.next_ident());
+ assert_eq!("aabb_c", mess);
+ }
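+
+    // Illustrative test, not part of the original change: exercises the
+    // lookahead/advance contract (`advance` is valid only after a successful
+    // lookahead). Like `test_ident` above, it relies on the lexer emitting
+    // `Token::Ident` for a bare identifier.
+    #[test]
+    fn test_lookahead_advance() {
+        let mut tokenizer = Tokenizer::new(" foo ", ParserLanguage::Proto);
+        assert!(tokenizer.lookahead_is_ident("foo").unwrap());
+        match tokenizer.advance().unwrap() {
+            Token::Ident(ident) => assert_eq!("foo", ident),
+            _ => panic!("expected ident"),
+        }
+        assert!(tokenizer.syntax_eof().unwrap());
+    }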
+
+ #[test]
+ fn test_str_lit() {
+ let msg = r#" "a\nb" "#;
+ let mess = tokenize(msg, |p| p.next_str_lit());
+ assert_eq!(
+ StrLit {
+ escaped: r#"a\nb"#.to_owned()
+ },
+ mess
+ );
+ }
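+
+    // Illustrative test, not part of the original change: assumes the lexer
+    // produces `Token::IntLit` for a bare decimal integer in proto syntax.
+    #[test]
+    fn test_int_lit() {
+        let msg = r#" 10 "#;
+        let i = tokenize(msg, |p| p.next_int_lit());
+        assert_eq!(10, i);
+    }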
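+
+    // Illustrative test, not part of the original change: assumes the lexer
+    // produces `Token::FloatLit` for a literal with a decimal point.
+    #[test]
+    fn test_float_lit() {
+        let msg = r#" 1.5 "#;
+        let f = tokenize(msg, |p| p.next_float_lit());
+        assert_eq!(1.5, f);
+    }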
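+
+    // Illustrative test, not part of the original change: assumes punctuation
+    // such as `;` is lexed as `Token::Symbol`.
+    #[test]
+    fn test_symbol() {
+        let msg = r#" ; "#;
+        tokenize(msg, |p| {
+            assert!(p.lookahead_is_symbol(';')?);
+            p.next_symbol_expect_eq(';', "test")
+        });
+    }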
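+
+    // Illustrative test, not part of the original change: `next_ident_if_in`
+    // consumes the identifier only when it is in the given set.
+    #[test]
+    fn test_ident_if_in() {
+        let msg = r#" optional "#;
+        let w = tokenize(msg, |p| p.next_ident_if_in(&["required", "optional"]));
+        assert_eq!(Some("optional".to_owned()), w);
+    }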
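+
+    // Illustrative test, not part of the original change: assumes the lexer
+    // yields no token for empty input, so the tokenizer is immediately at EOF
+    // and `next_some` reports `UnexpectedEof`.
+    #[test]
+    fn test_eof() {
+        let mut tokenizer = Tokenizer::new("", ParserLanguage::Proto);
+        assert!(tokenizer.syntax_eof().unwrap());
+        assert!(matches!(
+            tokenizer.next_some(),
+            Err(TokenizerError::UnexpectedEof)
+        ));
+    }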
+}