/* * Copyright 2016 Google Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.turbine.parse; import static com.google.common.base.Verify.verify; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.turbine.parse.UnicodeEscapePreprocessor.ASCII_SUB; import static java.lang.Math.min; import com.google.common.collect.ImmutableList; import com.google.turbine.diag.SourceFile; import com.google.turbine.diag.TurbineError; import com.google.turbine.diag.TurbineError.ErrorKind; import org.jspecify.nullness.Nullable; /** A {@link Lexer} that streams input from a {@link UnicodeEscapePreprocessor}. */ public class StreamLexer implements Lexer { private final UnicodeEscapePreprocessor reader; /** The current input character. */ private int ch; /** The start position of the current token. */ private int position; /** The start position of the current numeric literal or identifier token. */ private int readFrom; /** The value of the current string or character literal token. */ private String value = null; /** A saved javadoc comment. */ private String javadoc = null; public StreamLexer(UnicodeEscapePreprocessor reader) { this.reader = reader; eat(); } /** Records the value of a literal. */ private void saveValue(String value) { this.value = value; } /** Records the start position of a literal. */ private void readFrom() { value = null; readFrom = reader.position(); } /** Consumes an input character. */ private void eat() { ch = reader.next(); } @Override public @Nullable String javadoc() { String result = javadoc; javadoc = null; if (result == null) { return null; } verify(result.endsWith("*/"), result); return result.substring(0, result.length() - "*/".length()); } @Override public String stringValue() { if (value != null) { return value; } return reader.readString(readFrom, reader.position()); } @Override public int position() { return position; } @Override public SourceFile source() { return reader.source(); } @Override public Token next() { OUTER: while (true) { position = reader.position(); switch (ch) { case '\r': case '\n': case ' ': case '\t': case '\f': eat(); continue OUTER; case '/': { eat(); switch (ch) { case '/': while (true) { eat(); switch (ch) { case '\n': case '\r': eat(); continue OUTER; case ASCII_SUB: if (reader.done()) { return Token.EOF; } eat(); break; default: // fall out } } case '*': eat(); boolean sawStar = false; boolean isJavadoc = false; if (ch == '*') { eat(); // handle empty non-javadoc comments: `/**/` if (ch == '/') { eat(); continue OUTER; } isJavadoc = true; readFrom(); } while (true) { switch (ch) { case '*': eat(); sawStar = true; break; case '/': eat(); if (sawStar) { if (isJavadoc) { // Save the comment, excluding the leading `/**` and including // the trailing `/*`. The comment is trimmed and normalized later. javadoc = stringValue(); } continue OUTER; } sawStar = false; break; case ASCII_SUB: if (reader.done()) { throw TurbineError.format( reader.source(), position, ErrorKind.UNCLOSED_COMMENT); } eat(); sawStar = false; break; default: eat(); sawStar = false; break; } } default: if (ch == '=') { eat(); return Token.DIVEQ; } return Token.DIV; } } case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_': case '$': return identifier(); case ASCII_SUB: if (!reader.done()) { throw error(ErrorKind.UNEXPECTED_EOF); } return Token.EOF; case '-': case '=': case '>': case '<': case '!': case '~': case '+': case '?': case ':': case '*': case '&': case '|': case '^': case '%': return operator(); case '(': eat(); return Token.LPAREN; case ')': eat(); return Token.RPAREN; case '{': eat(); return Token.LBRACE; case '}': eat(); return Token.RBRACE; case '[': eat(); return Token.LBRACK; case ']': eat(); return Token.RBRACK; case ';': eat(); return Token.SEMI; case ',': eat(); return Token.COMMA; case '@': eat(); return Token.AT; // what about frac, etc.? case '0': { readFrom(); eat(); switch (ch) { case 'x': case 'X': eat(); return hexLiteral(); case 'b': case 'B': eat(); return boolLiteral(); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '_': return octalLiteral(); case '.': eat(); return floatLiteral(); case 'f': case 'F': eat(); return Token.FLOAT_LITERAL; case 'd': case 'D': eat(); return Token.DOUBLE_LITERAL; case 'l': case 'L': eat(); return Token.LONG_LITERAL; default: return Token.INT_LITERAL; } } case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': readFrom(); return decimalLiteral(); case '.': { readFrom(); eat(); switch (ch) { case '.': { eat(); if (ch == '.') { eat(); return Token.ELLIPSIS; } else { throw inputError(); } } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return floatLiteral(); default: return Token.DOT; } } case '\'': { eat(); char value; switch (ch) { case '\\': eat(); value = escape(); break; case '\'': throw error(ErrorKind.EMPTY_CHARACTER_LITERAL); default: value = (char) ch; eat(); } if (ch == '\'') { saveValue(String.valueOf(value)); eat(); return Token.CHAR_LITERAL; } throw error(ErrorKind.UNTERMINATED_CHARACTER_LITERAL); } case '"': { eat(); if (ch == '"') { eat(); if (ch != '"') { saveValue(""); return Token.STRING_LITERAL; } eat(); return textBlock(); } readFrom(); StringBuilder sb = new StringBuilder(); STRING: while (true) { switch (ch) { case '\\': eat(); sb.append(escape()); continue STRING; case '"': saveValue(sb.toString()); eat(); return Token.STRING_LITERAL; case '\n': throw error(ErrorKind.UNTERMINATED_STRING); case ASCII_SUB: if (reader.done()) { return Token.EOF; } // falls through default: sb.appendCodePoint(ch); eat(); continue STRING; } } } default: if (Character.isJavaIdentifierStart(ch)) { // TODO(cushon): the style guide disallows non-ascii identifiers return identifier(); } throw inputError(); } } } private Token textBlock() { OUTER: while (true) { switch (ch) { case ' ': case '\r': case '\t': eat(); break; default: break OUTER; } } switch (ch) { case '\r': eat(); if (ch == '\n') { eat(); } break; case '\n': eat(); break; default: throw inputError(); } readFrom(); StringBuilder sb = new StringBuilder(); while (true) { switch (ch) { case '"': eat(); if (ch != '"') { sb.append("\""); continue; } eat(); if (ch != '"') { sb.append("\"\""); continue; } eat(); String value = sb.toString(); value = stripIndent(value); value = translateEscapes(value); saveValue(value); return Token.STRING_LITERAL; case ASCII_SUB: if (reader.done()) { return Token.EOF; } // falls through default: sb.appendCodePoint(ch); eat(); continue; } } } static String stripIndent(String value) { if (value.isEmpty()) { return value; } ImmutableList lines = value.lines().collect(toImmutableList()); // the amount of whitespace to strip from the beginning of every line int strip = Integer.MAX_VALUE; char last = value.charAt(value.length() - 1); boolean trailingNewline = last == '\n' || last == '\r'; if (trailingNewline) { // If the input contains a trailing newline, we have something like: // // |String s = """ // | foo // |"""; // // Because the final """ is unindented, nothing should be stripped. strip = 0; } else { // find the longest common prefix of whitespace across all non-blank lines for (int i = 0; i < lines.size(); i++) { String line = lines.get(i); int nonWhitespaceStart = nonWhitespaceStart(line); if (nonWhitespaceStart == line.length()) { continue; } strip = min(strip, nonWhitespaceStart); } } StringBuilder result = new StringBuilder(); boolean first = true; for (String line : lines) { if (!first) { result.append('\n'); } int end = trailingWhitespaceStart(line); if (strip <= end) { result.append(line, strip, end); } first = false; } if (trailingNewline) { result.append('\n'); } return result.toString(); } private static int nonWhitespaceStart(String value) { int i = 0; while (i < value.length() && Character.isWhitespace(value.charAt(i))) { i++; } return i; } private static int trailingWhitespaceStart(String value) { int i = value.length() - 1; while (i >= 0 && Character.isWhitespace(value.charAt(i))) { i--; } return i + 1; } private static String translateEscapes(String value) { StreamLexer lexer = new StreamLexer(new UnicodeEscapePreprocessor(new SourceFile(null, value + ASCII_SUB))); return lexer.translateEscapes(); } private String translateEscapes() { readFrom(); StringBuilder sb = new StringBuilder(); OUTER: while (true) { switch (ch) { case '\\': eat(); sb.append(escape()); continue; case ASCII_SUB: break OUTER; default: sb.appendCodePoint(ch); eat(); continue; } } return sb.toString(); } private char escape() { boolean zeroToThree = false; switch (ch) { case 'b': eat(); return '\b'; case 't': eat(); return '\t'; case 'n': eat(); return '\n'; case 'f': eat(); return '\f'; case 'r': eat(); return '\r'; case '"': eat(); return '\"'; case '\'': eat(); return '\''; case '\\': eat(); return '\\'; case '0': case '1': case '2': case '3': zeroToThree = true; // falls through case '4': case '5': case '6': case '7': { char value = (char) (ch - '0'); eat(); switch (ch) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { value = (char) ((value << 3) | (ch - '0')); eat(); if (zeroToThree) { switch (ch) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': value = (char) ((value << 3) | (ch - '0')); eat(); return value; default: return value; } } } // fall through default: return value; } } default: throw inputError(); } } private Token decimalLiteral() { readDigits(); switch (ch) { case 'e': case 'E': return floatLiteral(); case '.': eat(); return floatLiteral(); case 'f': case 'F': eat(); return Token.FLOAT_LITERAL; case 'd': case 'D': eat(); return Token.DOUBLE_LITERAL; case 'l': case 'L': eat(); return Token.LONG_LITERAL; default: return Token.INT_LITERAL; } } private Token hexFloatLiteral() { readHexDigits(); switch (ch) { case 'p': case 'P': eat(); signedInteger(); break; default: // fall out } return floatTypeSuffix(); } private Token floatLiteral() { if ('0' <= ch && ch <= '9') { readDigits(); } switch (ch) { case 'e': case 'E': eat(); signedInteger(); break; default: // fall out } return floatTypeSuffix(); } private Token floatTypeSuffix() { switch (ch) { case 'd': case 'D': eat(); return Token.DOUBLE_LITERAL; case 'f': case 'F': eat(); return Token.FLOAT_LITERAL; default: return Token.DOUBLE_LITERAL; } } private void signedInteger() { switch (ch) { case '-': case '+': eat(); break; default: break; } readDigits(); } private void readHexDigits() { switch (ch) { case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': eat(); break; default: throw inputError(); } OUTER: while (true) { switch (ch) { case '_': { do { eat(); } while (ch == '_'); switch (ch) { case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': continue OUTER; default: throw inputError(); } } case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': eat(); break; default: return; } } } private void readDigits() { if ('0' <= ch && ch <= '9') { eat(); } else { throw inputError(); } OUTER: while (true) { switch (ch) { case '_': do { eat(); } while (ch == '_'); if ('0' <= ch && ch <= '9') { continue OUTER; } else { throw inputError(); } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': eat(); continue OUTER; default: return; } } } private Token boolLiteral() { readBinaryDigits(); switch (ch) { case 'l': case 'L': eat(); return Token.LONG_LITERAL; default: return Token.INT_LITERAL; } } private void readBinaryDigits() { switch (ch) { case '0': case '1': eat(); break; default: throw inputError(); } OUTER: while (true) { switch (ch) { case '_': do { eat(); } while (ch == '_'); switch (ch) { case '0': case '1': continue OUTER; default: throw inputError(); } case '0': case '1': eat(); continue OUTER; default: return; } } } private Token octalLiteral() { readOctalDigits(); switch (ch) { case 'l': case 'L': eat(); return Token.LONG_LITERAL; default: return Token.INT_LITERAL; } } private void readOctalDigits() { switch (ch) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '_': eat(); break; default: throw inputError(); } OUTER: while (true) { switch (ch) { case '_': do { eat(); } while (ch == '_'); switch (ch) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': continue OUTER; default: throw inputError(); } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': eat(); continue OUTER; default: return; } } } private Token hexLiteral() { readHexDigits(); switch (ch) { case '.': eat(); return hexFloatLiteral(); case 'l': case 'L': eat(); return Token.LONG_LITERAL; case 'p': case 'P': eat(); signedInteger(); return floatTypeSuffix(); default: return Token.INT_LITERAL; } } private Token operator() { switch (ch) { case '=': eat(); if (ch == '=') { eat(); return Token.EQ; } else { return Token.ASSIGN; } case '>': eat(); switch (ch) { case '=': eat(); return Token.GTE; case '>': eat(); switch (ch) { case '>': eat(); if (ch == '=') { eat(); return Token.GTGTGTE; } else { return Token.GTGTGT; } case '=': eat(); return Token.GTGTE; default: return Token.GTGT; } default: return Token.GT; } case '<': eat(); switch (ch) { case '=': eat(); return Token.LTE; case '<': eat(); if (ch == '=') { eat(); return Token.LTLTE; } else { return Token.LTLT; } default: return Token.LT; } case '!': eat(); if (ch == '=') { eat(); return Token.NOTEQ; } else { return Token.NOT; } case '~': eat(); return Token.TILDE; case '?': eat(); return Token.COND; case ':': eat(); if (ch == ':') { eat(); return Token.COLONCOLON; } else { return Token.COLON; } case '-': eat(); switch (ch) { case '>': eat(); return Token.ARROW; case '-': eat(); return Token.DECR; case '=': eat(); return Token.MINUSEQ; default: return Token.MINUS; } case '&': eat(); switch (ch) { case '&': eat(); return Token.ANDAND; case '=': eat(); return Token.ANDEQ; default: return Token.AND; } case '|': eat(); switch (ch) { case '=': eat(); return Token.OREQ; case '|': eat(); return Token.OROR; default: return Token.OR; } case '+': eat(); switch (ch) { case '+': eat(); return Token.INCR; case '=': eat(); return Token.PLUSEQ; default: return Token.PLUS; } case '*': eat(); if (ch == '=') { eat(); return Token.MULTEQ; } else { return Token.MULT; } case '/': // handled with comments throw inputError(); case '%': eat(); if (ch == '=') { eat(); return Token.MODEQ; } else { return Token.MOD; } case '^': eat(); if (ch == '=') { eat(); return Token.XOREQ; } else { return Token.XOR; } default: throw inputError(); } } private Token identifier() { readFrom(); eat(); // TODO(cushon): the style guide disallows non-ascii identifiers while (Character.isJavaIdentifierPart(ch)) { if (ch == ASCII_SUB && reader.done()) { break; } eat(); } return makeIdent(stringValue()); } private static Token makeIdent(String s) { switch (s) { case "abstract": return Token.ABSTRACT; case "assert": return Token.ASSERT; case "boolean": return Token.BOOLEAN; case "break": return Token.BREAK; case "byte": return Token.BYTE; case "case": return Token.CASE; case "catch": return Token.CATCH; case "char": return Token.CHAR; case "class": return Token.CLASS; case "const": return Token.CONST; case "continue": return Token.CONTINUE; case "default": return Token.DEFAULT; case "do": return Token.DO; case "double": return Token.DOUBLE; case "else": return Token.ELSE; case "enum": return Token.ENUM; case "extends": return Token.EXTENDS; case "final": return Token.FINAL; case "finally": return Token.FINALLY; case "float": return Token.FLOAT; case "for": return Token.FOR; case "goto": return Token.GOTO; case "if": return Token.IF; case "implements": return Token.IMPLEMENTS; case "import": return Token.IMPORT; case "instanceof": return Token.INSTANCEOF; case "int": return Token.INT; case "interface": return Token.INTERFACE; case "long": return Token.LONG; case "native": return Token.NATIVE; case "new": return Token.NEW; case "package": return Token.PACKAGE; case "private": return Token.PRIVATE; case "protected": return Token.PROTECTED; case "public": return Token.PUBLIC; case "return": return Token.RETURN; case "short": return Token.SHORT; case "static": return Token.STATIC; case "strictfp": return Token.STRICTFP; case "super": return Token.SUPER; case "switch": return Token.SWITCH; case "synchronized": return Token.SYNCHRONIZED; case "this": return Token.THIS; case "throw": return Token.THROW; case "throws": return Token.THROWS; case "transient": return Token.TRANSIENT; case "try": return Token.TRY; case "void": return Token.VOID; case "volatile": return Token.VOLATILE; case "while": return Token.WHILE; case "true": return Token.TRUE; case "false": return Token.FALSE; case "null": return Token.NULL; default: return Token.IDENT; } } private TurbineError inputError() { return error( ErrorKind.UNEXPECTED_INPUT, Character.isBmpCodePoint(ch) ? Character.toString((char) ch) : String.format("U+%X", ch)); } private TurbineError error(ErrorKind kind, Object... args) { return TurbineError.format(reader.source(), reader.position(), kind, args); } }