aboutsummaryrefslogtreecommitdiff
path: root/syntax/scan.go
diff options
context:
space:
mode:
Diffstat (limited to 'syntax/scan.go')
-rw-r--r--syntax/scan.go1123
1 files changed, 1123 insertions, 0 deletions
diff --git a/syntax/scan.go b/syntax/scan.go
new file mode 100644
index 0000000..bb4165e
--- /dev/null
+++ b/syntax/scan.go
@@ -0,0 +1,1123 @@
+// Copyright 2017 The Bazel Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+// A lexical scanner for Starlark.
+
+import (
+ "fmt"
+ "io"
+ "io/ioutil"
+ "log"
+ "math/big"
+ "os"
+ "strconv"
+ "strings"
+ "unicode"
+ "unicode/utf8"
+)
+
// A Token represents a Starlark lexical token.
type Token int8

const (
	ILLEGAL Token = iota
	EOF

	NEWLINE
	INDENT
	OUTDENT

	// Tokens with values
	IDENT  // x
	INT    // 123
	FLOAT  // 1.23e45
	STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"
	BYTES  // b"foo", etc

	// Punctuation
	//
	// NOTE: GoString relies on the punctuation tokens forming the
	// contiguous range PLUS..STARSTAR; keep them together.
	PLUS          // +
	MINUS         // -
	STAR          // *
	SLASH         // /
	SLASHSLASH    // //
	PERCENT       // %
	AMP           // &
	PIPE          // |
	CIRCUMFLEX    // ^
	LTLT          // <<
	GTGT          // >>
	TILDE         // ~
	DOT           // .
	COMMA         // ,
	EQ            // =
	SEMI          // ;
	COLON         // :
	LPAREN        // (
	RPAREN        // )
	LBRACK        // [
	RBRACK        // ]
	LBRACE        // {
	RBRACE        // }
	LT            // <
	GT            // >
	GE            // >=
	LE            // <=
	EQL           // ==
	NEQ           // !=
	PLUS_EQ       // += (keep order consistent with PLUS..GTGT)
	MINUS_EQ      // -=
	STAR_EQ       // *=
	SLASH_EQ      // /=
	SLASHSLASH_EQ // //=
	PERCENT_EQ    // %=
	AMP_EQ        // &=
	PIPE_EQ       // |=
	CIRCUMFLEX_EQ // ^=
	LTLT_EQ       // <<=
	GTGT_EQ       // >>=
	STARSTAR      // **

	// Keywords
	AND
	BREAK
	CONTINUE
	DEF
	ELIF
	ELSE
	FOR
	IF
	IN
	LAMBDA
	LOAD
	NOT
	NOT_IN // synthesized by parser from NOT IN
	OR
	PASS
	RETURN
	WHILE

	maxToken // sentinel: one past the last token; not a real token
)
+
+func (tok Token) String() string { return tokenNames[tok] }
+
+// GoString is like String but quotes punctuation tokens.
+// Use Sprintf("%#v", tok) when constructing error messages.
+func (tok Token) GoString() string {
+ if tok >= PLUS && tok <= STARSTAR {
+ return "'" + tokenNames[tok] + "'"
+ }
+ return tokenNames[tok]
+}
+
+var tokenNames = [...]string{
+ ILLEGAL: "illegal token",
+ EOF: "end of file",
+ NEWLINE: "newline",
+ INDENT: "indent",
+ OUTDENT: "outdent",
+ IDENT: "identifier",
+ INT: "int literal",
+ FLOAT: "float literal",
+ STRING: "string literal",
+ PLUS: "+",
+ MINUS: "-",
+ STAR: "*",
+ SLASH: "/",
+ SLASHSLASH: "//",
+ PERCENT: "%",
+ AMP: "&",
+ PIPE: "|",
+ CIRCUMFLEX: "^",
+ LTLT: "<<",
+ GTGT: ">>",
+ TILDE: "~",
+ DOT: ".",
+ COMMA: ",",
+ EQ: "=",
+ SEMI: ";",
+ COLON: ":",
+ LPAREN: "(",
+ RPAREN: ")",
+ LBRACK: "[",
+ RBRACK: "]",
+ LBRACE: "{",
+ RBRACE: "}",
+ LT: "<",
+ GT: ">",
+ GE: ">=",
+ LE: "<=",
+ EQL: "==",
+ NEQ: "!=",
+ PLUS_EQ: "+=",
+ MINUS_EQ: "-=",
+ STAR_EQ: "*=",
+ SLASH_EQ: "/=",
+ SLASHSLASH_EQ: "//=",
+ PERCENT_EQ: "%=",
+ AMP_EQ: "&=",
+ PIPE_EQ: "|=",
+ CIRCUMFLEX_EQ: "^=",
+ LTLT_EQ: "<<=",
+ GTGT_EQ: ">>=",
+ STARSTAR: "**",
+ AND: "and",
+ BREAK: "break",
+ CONTINUE: "continue",
+ DEF: "def",
+ ELIF: "elif",
+ ELSE: "else",
+ FOR: "for",
+ IF: "if",
+ IN: "in",
+ LAMBDA: "lambda",
+ LOAD: "load",
+ NOT: "not",
+ NOT_IN: "not in",
+ OR: "or",
+ PASS: "pass",
+ RETURN: "return",
+ WHILE: "while",
+}
+
// A FilePortion describes the content of a portion of a file.
// Callers may provide a FilePortion for the src argument of Parse
// when the desired initial line and column numbers are not (1, 1),
// such as when an expression is parsed from within a larger file.
type FilePortion struct {
	Content             []byte // the source text of the portion
	FirstLine, FirstCol int32  // 1-based position of Content[0] within the whole file
}
+
// A Position describes the location of a rune of input.
//
// The zero Position is invalid (file == nil); see IsValid.
type Position struct {
	file *string // filename (indirect for compactness)
	Line int32   // 1-based line number; 0 if line unknown
	Col  int32   // 1-based column (rune) number; 0 if column unknown
}
+
+// IsValid reports whether the position is valid.
+func (p Position) IsValid() bool { return p.file != nil }
+
+// Filename returns the name of the file containing this position.
+func (p Position) Filename() string {
+ if p.file != nil {
+ return *p.file
+ }
+ return "<invalid>"
+}
+
+// MakePosition returns position with the specified components.
+func MakePosition(file *string, line, col int32) Position { return Position{file, line, col} }
+
+// add returns the position at the end of s, assuming it starts at p.
+func (p Position) add(s string) Position {
+ if n := strings.Count(s, "\n"); n > 0 {
+ p.Line += int32(n)
+ s = s[strings.LastIndex(s, "\n")+1:]
+ p.Col = 1
+ }
+ p.Col += int32(utf8.RuneCountInString(s))
+ return p
+}
+
+func (p Position) String() string {
+ file := p.Filename()
+ if p.Line > 0 {
+ if p.Col > 0 {
+ return fmt.Sprintf("%s:%d:%d", file, p.Line, p.Col)
+ }
+ return fmt.Sprintf("%s:%d", file, p.Line)
+ }
+ return file
+}
+
+func (p Position) isBefore(q Position) bool {
+ if p.Line != q.Line {
+ return p.Line < q.Line
+ }
+ return p.Col < q.Col
+}
+
// A scanner represents a single input file being parsed.
type scanner struct {
	rest           []byte    // rest of input (in REPL, a line of input)
	token          []byte    // token being scanned (suffix of input at startToken)
	pos            Position  // current input position
	depth          int       // nesting of [ ] { } ( ); newlines are ignored while depth > 0
	indentstk      []int     // stack of indentation levels; indentstk[0] is always 0
	dents          int       // number of saved INDENT (>0) or OUTDENT (<0) tokens to return
	lineStart      bool      // after NEWLINE; convert spaces to indentation tokens
	keepComments   bool      // accumulate comments in slice
	lineComments   []Comment // list of full line comments (if keepComments)
	suffixComments []Comment // list of suffix comments (if keepComments)

	readline func() ([]byte, error) // read next line of input (REPL only)
}
+
+func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) {
+ var firstLine, firstCol int32 = 1, 1
+ if portion, ok := src.(FilePortion); ok {
+ firstLine, firstCol = portion.FirstLine, portion.FirstCol
+ }
+ sc := &scanner{
+ pos: MakePosition(&filename, firstLine, firstCol),
+ indentstk: make([]int, 1, 10), // []int{0} + spare capacity
+ lineStart: true,
+ keepComments: keepComments,
+ }
+ sc.readline, _ = src.(func() ([]byte, error)) // ParseCompoundStmt (REPL) only
+ if sc.readline == nil {
+ data, err := readSource(filename, src)
+ if err != nil {
+ return nil, err
+ }
+ sc.rest = data
+ }
+ return sc, nil
+}
+
+func readSource(filename string, src interface{}) ([]byte, error) {
+ switch src := src.(type) {
+ case string:
+ return []byte(src), nil
+ case []byte:
+ return src, nil
+ case io.Reader:
+ data, err := ioutil.ReadAll(src)
+ if err != nil {
+ err = &os.PathError{Op: "read", Path: filename, Err: err}
+ return nil, err
+ }
+ return data, nil
+ case FilePortion:
+ return src.Content, nil
+ case nil:
+ return ioutil.ReadFile(filename)
+ default:
+ return nil, fmt.Errorf("invalid source: %T", src)
+ }
+}
+
// An Error describes the nature and position of a scanner or parser error.
type Error struct {
	Pos Position // position at which the error was detected
	Msg string   // description of the problem
}

// Error implements the error interface, rendering as "pos: msg".
func (e Error) Error() string { return e.Pos.String() + ": " + e.Msg }
+
// error reports an error at the specified position.
// It does not return: it panics with an Error, which is
// converted back into an error result by (*scanner).recover.
func (sc *scanner) error(pos Position, s string) {
	panic(Error{pos, s})
}

// errorf is like error but formats its message in the manner of fmt.Sprintf.
func (sc *scanner) errorf(pos Position, format string, args ...interface{}) {
	sc.error(pos, fmt.Sprintf(format, args...))
}
+
// recover is deferred by entry points to convert a scanning or
// parsing panic into an error result stored in *err.
//
// The scanner and parser panic both for routine errors like
// syntax errors and for programmer bugs like array index
// errors. Turn both into error returns. Catching bug panics
// is especially important when processing many files.
func (sc *scanner) recover(err *error) {
	switch e := recover().(type) {
	case nil:
		// no panic
	case Error:
		// deliberate panic from sc.error/sc.errorf
		*err = e
	default:
		// unexpected panic: a bug; report it at the current position
		*err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)}
		if debug {
			log.Fatal(*err)
		}
	}
}
+
+// eof reports whether the input has reached end of file.
+func (sc *scanner) eof() bool {
+ return len(sc.rest) == 0 && !sc.readLine()
+}
+
+// readLine attempts to read another line of input.
+// Precondition: len(sc.rest)==0.
+func (sc *scanner) readLine() bool {
+ if sc.readline != nil {
+ var err error
+ sc.rest, err = sc.readline()
+ if err != nil {
+ sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt
+ }
+ return len(sc.rest) > 0
+ }
+ return false
+}
+
+// peekRune returns the next rune in the input without consuming it.
+// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
+func (sc *scanner) peekRune() rune {
+ // TODO(adonovan): opt: measure and perhaps inline eof.
+ if sc.eof() {
+ return 0
+ }
+
+ // fast path: ASCII
+ if b := sc.rest[0]; b < utf8.RuneSelf {
+ if b == '\r' {
+ return '\n'
+ }
+ return rune(b)
+ }
+
+ r, _ := utf8.DecodeRune(sc.rest)
+ return r
+}
+
// readRune consumes and returns the next rune in the input,
// updating sc.pos accordingly.
// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
// Precondition: not at EOF (callers must check with peekRune/eof first).
func (sc *scanner) readRune() rune {
	// eof() has been inlined here, both to avoid a call
	// and to establish len(rest)>0 to avoid a bounds check.
	if len(sc.rest) == 0 {
		if !sc.readLine() {
			sc.error(sc.pos, "internal scanner error: readRune at EOF")
		}
		// Redundant, but eliminates the bounds-check below.
		if len(sc.rest) == 0 {
			return 0
		}
	}

	// fast path: ASCII
	if b := sc.rest[0]; b < utf8.RuneSelf {
		r := rune(b)
		sc.rest = sc.rest[1:]
		if r == '\r' {
			// Fold "\r\n" (DOS) and "\r" (old Mac) into '\n'.
			if len(sc.rest) > 0 && sc.rest[0] == '\n' {
				sc.rest = sc.rest[1:]
			}
			r = '\n'
		}
		if r == '\n' {
			sc.pos.Line++
			sc.pos.Col = 1
		} else {
			sc.pos.Col++
		}
		return r
	}

	// multi-byte UTF-8 sequence; counts as a single column
	r, size := utf8.DecodeRune(sc.rest)
	sc.rest = sc.rest[size:]
	sc.pos.Col++
	return r
}
+
// tokenValue records the position and value associated with each token.
// Only the fields relevant to the token's kind are meaningful.
type tokenValue struct {
	raw    string   // raw text of token
	int    int64    // decoded int (INT tokens that fit in int64)
	bigInt *big.Int // decoded integers > int64 (nil otherwise)
	float  float64  // decoded float (FLOAT tokens)
	string string   // decoded string or bytes (STRING/BYTES tokens)
	pos    Position // start position of token
}
+
// startToken marks the beginning of the next input token.
// It must be followed by a call to endToken once the token has
// been consumed using readRune.
func (sc *scanner) startToken(val *tokenValue) {
	// Remember the unconsumed input; endToken computes the raw token
	// text as the prefix of sc.token consumed since this call.
	sc.token = sc.rest
	val.raw = ""
	val.pos = sc.pos
}
+
+// endToken marks the end of an input token.
+// It records the actual token string in val.raw if the caller
+// has not done that already.
+func (sc *scanner) endToken(val *tokenValue) {
+ if val.raw == "" {
+ val.raw = string(sc.token[:len(sc.token)-len(sc.rest)])
+ }
+}
+
// nextToken is called by the parser to obtain the next input token.
// It returns the token value and sets val to the data associated with
// the token.
//
// For all our input tokens, the associated data is val.pos (the
// position where the token begins), val.raw (the input string
// corresponding to the token). For string and int tokens, the string
// and int fields additionally contain the token's interpreted value.
func (sc *scanner) nextToken(val *tokenValue) Token {

	// The following distribution of tokens guides case ordering:
	//
	// COMMA 27 %
	// STRING 23 %
	// IDENT 15 %
	// EQL 11 %
	// LBRACK 5.5 %
	// RBRACK 5.5 %
	// NEWLINE 3 %
	// LPAREN 2.9 %
	// RPAREN 2.9 %
	// INT 2 %
	// others < 1 %
	//
	// Although NEWLINE tokens are infrequent, and lineStart is
	// usually (~97%) false on entry, skipped newlines account for
	// about 50% of all iterations of the 'start' loop.

start:
	var c rune

	// Deal with leading spaces and indentation.
	blank := false
	savedLineStart := sc.lineStart // remembered for the EOF case below
	if sc.lineStart {
		sc.lineStart = false
		col := 0
		// Measure leading whitespace in columns (tabs round up to
		// the next multiple of 8).
		for {
			c = sc.peekRune()
			if c == ' ' {
				col++
				sc.readRune()
			} else if c == '\t' {
				const tab = 8
				col += int(tab - (sc.pos.Col-1)%tab)
				sc.readRune()
			} else {
				break
			}
		}

		// The third clause matches EOF.
		if c == '#' || c == '\n' || c == 0 {
			blank = true
		}

		// Compute indentation level for non-blank lines not
		// inside an expression. This is not the common case.
		if !blank && sc.depth == 0 {
			cur := sc.indentstk[len(sc.indentstk)-1]
			if col > cur {
				// indent
				sc.dents++
				sc.indentstk = append(sc.indentstk, col)
			} else if col < cur {
				// outdent(s)
				for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] {
					sc.dents--
					sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop
				}
				if col != sc.indentstk[len(sc.indentstk)-1] {
					sc.error(sc.pos, "unindent does not match any outer indentation level")
				}
			}
		}
	}

	// Return saved indentation tokens.
	if sc.dents != 0 {
		sc.startToken(val)
		sc.endToken(val)
		if sc.dents < 0 {
			sc.dents++
			return OUTDENT
		} else {
			sc.dents--
			return INDENT
		}
	}

	// start of line proper
	c = sc.peekRune()

	// Skip spaces.
	for c == ' ' || c == '\t' {
		sc.readRune()
		c = sc.peekRune()
	}

	// comment
	if c == '#' {
		if sc.keepComments {
			sc.startToken(val)
		}
		// Consume up to newline (included).
		for c != 0 && c != '\n' {
			sc.readRune()
			c = sc.peekRune()
		}
		if sc.keepComments {
			sc.endToken(val)
			// A comment on an otherwise-blank line is a "line comment";
			// one following code on the same line is a "suffix comment".
			if blank {
				sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw})
			} else {
				sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw})
			}
		}
	}

	// newline
	if c == '\n' {
		sc.lineStart = true

		// Ignore newlines within expressions (common case).
		if sc.depth > 0 {
			sc.readRune()
			goto start
		}

		// Ignore blank lines, except in the REPL,
		// where they emit OUTDENTs and NEWLINE.
		if blank {
			if sc.readline == nil {
				sc.readRune()
				goto start
			} else if len(sc.indentstk) > 1 {
				sc.dents = 1 - len(sc.indentstk)
				sc.indentstk = sc.indentstk[:1]
				goto start
			}
		}

		// At top-level (not in an expression).
		sc.startToken(val)
		sc.readRune()
		val.raw = "\n"
		return NEWLINE
	}

	// end of file
	if c == 0 {
		// Emit OUTDENTs for unfinished indentation,
		// preceded by a NEWLINE if we haven't just emitted one.
		if len(sc.indentstk) > 1 {
			if savedLineStart {
				sc.dents = 1 - len(sc.indentstk)
				sc.indentstk = sc.indentstk[:1]
				goto start
			} else {
				sc.lineStart = true
				sc.startToken(val)
				val.raw = "\n"
				return NEWLINE
			}
		}

		sc.startToken(val)
		sc.endToken(val)
		return EOF
	}

	// line continuation
	if c == '\\' {
		sc.readRune()
		if sc.peekRune() != '\n' {
			sc.errorf(sc.pos, "stray backslash in program")
		}
		sc.readRune()
		goto start
	}

	// start of the next token
	sc.startToken(val)

	// comma (common case)
	if c == ',' {
		sc.readRune()
		sc.endToken(val)
		return COMMA
	}

	// string literal
	if c == '"' || c == '\'' {
		return sc.scanString(val, c)
	}

	// identifier or keyword
	if isIdentStart(c) {
		// Check for string-literal prefixes before consuming an identifier.
		if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
			// r"..."
			// b"..."
			sc.readRune()
			c = sc.peekRune()
			return sc.scanString(val, c)
		} else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') {
			// rb"..."
			sc.readRune()
			sc.readRune()
			c = sc.peekRune()
			return sc.scanString(val, c)
		}

		for isIdent(c) {
			sc.readRune()
			c = sc.peekRune()
		}
		sc.endToken(val)
		if k, ok := keywordToken[val.raw]; ok {
			return k
		}

		return IDENT
	}

	// brackets
	switch c {
	case '[', '(', '{':
		sc.depth++
		sc.readRune()
		sc.endToken(val)
		switch c {
		case '[':
			return LBRACK
		case '(':
			return LPAREN
		case '{':
			return LBRACE
		}
		panic("unreachable")

	case ']', ')', '}':
		if sc.depth == 0 {
			sc.errorf(sc.pos, "unexpected %q", c)
		} else {
			sc.depth--
		}
		sc.readRune()
		sc.endToken(val)
		switch c {
		case ']':
			return RBRACK
		case ')':
			return RPAREN
		case '}':
			return RBRACE
		}
		panic("unreachable")
	}

	// int or float literal, or period
	if isdigit(c) || c == '.' {
		return sc.scanNumber(val, c)
	}

	// other punctuation
	defer sc.endToken(val)
	switch c {
	case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '='
		start := sc.pos
		sc.readRune()
		if sc.peekRune() == '=' {
			sc.readRune()
			switch c {
			case '<':
				return LE
			case '>':
				return GE
			case '=':
				return EQL
			case '!':
				return NEQ
			case '+':
				return PLUS_EQ
			case '-':
				return MINUS_EQ
			case '/':
				return SLASH_EQ
			case '%':
				return PERCENT_EQ
			case '&':
				return AMP_EQ
			case '|':
				return PIPE_EQ
			case '^':
				return CIRCUMFLEX_EQ
			}
		}
		switch c {
		case '=':
			return EQ
		case '<':
			if sc.peekRune() == '<' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return LTLT_EQ
				} else {
					return LTLT
				}
			}
			return LT
		case '>':
			if sc.peekRune() == '>' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return GTGT_EQ
				} else {
					return GTGT
				}
			}
			return GT
		case '!':
			// '!' is valid only as part of '!='.
			sc.error(start, "unexpected input character '!'")
		case '+':
			return PLUS
		case '-':
			return MINUS
		case '/':
			if sc.peekRune() == '/' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return SLASHSLASH_EQ
				} else {
					return SLASHSLASH
				}
			}
			return SLASH
		case '%':
			return PERCENT
		case '&':
			return AMP
		case '|':
			return PIPE
		case '^':
			return CIRCUMFLEX
		}
		panic("unreachable")

	case ':', ';', '~': // single-char tokens (except comma)
		sc.readRune()
		switch c {
		case ':':
			return COLON
		case ';':
			return SEMI
		case '~':
			return TILDE
		}
		panic("unreachable")

	case '*': // possibly followed by '*' or '='
		sc.readRune()
		switch sc.peekRune() {
		case '*':
			sc.readRune()
			return STARSTAR
		case '=':
			sc.readRune()
			return STAR_EQ
		}
		return STAR
	}

	sc.errorf(sc.pos, "unexpected input character %#q", c)
	panic("unreachable")
}
+
// scanString scans a string or bytes literal delimited by quote
// (one of ' or "). On entry the opening quote has been peeked but
// not consumed; any r/b/rb prefix has already been consumed and
// recorded via startToken. It returns STRING or BYTES and stores
// the decoded value in val.string.
func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
	start := sc.pos
	triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote)
	sc.readRune()

	// String literals may contain escaped or unescaped newlines,
	// causing them to span multiple lines (gulps) of REPL input;
	// they are the only such token. Thus we cannot call endToken,
	// as it assumes sc.rest is unchanged since startToken.
	// Instead, buffer the token here.
	// TODO(adonovan): opt: buffer only if we encounter a newline.
	raw := new(strings.Builder)

	// Copy the prefix, e.g. r' or " (see startToken).
	raw.Write(sc.token[:len(sc.token)-len(sc.rest)])

	if !triple {
		// single-quoted string literal
		for {
			if sc.eof() {
				sc.error(val.pos, "unexpected EOF in string")
			}
			c := sc.readRune()
			raw.WriteRune(c)
			if c == quote {
				break
			}
			if c == '\n' {
				sc.error(val.pos, "unexpected newline in string")
			}
			if c == '\\' {
				// Consume the escaped character blindly; unquote
				// validates the escape sequence later.
				if sc.eof() {
					sc.error(val.pos, "unexpected EOF in string")
				}
				c = sc.readRune()
				raw.WriteRune(c)
			}
		}
	} else {
		// triple-quoted string literal; consume the two extra quotes
		sc.readRune()
		raw.WriteRune(quote)
		sc.readRune()
		raw.WriteRune(quote)

		// The literal ends at the first run of three quotes.
		quoteCount := 0
		for {
			if sc.eof() {
				sc.error(val.pos, "unexpected EOF in string")
			}
			c := sc.readRune()
			raw.WriteRune(c)
			if c == quote {
				quoteCount++
				if quoteCount == 3 {
					break
				}
			} else {
				quoteCount = 0
			}
			if c == '\\' {
				if sc.eof() {
					sc.error(val.pos, "unexpected EOF in string")
				}
				c = sc.readRune()
				raw.WriteRune(c)
			}
		}
	}
	val.raw = raw.String()

	// Decode the literal (handles escapes and the r/b/rb prefixes).
	s, _, isByte, err := unquote(val.raw)
	if err != nil {
		sc.error(start, err.Error())
	}
	val.string = s
	if isByte {
		return BYTES
	} else {
		return STRING
	}
}
+
// scanNumber scans an INT or FLOAT literal, or a lone DOT.
// On entry c is the first rune (a digit or '.'), peeked but not
// consumed. The decoded value is stored in val.int/val.bigInt
// (INT) or val.float (FLOAT).
func (sc *scanner) scanNumber(val *tokenValue, c rune) Token {
	// https://github.com/google/starlark-go/blob/master/doc/spec.md#lexical-elements
	//
	// Python features not supported:
	// - integer literals of >64 bits of precision
	// - 123L or 123l long suffix
	// - traditional octal: 0755
	// https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals

	start := sc.pos
	fraction, exponent := false, false

	if c == '.' {
		// dot or start of fraction
		sc.readRune()
		c = sc.peekRune()
		if !isdigit(c) {
			sc.endToken(val)
			return DOT
		}
		fraction = true
	} else if c == '0' {
		// hex, octal, binary or float
		sc.readRune()
		c = sc.peekRune()

		if c == '.' {
			fraction = true
		} else if c == 'x' || c == 'X' {
			// hex
			sc.readRune()
			c = sc.peekRune()
			if !isxdigit(c) {
				sc.error(start, "invalid hex literal")
			}
			for isxdigit(c) {
				sc.readRune()
				c = sc.peekRune()
			}
		} else if c == 'o' || c == 'O' {
			// octal
			sc.readRune()
			c = sc.peekRune()
			if !isodigit(c) {
				sc.error(sc.pos, "invalid octal literal")
			}
			for isodigit(c) {
				sc.readRune()
				c = sc.peekRune()
			}
		} else if c == 'b' || c == 'B' {
			// binary
			sc.readRune()
			c = sc.peekRune()
			if !isbdigit(c) {
				sc.error(sc.pos, "invalid binary literal")
			}
			for isbdigit(c) {
				sc.readRune()
				c = sc.peekRune()
			}
		} else {
			// float (or obsolete octal "0755")
			allzeros, octal := true, true
			for isdigit(c) {
				if c != '0' {
					allzeros = false
				}
				if c > '7' {
					octal = false
				}
				sc.readRune()
				c = sc.peekRune()
			}
			if c == '.' {
				fraction = true
			} else if c == 'e' || c == 'E' {
				exponent = true
			} else if octal && !allzeros {
				// e.g. 0755: reject with a suggestion of the 0o form.
				sc.endToken(val)
				sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:])
			}
		}
	} else {
		// decimal
		for isdigit(c) {
			sc.readRune()
			c = sc.peekRune()
		}

		if c == '.' {
			fraction = true
		} else if c == 'e' || c == 'E' {
			exponent = true
		}
	}

	if fraction {
		sc.readRune() // consume '.'
		c = sc.peekRune()
		for isdigit(c) {
			sc.readRune()
			c = sc.peekRune()
		}

		if c == 'e' || c == 'E' {
			exponent = true
		}
	}

	if exponent {
		sc.readRune() // consume [eE]
		c = sc.peekRune()
		if c == '+' || c == '-' {
			sc.readRune()
			c = sc.peekRune()
			if !isdigit(c) {
				sc.error(sc.pos, "invalid float literal")
			}
		}
		for isdigit(c) {
			sc.readRune()
			c = sc.peekRune()
		}
	}

	sc.endToken(val)
	if fraction || exponent {
		var err error
		val.float, err = strconv.ParseFloat(val.raw, 64)
		if err != nil {
			sc.error(sc.pos, "invalid float literal")
		}
		return FLOAT
	} else {
		var err error
		s := val.raw
		val.bigInt = nil
		// ParseInt does not grok the 0o/0b prefixes, so strip them
		// and pass an explicit base.
		if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') {
			val.int, err = strconv.ParseInt(s[2:], 8, 64)
		} else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') {
			val.int, err = strconv.ParseInt(s[2:], 2, 64)
		} else {
			val.int, err = strconv.ParseInt(s, 0, 64)
			if err != nil {
				// Overflowed int64: fall back to a big.Int.
				num := new(big.Int)
				var ok bool
				val.bigInt, ok = num.SetString(s, 0)
				if ok {
					err = nil
				}
			}
		}
		if err != nil {
			sc.error(start, "invalid int literal")
		}
		return INT
	}
}
+
+// isIdent reports whether c is an identifier rune.
+func isIdent(c rune) bool {
+ return isdigit(c) || isIdentStart(c)
+}
+
// isIdentStart reports whether c may begin an identifier:
// an ASCII letter, an underscore, or any Unicode letter.
func isIdentStart(c rune) bool {
	switch {
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', c == '_':
		// fast path: ASCII
		return true
	default:
		return unicode.IsLetter(c)
	}
}
+
// Character-class predicates for numeric literals.
func isdigit(c rune) bool  { return '0' <= c && c <= '9' }                                    // decimal
func isodigit(c rune) bool { return '0' <= c && c <= '7' }                                    // octal
func isxdigit(c rune) bool { return isdigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' } // hex
func isbdigit(c rune) bool { return c == '0' || c == '1' }                                    // binary
+
// keywordToken records the special tokens for
// strings that should not be treated as ordinary identifiers.
// Reserved words map to ILLEGAL so that using one reports an error.
var keywordToken = map[string]Token{
	"and":      AND,
	"break":    BREAK,
	"continue": CONTINUE,
	"def":      DEF,
	"elif":     ELIF,
	"else":     ELSE,
	"for":      FOR,
	"if":       IF,
	"in":       IN,
	"lambda":   LAMBDA,
	"load":     LOAD,
	"not":      NOT,
	"or":       OR,
	"pass":     PASS,
	"return":   RETURN,
	"while":    WHILE,

	// reserved words:
	"as": ILLEGAL,
	// "assert": ILLEGAL, // heavily used by our tests
	"class":    ILLEGAL,
	"del":      ILLEGAL,
	"except":   ILLEGAL,
	"finally":  ILLEGAL,
	"from":     ILLEGAL,
	"global":   ILLEGAL,
	"import":   ILLEGAL,
	"is":       ILLEGAL,
	"nonlocal": ILLEGAL,
	"raise":    ILLEGAL,
	"try":      ILLEGAL,
	"with":     ILLEGAL,
	"yield":    ILLEGAL,
}