aboutsummaryrefslogtreecommitdiff
path: root/syntax/scan.go
diff options
context:
space:
mode:
author alandonovan <adonovan@google.com> 2019-01-04 13:48:12 -0500
committer GitHub <noreply@github.com> 2019-01-04 13:48:12 -0500
commit 30e71c6b16e7cb4257d5436cb317ee1e989f1774 (patch)
tree 12973d93ac3a8722fe92d52f149b15cc2abbd199 /syntax/scan.go
parent 9d9777168d883df01c5a74800b807e5315fb9850 (diff)
download starlark-go-30e71c6b16e7cb4257d5436cb317ee1e989f1774.tar.gz
syntax: improve REPL parsing (#98)
Previously, the REPL used a heuristic: it would consume a single line and attempt to parse it; if that failed, it would consume lines up to a blank line then parse the whole as a file. This was suboptimal for various reasons: it failed to parse lines ending with an unfinished multi-line string literal, for example, and it would prematurely stop reading even while parentheses were open. This change integrates the REPL with the scanner and parser (as Python does). The REPL invokes a new parser entry point, ParseCompoundStmt, that consumes only enough input to parse a compound statement, defined as (a) a blank line, (b) a semicolon-separated list of simple statements all on one line, or (c) a complex statement such as def, if or for. If the 'src' value provided to the scanner is a function of type func() ([]byte, error), then the scanner will call it each time it runs out of input. Fixes #81
Diffstat (limited to 'syntax/scan.go')
-rw-r--r-- syntax/scan.go 147
1 files changed, 109 insertions, 38 deletions
diff --git a/syntax/scan.go b/syntax/scan.go
index 7c16f82..af24bce 100644
--- a/syntax/scan.go
+++ b/syntax/scan.go
@@ -7,6 +7,7 @@ package syntax
// A lexical scanner for Starlark.
import (
+ "bytes"
"fmt"
"io"
"io/ioutil"
@@ -231,8 +232,7 @@ func (p Position) isBefore(q Position) bool {
// A scanner represents a single input file being parsed.
type scanner struct {
- complete []byte // entire input
- rest []byte // rest of input
+ rest []byte // rest of input (in REPL, a line of input)
token []byte // token being scanned
pos Position // current input position
depth int // nesting of [ ] { } ( )
@@ -242,21 +242,26 @@ type scanner struct {
keepComments bool // accumulate comments in slice
lineComments []Comment // list of full line comments (if keepComments)
suffixComments []Comment // list of suffix comments (if keepComments)
+
+ readline func() ([]byte, error) // read next line of input (REPL only)
}
func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) {
- data, err := readSource(filename, src)
- if err != nil {
- return nil, err
- }
- return &scanner{
- complete: data,
- rest: data,
+ sc := &scanner{
pos: Position{file: &filename, Line: 1, Col: 1},
indentstk: make([]int, 1, 10), // []int{0} + spare capacity
lineStart: true,
keepComments: keepComments,
- }, nil
+ }
+ sc.readline, _ = src.(func() ([]byte, error)) // REPL only
+ if sc.readline == nil {
+ data, err := readSource(filename, src)
+ if err != nil {
+ return nil, err
+ }
+ sc.rest = data
+ }
+ return sc, nil
}
func readSource(filename string, src interface{}) ([]byte, error) {
@@ -316,13 +321,28 @@ func (sc *scanner) recover(err *error) {
// eof reports whether the input has reached end of file.
func (sc *scanner) eof() bool {
- return len(sc.rest) == 0
+ return len(sc.rest) == 0 && !sc.readLine()
+}
+
+// readLine attempts to read another line of input.
+// Precondition: len(sc.rest)==0.
+func (sc *scanner) readLine() bool {
+ if sc.readline != nil {
+ var err error
+ sc.rest, err = sc.readline()
+ if err != nil {
+ sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt
+ }
+ return len(sc.rest) > 0
+ }
+ return false
}
// peekRune returns the next rune in the input without consuming it.
// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
func (sc *scanner) peekRune() rune {
- if len(sc.rest) == 0 {
+ // TODO(adonovan): opt: measure and perhaps inline eof.
+ if sc.eof() {
return 0
}
@@ -341,9 +361,16 @@ func (sc *scanner) peekRune() rune {
// readRune consumes and returns the next rune in the input.
// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
func (sc *scanner) readRune() rune {
+ // eof() has been inlined here, both to avoid a call
+ // and to establish len(rest)>0 to avoid a bounds check.
if len(sc.rest) == 0 {
- sc.error(sc.pos, "internal scanner error: readRune at EOF")
- return 0 // unreachable but eliminates bounds-check below
+ if !sc.readLine() {
+ sc.error(sc.pos, "internal scanner error: readRune at EOF")
+ }
+ // Redundant, but eliminates the bounds-check below.
+ if len(sc.rest) == 0 {
+ return 0
+ }
}
// fast path: ASCII
@@ -520,11 +547,26 @@ start:
// newline
if c == '\n' {
sc.lineStart = true
- if blank || sc.depth > 0 {
- // Ignore blank lines, or newlines within expressions (common case).
+
+ // Ignore newlines within expressions (common case).
+ if sc.depth > 0 {
sc.readRune()
goto start
}
+
+ // Ignore blank lines, except in the REPL,
+ // where they emit OUTDENTs and NEWLINE.
+ if blank {
+ if sc.readline == nil {
+ sc.readRune()
+ goto start
+ } else if len(sc.indentstk) > 1 {
+ sc.dents = 1 - len(sc.indentstk)
+ sc.indentstk = sc.indentstk[:1]
+ goto start
+ }
+ }
+
// At top-level (not in an expression).
sc.startToken(val)
sc.readRune()
@@ -759,37 +801,66 @@ func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
start := sc.pos
triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote)
sc.readRune()
- if triple {
- sc.readRune()
- sc.readRune()
- }
-
- quoteCount := 0
- for {
- if sc.eof() {
- sc.error(val.pos, "unexpected EOF in string")
- }
- c := sc.readRune()
- if c == '\n' && !triple {
- sc.error(val.pos, "unexpected newline in string")
- }
- if c == quote {
- quoteCount++
- if !triple || quoteCount == 3 {
+ if !triple {
+ // Precondition: startToken was already called.
+ for {
+ if sc.eof() {
+ sc.error(val.pos, "unexpected EOF in string")
+ }
+ c := sc.readRune()
+ if c == quote {
break
}
- } else {
- quoteCount = 0
+ if c == '\n' {
+ sc.error(val.pos, "unexpected newline in string")
+ }
+ if c == '\\' {
+ if sc.eof() {
+ sc.error(val.pos, "unexpected EOF in string")
+ }
+ sc.readRune()
+ }
}
- if c == '\\' {
+ sc.endToken(val)
+ } else {
+ // triple-quoted string literal
+ sc.readRune()
+ sc.readRune()
+
+ // A triple-quoted string literal may span multiple
+ // gulps of REPL input; it is the only such token.
+ // Thus we must avoid {start,end}Token.
+ var raw bytes.Buffer
+
+ // Copy the prefix, e.g. r''' or """ (see startToken).
+ raw.Write(sc.token[:len(sc.token)-len(sc.rest)])
+
+ quoteCount := 0
+ for {
if sc.eof() {
sc.error(val.pos, "unexpected EOF in string")
}
- sc.readRune()
+ c := sc.readRune()
+ raw.WriteRune(c)
+ if c == quote {
+ quoteCount++
+ if quoteCount == 3 {
+ break
+ }
+ } else {
+ quoteCount = 0
+ }
+ if c == '\\' {
+ if sc.eof() {
+ sc.error(val.pos, "unexpected EOF in string")
+ }
+ c = sc.readRune()
+ raw.WriteRune(c)
+ }
}
+ val.raw = raw.String()
}
- sc.endToken(val)
s, _, err := unquote(val.raw)
if err != nil {
sc.error(start, err.Error())