syntax: improve REPL parsing (#98)

Previously, the REPL used a heuristic: it would consume a single line and attempt to parse it; if that failed, it would consume lines up to a blank line then parse the whole as a file. This was suboptimal for various reasons: it failed to parse lines ending with an unfinished multi-line string literal, for example, and it would prematurely stop reading even while parentheses were open. This change integrates the REPL with the scanner and parser (as Python does). The REPL invokes a new parser entry point, ParseCompoundStmt, that consumes only enough input to parse a compound statement, defined as (a) blank line, (b) a semicolon-separated list of simple statements all on one line, or (c) a complex statement such as def, if or for. If the 'src' value provided to the scanner is a function of type func() ([]byte, error), then the scanner will call it each time it runs out of input. Fixes #81
author: alandonovan <adonovan@google.com> 2019-01-04 13:48:12 -0500
committer: GitHub <noreply@github.com> 2019-01-04 13:48:12 -0500
commit: 30e71c6b16e7cb4257d5436cb317ee1e989f1774 (patch)
tree: 12973d93ac3a8722fe92d52f149b15cc2abbd199 /syntax
parent: 9d9777168d883df01c5a74800b807e5315fb9850 (diff)
download: starlark-go-30e71c6b16e7cb4257d5436cb317ee1e989f1774.tar.gz
5 files changed, 250 insertions, 43 deletions
diff --git a/syntax/parse.go b/syntax/parse.go
index af79cc1..2361085 100644
--- a/syntax/parse.go
+++ b/syntax/parse.go
@@ -47,6 +47,38 @@ func Parse(filename string, src interface{}, mode Mode) (f *File, err error) {
 	return f, nil
 }
 
+// ParseCompoundStmt parses a single compound statement:
+// a blank line, a def, for, while, or if statement, or a
+// semicolon-separated list of simple statements followed
+// by a newline. These are the units on which the REPL operates.
+// ParseCompoundStmt does not consume any following input.
+// The parser calls the readline function each
+// time it needs a new line of input.
+func ParseCompoundStmt(filename string, readline func() ([]byte, error)) (f *File, err error) {
+	in, err := newScanner(filename, readline, false)
+	if err != nil {
+		return nil, err
+	}
+
+	p := parser{in: in}
+	defer p.in.recover(&err)
+
+	p.nextToken() // read first lookahead token
+
+	var stmts []Stmt
+	switch p.tok {
+	case DEF, IF, FOR, WHILE:
+		stmts = p.parseStmt(stmts)
+	case NEWLINE:
+		// blank line
+	default:
+		// Don't consume newline, to avoid blocking again.
+		stmts = p.parseSimpleStmt(stmts, false)
+	}
+
+	return &File{Path: filename, Stmts: stmts}, nil
+}
+
 // ParseExpr parses a Starlark expression.
 // See Parse for explanation of parameters.
 func ParseExpr(filename string, src interface{}, mode Mode) (expr Expr, err error) {
@@ -58,6 +90,10 @@ func ParseExpr(filename string, src interface{}, mode Mode) (expr Expr, err erro
 	defer p.in.recover(&err)
 
 	p.nextToken() // read first lookahead token
+
+	// TODO(adonovan): Python's eval would use the equivalent of
+	// parseExpr here, which permits an unparenthesized tuple.
+	// We should too.
 	expr = p.parseTest()
 
 	// A following newline (e.g. "f()\n") appears outside any brackets,
@@ -114,7 +150,7 @@ func (p *parser) parseStmt(stmts []Stmt) []Stmt {
 	} else if p.tok == WHILE {
 		return append(stmts, p.parseWhileStmt())
 	}
-	return p.parseSimpleStmt(stmts)
+	return p.parseSimpleStmt(stmts, true)
 }
 
 func (p *parser) parseDefStmt() Stmt {
@@ -219,7 +255,8 @@ func (p *parser) parseForLoopVariables() Expr {
 }
 
 // simple_stmt = small_stmt (SEMI small_stmt)* SEMI? NEWLINE
-func (p *parser) parseSimpleStmt(stmts []Stmt) []Stmt {
+// In REPL mode, it does not consume the NEWLINE.
+func (p *parser) parseSimpleStmt(stmts []Stmt, consumeNL bool) []Stmt {
 	for {
 		stmts = append(stmts, p.parseSmallStmt())
 		if p.tok != SEMI {
@@ -231,9 +268,10 @@ func (p *parser) parseSimpleStmt(stmts []Stmt) []Stmt {
 		}
 	}
 	// EOF without NEWLINE occurs in `if x: pass`, for example.
-	if p.tok != EOF {
+	if p.tok != EOF && consumeNL {
 		p.consume(NEWLINE)
 	}
+
 	return stmts
 }
 
@@ -355,7 +393,7 @@ func (p *parser) parseSuite() []Stmt {
 		return stmts
 	}
 
-	return p.parseSimpleStmt(nil)
+	return p.parseSimpleStmt(nil, true)
 }
 
 func (p *parser) parseIdent() *Ident {
diff --git a/syntax/parse_test.go b/syntax/parse_test.go
index e4cb9f4..58eccc7 100644
--- a/syntax/parse_test.go
+++ b/syntax/parse_test.go
@@ -5,8 +5,12 @@
 package syntax_test
 
 import (
+	"bufio"
 	"bytes"
 	"fmt"
+	"go/build"
+	"io/ioutil"
+	"path/filepath"
 	"reflect"
 	"strings"
 	"testing"
@@ -175,6 +179,14 @@ else:
 def h():
 	pass`,
 			`(DefStmt Name=f Function=(Function Body=((DefStmt Name=g Function=(Function Body=((BranchStmt Token=pass)))) (BranchStmt Token=pass))))`},
+		{"f();g()",
+			`(ExprStmt X=(CallExpr Fn=f))`},
+		{"f();",
+			`(ExprStmt X=(CallExpr Fn=f))`},
+		{"f();g()\n",
+			`(ExprStmt X=(CallExpr Fn=f))`},
+		{"f();\n",
+			`(ExprStmt X=(CallExpr Fn=f))`},
 	} {
 		f, err := syntax.Parse("foo.star", test.input, 0)
 		if err != nil {
@@ -243,6 +255,67 @@ pass`,
 	}
 }
 
+// TestCompoundStmt tests handling of REPL-style compound statements.
+func TestCompoundStmt(t *testing.T) {
+	for _, test := range []struct {
+		input, want string
+	}{
+		// blank lines
+		{"\n",
+			``},
+		{"   \n",
+			``},
+		{"# comment\n",
+			``},
+		// simple statement
+		{"1\n",
+			`(ExprStmt X=1)`},
+		{"print(1)\n",
+			`(ExprStmt X=(CallExpr Fn=print Args=(1)))`},
+		{"1;2;3;\n",
+			`(ExprStmt X=1)(ExprStmt X=2)(ExprStmt X=3)`},
+		{"f();g()\n",
+			`(ExprStmt X=(CallExpr Fn=f))(ExprStmt X=(CallExpr Fn=g))`},
+		{"f();\n",
+			`(ExprStmt X=(CallExpr Fn=f))`},
+		{"f(\n\n\n\n\n\n\n)\n",
+			`(ExprStmt X=(CallExpr Fn=f))`},
+		// complex statements
+		{"def f():\n  pass\n\n",
+			`(DefStmt Name=f Function=(Function Body=((BranchStmt Token=pass))))`},
+		{"if cond:\n  pass\n\n",
+			`(IfStmt Cond=cond True=((BranchStmt Token=pass)))`},
+		// Even as a 1-liner, the following blank line is required.
+		{"if cond: pass\n\n",
+			`(IfStmt Cond=cond True=((BranchStmt Token=pass)))`},
+	} {
+
+		// Fake readline input from string.
+		// The ! suffix, which would cause a parse error,
+		// tests that the parser doesn't read more than necessary.
+		sc := bufio.NewScanner(strings.NewReader(test.input + "!"))
+		readline := func() ([]byte, error) {
+			if sc.Scan() {
+				return []byte(sc.Text() + "\n"), nil
+			}
+			return nil, sc.Err()
+		}
+
+		f, err := syntax.ParseCompoundStmt("foo.star", readline)
+		if err != nil {
+			t.Errorf("parse `%s` failed: %v", test.input, stripPos(err))
+			continue
+		}
+		var got string
+		for _, stmt := range f.Stmts {
+			got += treeString(stmt)
+		}
+		if test.want != got {
+			t.Errorf("parse `%s` = %s, want %s", test.input, got, test.want)
+		}
+	}
+}
+
 func stripPos(err error) string {
 	s := err.Error()
 	if i := strings.Index(s, ": "); i >= 0 {
@@ -402,3 +475,26 @@ File
 		t.Errorf("got %s, want %s", got, want)
 	}
 }
+
+// dataFile is the same as starlarktest.DataFile.
+// We make a copy to avoid a dependency cycle.
+var dataFile = func(pkgdir, filename string) string {
+	return filepath.Join(build.Default.GOPATH, "src/go.starlark.net", pkgdir, filename)
+}
+
+func BenchmarkParse(b *testing.B) {
+	filename := dataFile("syntax", "testdata/scan.star")
+	b.StopTimer()
+	data, err := ioutil.ReadFile(filename)
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.StartTimer()
+
+	for i := 0; i < b.N; i++ {
+		_, err := syntax.Parse(filename, data, 0)
+		if err != nil {
+			b.Fatal(err)
+		}
+	}
+}
diff --git a/syntax/scan.go b/syntax/scan.go
index 7c16f82..af24bce 100644
--- a/syntax/scan.go
+++ b/syntax/scan.go
@@ -7,6 +7,7 @@ package syntax
 // A lexical scanner for Starlark.
 
 import (
+	"bytes"
 	"fmt"
 	"io"
 	"io/ioutil"
@@ -231,8 +232,7 @@ func (p Position) isBefore(q Position) bool {
 
 // An scanner represents a single input file being parsed.
 type scanner struct {
-	complete       []byte    // entire input
-	rest           []byte    // rest of input
+	rest           []byte    // rest of input (in REPL, a line of input)
 	token          []byte    // token being scanned
 	pos            Position  // current input position
 	depth          int       // nesting of [ ] { } ( )
@@ -242,21 +242,26 @@ type scanner struct {
 	keepComments   bool      // accumulate comments in slice
 	lineComments   []Comment // list of full line comments (if keepComments)
 	suffixComments []Comment // list of suffix comments (if keepComments)
+
+	readline func() ([]byte, error) // read next line of input (REPL only)
 }
 
 func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) {
-	data, err := readSource(filename, src)
-	if err != nil {
-		return nil, err
-	}
-	return &scanner{
-		complete:     data,
-		rest:         data,
+	sc := &scanner{
 		pos:          Position{file: &filename, Line: 1, Col: 1},
 		indentstk:    make([]int, 1, 10), // []int{0} + spare capacity
 		lineStart:    true,
 		keepComments: keepComments,
-	}, nil
+	}
+	sc.readline, _ = src.(func() ([]byte, error)) // REPL only
+	if sc.readline == nil {
+		data, err := readSource(filename, src)
+		if err != nil {
+			return nil, err
+		}
+		sc.rest = data
+	}
+	return sc, nil
 }
 
 func readSource(filename string, src interface{}) ([]byte, error) {
@@ -316,13 +321,28 @@ func (sc *scanner) recover(err *error) {
 
 // eof reports whether the input has reached end of file.
 func (sc *scanner) eof() bool {
-	return len(sc.rest) == 0
+	return len(sc.rest) == 0 && !sc.readLine()
+}
+
+// readLine attempts to read another line of input.
+// Precondition: len(sc.rest)==0.
+func (sc *scanner) readLine() bool {
+	if sc.readline != nil {
+		var err error
+		sc.rest, err = sc.readline()
+		if err != nil {
+			sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt
+		}
+		return len(sc.rest) > 0
+	}
+	return false
 }
 
 // peekRune returns the next rune in the input without consuming it.
 // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
 func (sc *scanner) peekRune() rune {
-	if len(sc.rest) == 0 {
+	// TODO(adonovan): opt: measure and perhaps inline eof.
+	if sc.eof() {
 		return 0
 	}
 
@@ -341,9 +361,16 @@ func (sc *scanner) peekRune() rune {
 // readRune consumes and returns the next rune in the input.
 // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
 func (sc *scanner) readRune() rune {
+	// eof() has been inlined here, both to avoid a call
+	// and to establish len(rest)>0 to avoid a bounds check.
 	if len(sc.rest) == 0 {
-		sc.error(sc.pos, "internal scanner error: readRune at EOF")
-		return 0 // unreachable but eliminates bounds-check below
+		if !sc.readLine() {
+			sc.error(sc.pos, "internal scanner error: readRune at EOF")
+		}
+		// Redundant, but eliminates the bounds-check below.
+		if len(sc.rest) == 0 {
+			return 0
+		}
 	}
 
 	// fast path: ASCII
@@ -520,11 +547,26 @@ start:
 	// newline
 	if c == '\n' {
 		sc.lineStart = true
-		if blank || sc.depth > 0 {
-			// Ignore blank lines, or newlines within expressions (common case).
+
+		// Ignore newlines within expressions (common case).
+		if sc.depth > 0 {
 			sc.readRune()
 			goto start
 		}
+
+		// Ignore blank lines, except in the REPL,
+		// where they emit OUTDENTs and NEWLINE.
+		if blank {
+			if sc.readline == nil {
+				sc.readRune()
+				goto start
+			} else if len(sc.indentstk) > 1 {
+				sc.dents = 1 - len(sc.indentstk)
+				sc.indentstk = sc.indentstk[1:]
+				goto start
+			}
+		}
+
 		// At top-level (not in an expression).
 		sc.startToken(val)
 		sc.readRune()
@@ -759,37 +801,66 @@ func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
 	start := sc.pos
 	triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote)
 	sc.readRune()
-	if triple {
-		sc.readRune()
-		sc.readRune()
-	}
-
-	quoteCount := 0
-	for {
-		if sc.eof() {
-			sc.error(val.pos, "unexpected EOF in string")
-		}
-		c := sc.readRune()
-		if c == '\n' && !triple {
-			sc.error(val.pos, "unexpected newline in string")
-		}
-		if c == quote {
-			quoteCount++
-			if !triple || quoteCount == 3 {
+	if !triple {
+		// Precondition: startToken was already called.
+		for {
+			if sc.eof() {
+				sc.error(val.pos, "unexpected EOF in string")
+			}
+			c := sc.readRune()
+			if c == quote {
 				break
 			}
-		} else {
-			quoteCount = 0
+			if c == '\n' {
+				sc.error(val.pos, "unexpected newline in string")
+			}
+			if c == '\\' {
+				if sc.eof() {
+					sc.error(val.pos, "unexpected EOF in string")
+				}
+				sc.readRune()
+			}
 		}
-		if c == '\\' {
+		sc.endToken(val)
+	} else {
+		// triple-quoted string literal
+		sc.readRune()
+		sc.readRune()
+
+		// A triple-quoted string literal may span multiple
+		// gulps of REPL input; it is the only such token.
+		// Thus we must avoid {start,end}Token.
+		var raw bytes.Buffer
+
+		// Copy the prefix, e.g. r''' or """ (see startToken).
+		raw.Write(sc.token[:len(sc.token)-len(sc.rest)])
+
+		quoteCount := 0
+		for {
 			if sc.eof() {
 				sc.error(val.pos, "unexpected EOF in string")
 			}
-			sc.readRune()
+			c := sc.readRune()
+			raw.WriteRune(c)
+			if c == quote {
+				quoteCount++
+				if quoteCount == 3 {
+					break
+				}
+			} else {
+				quoteCount = 0
+			}
+			if c == '\\' {
+				if sc.eof() {
+					sc.error(val.pos, "unexpected EOF in string")
+				}
+				c = sc.readRune()
+				raw.WriteRune(c)
+			}
 		}
+		val.raw = raw.String()
 	}
 
-	sc.endToken(val)
 	s, _, err := unquote(val.raw)
 	if err != nil {
 		sc.error(start, err.Error())
diff --git a/syntax/scan_test.go b/syntax/scan_test.go
index e01a1c9..9de610e 100644
--- a/syntax/scan_test.go
+++ b/syntax/scan_test.go
@@ -208,6 +208,7 @@ pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated
 		{"x ! 0", "foo.star:1:3: unexpected input character '!'"},
 		// github.com/google/starlark-go/issues/80
 		{"([{<>}])", "( [ { < > } ] ) EOF"},
+		{"f();", "f ( ) ; EOF"},
 	} {
 		got, err := scan(test.input)
 		if err != nil {
diff --git a/syntax/testdata/errors.star b/syntax/testdata/errors.star
index b7c9b3d..d2d09b2 100644
--- a/syntax/testdata/errors.star
+++ b/syntax/testdata/errors.star
@@ -138,7 +138,8 @@ _ = a + b not c ### "got identifier, want in"
 ---
 f(1+2 = 3) ### "keyword argument must have form name=expr"
 ---
-print(1, 2, 3 ### `got end of file, want '\)'`
+print(1, 2, 3
+### `got end of file, want '\)'`
 ---
 _ = a if b ### "conditional expression without else clause"
 ---
author	alandonovan <adonovan@google.com>	2019-01-04 13:48:12 -0500
committer	GitHub <noreply@github.com>	2019-01-04 13:48:12 -0500
commit	30e71c6b16e7cb4257d5436cb317ee1e989f1774 (patch)
tree	12973d93ac3a8722fe92d52f149b15cc2abbd199 /syntax
parent	9d9777168d883df01c5a74800b807e5315fb9850 (diff)
download	starlark-go-30e71c6b16e7cb4257d5436cb317ee1e989f1774.tar.gz