syntax: improve REPL parsing (#98)

Previously, the REPL used a heuristic: it would consume a single line and attempt to parse it; if that failed, it would consume lines up to a blank line and then parse the whole as a file. This was suboptimal for various reasons: for example, it failed to parse lines ending with an unfinished multi-line string literal, and it would prematurely stop reading even while parentheses were open. This change integrates the REPL with the scanner and parser (as Python does). The REPL invokes a new parser entry point, ParseCompoundStmt, that consumes only enough input to parse a compound statement, defined as (a) a blank line, (b) a semicolon-separated list of simple statements all on one line, or (c) a complex statement such as def, if, or for. If the 'src' value provided to the scanner is a function of type func() ([]byte, error), then the scanner will call it each time it runs out of input. Fixes #81.
author: alandonovan <adonovan@google.com> 2019-01-04 13:48:12 -0500
committer: GitHub <noreply@github.com> 2019-01-04 13:48:12 -0500
commit: 30e71c6b16e7cb4257d5436cb317ee1e989f1774 (patch)
tree: 12973d93ac3a8722fe92d52f149b15cc2abbd199 /syntax/scan.go
parent: 9d9777168d883df01c5a74800b807e5315fb9850 (diff)
download: starlark-go-30e71c6b16e7cb4257d5436cb317ee1e989f1774.tar.gz
1 files changed, 109 insertions, 38 deletions
diff --git a/syntax/scan.go b/syntax/scan.go
index 7c16f82..af24bce 100644
--- a/syntax/scan.go
+++ b/syntax/scan.go
@@ -7,6 +7,7 @@ package syntax
 // A lexical scanner for Starlark.
 
 import (
+	"bytes"
 	"fmt"
 	"io"
 	"io/ioutil"
@@ -231,8 +232,7 @@ func (p Position) isBefore(q Position) bool {
 
 // An scanner represents a single input file being parsed.
 type scanner struct {
-	complete       []byte    // entire input
-	rest           []byte    // rest of input
+	rest           []byte    // rest of input (in REPL, a line of input)
 	token          []byte    // token being scanned
 	pos            Position  // current input position
 	depth          int       // nesting of [ ] { } ( )
@@ -242,21 +242,26 @@ type scanner struct {
 	keepComments   bool      // accumulate comments in slice
 	lineComments   []Comment // list of full line comments (if keepComments)
 	suffixComments []Comment // list of suffix comments (if keepComments)
+
+	readline func() ([]byte, error) // read next line of input (REPL only)
 }
 
 func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) {
-	data, err := readSource(filename, src)
-	if err != nil {
-		return nil, err
-	}
-	return &scanner{
-		complete:     data,
-		rest:         data,
+	sc := &scanner{
 		pos:          Position{file: &filename, Line: 1, Col: 1},
 		indentstk:    make([]int, 1, 10), // []int{0} + spare capacity
 		lineStart:    true,
 		keepComments: keepComments,
-	}, nil
+	}
+	sc.readline, _ = src.(func() ([]byte, error)) // REPL only
+	if sc.readline == nil {
+		data, err := readSource(filename, src)
+		if err != nil {
+			return nil, err
+		}
+		sc.rest = data
+	}
+	return sc, nil
 }
 
 func readSource(filename string, src interface{}) ([]byte, error) {
@@ -316,13 +321,28 @@ func (sc *scanner) recover(err *error) {
 
 // eof reports whether the input has reached end of file.
 func (sc *scanner) eof() bool {
-	return len(sc.rest) == 0
+	return len(sc.rest) == 0 && !sc.readLine()
+}
+
+// readLine attempts to read another line of input.
+// Precondition: len(sc.rest)==0.
+func (sc *scanner) readLine() bool {
+	if sc.readline != nil {
+		var err error
+		sc.rest, err = sc.readline()
+		if err != nil {
+			sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt
+		}
+		return len(sc.rest) > 0
+	}
+	return false
 }
 
 // peekRune returns the next rune in the input without consuming it.
 // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
 func (sc *scanner) peekRune() rune {
-	if len(sc.rest) == 0 {
+	// TODO(adonovan): opt: measure and perhaps inline eof.
+	if sc.eof() {
 		return 0
 	}
 
@@ -341,9 +361,16 @@ func (sc *scanner) peekRune() rune {
 // readRune consumes and returns the next rune in the input.
 // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
 func (sc *scanner) readRune() rune {
+	// eof() has been inlined here, both to avoid a call
+	// and to establish len(rest)>0 to avoid a bounds check.
 	if len(sc.rest) == 0 {
-		sc.error(sc.pos, "internal scanner error: readRune at EOF")
-		return 0 // unreachable but eliminates bounds-check below
+		if !sc.readLine() {
+			sc.error(sc.pos, "internal scanner error: readRune at EOF")
+		}
+		// Redundant, but eliminates the bounds-check below.
+		if len(sc.rest) == 0 {
+			return 0
+		}
 	}
 
 	// fast path: ASCII
@@ -520,11 +547,26 @@ start:
 	// newline
 	if c == '\n' {
 		sc.lineStart = true
-		if blank || sc.depth > 0 {
-			// Ignore blank lines, or newlines within expressions (common case).
+
+		// Ignore newlines within expressions (common case).
+		if sc.depth > 0 {
 			sc.readRune()
 			goto start
 		}
+
+		// Ignore blank lines, except in the REPL,
+		// where they emit OUTDENTs and NEWLINE.
+		if blank {
+			if sc.readline == nil {
+				sc.readRune()
+				goto start
+			} else if len(sc.indentstk) > 1 {
+				sc.dents = 1 - len(sc.indentstk)
+				sc.indentstk = sc.indentstk[1:]
+				goto start
+			}
+		}
+
 		// At top-level (not in an expression).
 		sc.startToken(val)
 		sc.readRune()
@@ -759,37 +801,66 @@ func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
 	start := sc.pos
 	triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote)
 	sc.readRune()
-	if triple {
-		sc.readRune()
-		sc.readRune()
-	}
-
-	quoteCount := 0
-	for {
-		if sc.eof() {
-			sc.error(val.pos, "unexpected EOF in string")
-		}
-		c := sc.readRune()
-		if c == '\n' && !triple {
-			sc.error(val.pos, "unexpected newline in string")
-		}
-		if c == quote {
-			quoteCount++
-			if !triple || quoteCount == 3 {
+	if !triple {
+		// Precondition: startToken was already called.
+		for {
+			if sc.eof() {
+				sc.error(val.pos, "unexpected EOF in string")
+			}
+			c := sc.readRune()
+			if c == quote {
 				break
 			}
-		} else {
-			quoteCount = 0
+			if c == '\n' {
+				sc.error(val.pos, "unexpected newline in string")
+			}
+			if c == '\\' {
+				if sc.eof() {
+					sc.error(val.pos, "unexpected EOF in string")
+				}
+				sc.readRune()
+			}
 		}
-		if c == '\\' {
+		sc.endToken(val)
+	} else {
+		// triple-quoted string literal
+		sc.readRune()
+		sc.readRune()
+
+		// A triple-quoted string literal may span multiple
+		// gulps of REPL input; it is the only such token.
+		// Thus we must avoid {start,end}Token.
+		var raw bytes.Buffer
+
+		// Copy the prefix, e.g. r''' or """ (see startToken).
+		raw.Write(sc.token[:len(sc.token)-len(sc.rest)])
+
+		quoteCount := 0
+		for {
 			if sc.eof() {
 				sc.error(val.pos, "unexpected EOF in string")
 			}
-			sc.readRune()
+			c := sc.readRune()
+			raw.WriteRune(c)
+			if c == quote {
+				quoteCount++
+				if quoteCount == 3 {
+					break
+				}
+			} else {
+				quoteCount = 0
+			}
+			if c == '\\' {
+				if sc.eof() {
+					sc.error(val.pos, "unexpected EOF in string")
+				}
+				c = sc.readRune()
+				raw.WriteRune(c)
+			}
 		}
+		val.raw = raw.String()
 	}
 
-	sc.endToken(val)
 	s, _, err := unquote(val.raw)
 	if err != nil {
 		sc.error(start, err.Error())
author	alandonovan <adonovan@google.com>	2019-01-04 13:48:12 -0500
committer	GitHub <noreply@github.com>	2019-01-04 13:48:12 -0500
commit	30e71c6b16e7cb4257d5436cb317ee1e989f1774 (patch)
tree	12973d93ac3a8722fe92d52f149b15cc2abbd199 /syntax/scan.go
parent	9d9777168d883df01c5a74800b807e5315fb9850 (diff)
download	starlark-go-30e71c6b16e7cb4257d5436cb317ee1e989f1774.tar.gz