starlark: add 'bytes' data type, for binary strings (#330)

THIS IS AN INCOMPATIBLE LANGUAGE CHANGE; see below This change defines a 'bytes' data type, an immutable string of bytes. In this Go implementation of Starlark, ordinary strings are also strings of bytes, so the behavior of the two is very similar. However, that is not required by the spec. Other implementations of Starlark, notably in Java, may use strings of UTF-16 codes for the ordinary string type, and thus need a distinct type for byte strings. See testdata/bytes.star for a tour of the API, and some remaining questions. See the attached issue for an outline of the proposed spec change. A Java implementation is underway, but is greatly complicated by Bazel's unfortunate misdecoding of UTF-8 files as Latin1. The string.elems iterable view is now indexable. The old syntax.quote function (which was in fact not used except in tests) has been replaced by syntax.Quote, which is similar to Go's strconv.Quote. This change removes go.starlark.net.lib.proto.Bytes. IMPORTANT: string literals that previously used hex escapes \xXX or octal escapes \OOO to denote byte values greater than 127 will now result in a compile error advising you to use \u escapes instead if you want the UTF-8 encoding of a code point in the range U+80 to U+FF. A string literal can no longer denote invalid text, such as the 1-element string formerly written "\xff". Updates https://github.com/bazelbuild/starlark/issues/112 Fixes https://github.com/google/starlark-go/issues/222
author: alandonovan <adonovan@google.com> 2021-02-12 16:57:32 -0500
committer: GitHub <noreply@github.com> 2021-02-12 16:57:32 -0500
commit: ebe61bd709bf23d7baddbb34e79084d7d156be04 (patch)
tree: 2efd7da6ec6557b9aaed7932d19e07e1beade288 /syntax/scan_test.go
parent: 0a10e4fe7402e37a43d9b62c15bfeac1cd4ef272 (diff)
download: starlark-go-ebe61bd709bf23d7baddbb34e79084d7d156be04.tar.gz
1 files changed, 40 insertions, 6 deletions
diff --git a/syntax/scan_test.go b/syntax/scan_test.go
index 0f2d9f2..9582bd7 100644
--- a/syntax/scan_test.go
+++ b/syntax/scan_test.go
@@ -10,6 +10,7 @@ import (
 	"go/build"
 	"io/ioutil"
 	"path/filepath"
+	"strings"
 	"testing"
 )
 
@@ -42,8 +43,8 @@ func scan(src interface{}) (tokens string, err error) {
 			}
 		case FLOAT:
 			fmt.Fprintf(&buf, "%e", val.float)
-		case STRING:
-			fmt.Fprintf(&buf, "%q", val.string)
+		case STRING, BYTES:
+			buf.WriteString(Quote(val.string, tok == BYTES))
 		default:
 			buf.WriteString(tok.String())
 		}
@@ -189,9 +190,34 @@ pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated
 		{"i = 012934", `foo.star:1:5: invalid int literal`},
 		// octal escapes in string literals
 		{`"\037"`, `"\x1f" EOF`},
-		{`"\377"`, `"\xff" EOF`},
-		{`"\378"`, `"\x1f8" EOF`},                                // = '\37' + '8'
-		{`"\400"`, `foo.star:1:1: invalid escape sequence \400`}, // unlike Python 2 and 3
+		{`"\377"`, `foo.star:1:1: non-ASCII octal escape \377 (use \u00FF for the UTF-8 encoding of U+00FF)`},
+		{`"\378"`, `"\x1f8" EOF`},                               // = '\37' + '8'
+		{`"\400"`, `foo.star:1:1: non-ASCII octal escape \400`}, // unlike Python 2 and 3
+		// hex escapes
+		{`"\x00\x20\x09\x41\x7e\x7f"`, `"\x00 \tA~\x7f" EOF`}, // DEL is non-printable
+		{`"\x80"`, `foo.star:1:1: non-ASCII hex escape`},
+		{`"\xff"`, `foo.star:1:1: non-ASCII hex escape`},
+		{`"\xFf"`, `foo.star:1:1: non-ASCII hex escape`},
+		{`"\xF"`, `foo.star:1:1: truncated escape sequence \xF`},
+		{`"\x"`, `foo.star:1:1: truncated escape sequence \x`},
+		{`"\xfg"`, `foo.star:1:1: invalid escape sequence \xfg`},
+		// Unicode escapes
+		// \uXXXX
+		{`"\u0400"`, `"Ѐ" EOF`},
+		{`"\u100"`, `foo.star:1:1: truncated escape sequence \u100`},
+		{`"\u04000"`, `"Ѐ0" EOF`}, // = U+0400 + '0'
+		{`"\u100g"`, `foo.star:1:1: invalid escape sequence \u100g`},
+		{`"\u4E16"`, `"世" EOF`},
+		{`"\udc00"`, `foo.star:1:1: invalid Unicode code point U+DC00`}, // surrogate
+		// \UXXXXXXXX
+		{`"\U00000400"`, `"Ѐ" EOF`},
+		{`"\U0000400"`, `foo.star:1:1: truncated escape sequence \U0000400`},
+		{`"\U000004000"`, `"Ѐ0" EOF`}, // = U+0400 + '0'
+		{`"\U1000000g"`, `foo.star:1:1: invalid escape sequence \U1000000g`},
+		{`"\U0010FFFF"`, `"\U0010ffff" EOF`},
+		{`"\U00110000"`, `foo.star:1:1: code point out of range: \U00110000 (max \U00110000)`},
+		{`"\U0001F63F"`, `"😿" EOF`},
+		{`"\U0000dc00"`, `foo.star:1:1: invalid Unicode code point U+DC00`}, // surrogate
 
 		// backslash escapes
 		// As in Go, a backslash must escape something.
@@ -218,6 +244,12 @@ pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated
 		{`r'\"'`, `"\\\"" EOF`},
 		{`'a\zb'`, `foo.star:1:1: invalid escape sequence \z`},
 		{`"\o123"`, `foo.star:1:1: invalid escape sequence \o`},
+		// bytes literals (where they differ from text strings)
+		{`b"AЀ世😿"`, `b"AЀ世😿`},                                       // 1-4 byte encodings, literal
+		{`b"\x41\u0400\u4e16\U0001F63F"`, `b"AЀ世😿"`},                // same, as escapes
+		{`b"\377\378\x80\xff\xFf"`, `b"\xff\x1f8\x80\xff\xff" EOF`}, // hex/oct escapes allow non-ASCII
+		{`b"\400"`, `foo.star:1:2: invalid escape sequence \400`},
+		{`b"\udc00"`, `foo.star:1:2: invalid Unicode code point U+DC00`}, // (same as string)
 		// floats starting with octal digits
 		{"012934.", `1.293400e+04 EOF`},
 		{"012934.1", `1.293410e+04 EOF`},
@@ -243,7 +275,9 @@ pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated
 		if err != nil {
 			got = err.(Error).Error()
 		}
-		if test.want != got {
+		// Prefix match allows us to truncate errors in expecations.
+		// Success cases all end in EOF.
+		if !strings.HasPrefix(got, test.want) {
 			t.Errorf("scan `%s` = [%s], want [%s]", test.input, got, test.want)
 		}
 	}
author	alandonovan <adonovan@google.com>	2021-02-12 16:57:32 -0500
committer	GitHub <noreply@github.com>	2021-02-12 16:57:32 -0500
commit	ebe61bd709bf23d7baddbb34e79084d7d156be04 (patch)
tree	2efd7da6ec6557b9aaed7932d19e07e1beade288 /syntax/scan_test.go
parent	0a10e4fe7402e37a43d9b62c15bfeac1cd4ef272 (diff)
download	starlark-go-ebe61bd709bf23d7baddbb34e79084d7d156be04.tar.gz