aboutsummaryrefslogtreecommitdiff
path: root/internal
diff options
context:
space:
mode:
author alandonovan <adonovan@google.com> 2021-02-12 16:57:32 -0500
committer GitHub <noreply@github.com> 2021-02-12 16:57:32 -0500
commit ebe61bd709bf23d7baddbb34e79084d7d156be04 (patch)
tree 2efd7da6ec6557b9aaed7932d19e07e1beade288 /internal
parent 0a10e4fe7402e37a43d9b62c15bfeac1cd4ef272 (diff)
download starlark-go-ebe61bd709bf23d7baddbb34e79084d7d156be04.tar.gz
starlark: add 'bytes' data type, for binary strings (#330)
THIS IS AN INCOMPATIBLE LANGUAGE CHANGE; see below. This change defines a 'bytes' data type, an immutable string of bytes. In this Go implementation of Starlark, ordinary strings are also strings of bytes, so the behavior of the two is very similar. However, that is not required by the spec. Other implementations of Starlark, notably in Java, may use strings of UTF-16 codes for the ordinary string type, and thus need a distinct type for byte strings. See testdata/bytes.star for a tour of the API, and some remaining questions. See the attached issue for an outline of the proposed spec change. A Java implementation is underway, but is greatly complicated by Bazel's unfortunate misdecoding of UTF-8 files as Latin1. The string.elems iterable view is now indexable. The old syntax.quote function (which was in fact not used except in tests) has been replaced by syntax.Quote, which is similar to Go's strconv.Quote. This change removes go.starlark.net.lib.proto.Bytes. IMPORTANT: string literals that previously used hex escapes \xXX or octal escapes \OOO to denote byte values greater than 127 will now result in a compile error advising you to use \u escapes instead if you want the UTF-8 encoding of a code point in the range U+80 to U+FF. A string literal can no longer denote invalid text, such as the 1-element string formerly written "\xff". Updates https://github.com/bazelbuild/starlark/issues/112 Fixes https://github.com/google/starlark-go/issues/222
Diffstat (limited to 'internal')
-rw-r--r-- internal/compile/compile.go 32
-rw-r--r-- internal/compile/serial.go 22
2 files changed, 38 insertions, 16 deletions
diff --git a/internal/compile/compile.go b/internal/compile/compile.go
index ab67018..c314e6e 100644
--- a/internal/compile/compile.go
+++ b/internal/compile/compile.go
@@ -33,6 +33,7 @@ import (
"os"
"path/filepath"
"strconv"
+ "strings"
"sync"
"go.starlark.net/resolve"
@@ -46,7 +47,7 @@ var Disassemble = false
const debug = false // make code generation verbose, for debugging the compiler
// Increment this to force recompilation of saved bytecode files.
-const Version = 11
+const Version = 12
type Opcode uint8
@@ -309,12 +310,15 @@ func (op Opcode) String() string {
type Program struct {
Loads []Binding // name (really, string) and position of each load stmt
Names []string // names of attributes and predeclared variables
- Constants []interface{} // = string | int64 | float64 | *big.Int
+ Constants []interface{} // = string | int64 | float64 | *big.Int | Bytes
Functions []*Funcode
Globals []Binding // for error messages and tracing
Toplevel *Funcode // module initialization function
}
+// The type of a bytes literal value, to distinguish from text string.
+type Bytes string
+
// A Funcode is the code of a compiled Starlark function.
//
// Funcodes are serialized by the encoder.function method,
@@ -863,6 +867,8 @@ func PrintOp(fn *Funcode, pc uint32, op Opcode, arg uint32) {
switch x := fn.Prog.Constants[arg].(type) {
case string:
comment = strconv.Quote(x)
+ case Bytes:
+ comment = "b" + strconv.Quote(string(x))
default:
comment = fmt.Sprint(x)
}
@@ -1283,8 +1289,12 @@ func (fcomp *fcomp) expr(e syntax.Expr) {
fcomp.lookup(e)
case *syntax.Literal:
- // e.Value is int64, float64, *bigInt, or string.
- fcomp.emit1(CONSTANT, fcomp.pcomp.constantIndex(e.Value))
+ // e.Value is int64, float64, *bigInt, string
+ v := e.Value
+ if e.Token == syntax.BYTES {
+ v = Bytes(v.(string))
+ }
+ fcomp.emit1(CONSTANT, fcomp.pcomp.constantIndex(v))
case *syntax.ListExpr:
for _, x := range e.List {
@@ -1522,7 +1532,7 @@ func (fcomp *fcomp) plus(e *syntax.BinaryExpr) {
}
// addable reports whether e is a statically addable
-// expression: a [s]tring, [l]ist, or [t]uple.
+// expression: a [s]tring, [b]ytes, [l]ist, or [t]uple.
func addable(e syntax.Expr) rune {
switch e := e.(type) {
case *syntax.Literal:
@@ -1530,6 +1540,8 @@ func addable(e syntax.Expr) rune {
switch e.Token {
case syntax.STRING:
return 's'
+ case syntax.BYTES:
+ return 'b'
}
case *syntax.ListExpr:
return 'l'
@@ -1544,12 +1556,16 @@ func addable(e syntax.Expr) rune {
// The resulting syntax is degenerate, lacking position, etc.
func add(code rune, args []summand) syntax.Expr {
switch code {
- case 's':
- var buf bytes.Buffer
+ case 's', 'b':
+ var buf strings.Builder
for _, arg := range args {
buf.WriteString(arg.x.(*syntax.Literal).Value.(string))
}
- return &syntax.Literal{Token: syntax.STRING, Value: buf.String()}
+ tok := syntax.STRING
+ if code == 'b' {
+ tok = syntax.BYTES
+ }
+ return &syntax.Literal{Token: tok, Value: buf.String()}
case 'l':
var elems []syntax.Expr
for _, arg := range args {
diff --git a/internal/compile/serial.go b/internal/compile/serial.go
index 0107ef9..adadabf 100644
--- a/internal/compile/serial.go
+++ b/internal/compile/serial.go
@@ -51,9 +51,10 @@ package compile
//
// Constant: # type data
// type varint # 0=string string
-// data ... # 1=int varint
-// # 2=float varint (bits as uint64)
-// # 3=bigint string (decimal ASCII text)
+// data ... # 1=bytes string
+// # 2=int varint
+// # 3=float varint (bits as uint64)
+// # 4=bigint string (decimal ASCII text)
//
// The encoding starts with a four-byte magic number.
// The next four bytes are a little-endian uint32
@@ -109,14 +110,17 @@ func (prog *Program) Encode() []byte {
case string:
e.int(0)
e.string(c)
- case int64:
+ case Bytes:
e.int(1)
+ e.string(string(c))
+ case int64:
+ e.int(2)
e.int64(c)
case float64:
- e.int(2)
+ e.int(3)
e.uint64(math.Float64bits(c))
case *big.Int:
- e.int(3)
+ e.int(4)
e.string(c.Text(10))
}
}
@@ -249,10 +253,12 @@ func DecodeProgram(data []byte) (_ *Program, err error) {
case 0:
c = d.string()
case 1:
- c = d.int64()
+ c = Bytes(d.string())
case 2:
- c = math.Float64frombits(d.uint64())
+ c = d.int64()
case 3:
+ c = math.Float64frombits(d.uint64())
+ case 4:
c, _ = new(big.Int).SetString(d.string(), 10)
}
constants[i] = c