aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--go.mod4
-rw-r--r--go.sum10
-rw-r--r--internal/compile/compile.go32
-rw-r--r--internal/compile/serial.go22
-rw-r--r--lib/proto/proto.go87
-rw-r--r--starlark/eval.go26
-rw-r--r--starlark/eval_test.go1
-rw-r--r--starlark/hashtable.go4
-rw-r--r--starlark/library.go179
-rw-r--r--starlark/testdata/bytes.star159
-rw-r--r--starlark/testdata/json.star2
-rw-r--r--starlark/testdata/string.star40
-rw-r--r--starlark/value.go180
-rw-r--r--syntax/parse.go7
-rw-r--r--syntax/parse_test.go7
-rw-r--r--syntax/quote.go170
-rw-r--r--syntax/quote_test.go21
-rw-r--r--syntax/scan.go24
-rw-r--r--syntax/scan_test.go46
-rw-r--r--syntax/syntax.go2
20 files changed, 739 insertions, 284 deletions
diff --git a/go.mod b/go.mod
index 50bc000..d14060e 100644
--- a/go.mod
+++ b/go.mod
@@ -6,6 +6,8 @@ require (
github.com/chzyer/logex v1.1.10 // indirect
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 // indirect
- golang.org/x/sys v0.0.0-20200803210538-64077c9b5642
+ github.com/google/go-cmp v0.5.1 // indirect
+ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f
+ golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
google.golang.org/protobuf v1.25.0
)
diff --git a/go.sum b/go.sum
index b40c868..90a8048 100644
--- a/go.sum
+++ b/go.sum
@@ -24,8 +24,9 @@ github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5a
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
-github.com/google/go-cmp v0.5.0 h1:/QaMHBdZ26BB3SSst0Iwl10Epc+xhTquomWX0oZEB6w=
github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.1 h1:JFrFEBb2xKufg6XkJsJr+WbKb4FQlURi5RUcBveYu9k=
+github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
@@ -42,15 +43,16 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20200803210538-64077c9b5642 h1:B6caxRw+hozq68X2MY7jEpZh/cr4/aHLv9xU8Kkadrw=
-golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f h1:+Nyd8tzPX9R7BWHguqsrbFdRx3WQ/1ib8I44HXV5yTA=
+golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
-golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
+golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
diff --git a/internal/compile/compile.go b/internal/compile/compile.go
index ab67018..c314e6e 100644
--- a/internal/compile/compile.go
+++ b/internal/compile/compile.go
@@ -33,6 +33,7 @@ import (
"os"
"path/filepath"
"strconv"
+ "strings"
"sync"
"go.starlark.net/resolve"
@@ -46,7 +47,7 @@ var Disassemble = false
const debug = false // make code generation verbose, for debugging the compiler
// Increment this to force recompilation of saved bytecode files.
-const Version = 11
+const Version = 12
type Opcode uint8
@@ -309,12 +310,15 @@ func (op Opcode) String() string {
type Program struct {
Loads []Binding // name (really, string) and position of each load stmt
Names []string // names of attributes and predeclared variables
- Constants []interface{} // = string | int64 | float64 | *big.Int
+ Constants []interface{} // = string | int64 | float64 | *big.Int | Bytes
Functions []*Funcode
Globals []Binding // for error messages and tracing
Toplevel *Funcode // module initialization function
}
+// Bytes is the type of a bytes literal value, to distinguish it from a text string.
+type Bytes string
+
// A Funcode is the code of a compiled Starlark function.
//
// Funcodes are serialized by the encoder.function method,
@@ -863,6 +867,8 @@ func PrintOp(fn *Funcode, pc uint32, op Opcode, arg uint32) {
switch x := fn.Prog.Constants[arg].(type) {
case string:
comment = strconv.Quote(x)
+ case Bytes:
+ comment = "b" + strconv.Quote(string(x))
default:
comment = fmt.Sprint(x)
}
@@ -1283,8 +1289,12 @@ func (fcomp *fcomp) expr(e syntax.Expr) {
fcomp.lookup(e)
case *syntax.Literal:
- // e.Value is int64, float64, *bigInt, or string.
- fcomp.emit1(CONSTANT, fcomp.pcomp.constantIndex(e.Value))
+ // e.Value is int64, float64, *big.Int, or string.
+ v := e.Value
+ if e.Token == syntax.BYTES {
+ v = Bytes(v.(string))
+ }
+ fcomp.emit1(CONSTANT, fcomp.pcomp.constantIndex(v))
case *syntax.ListExpr:
for _, x := range e.List {
@@ -1522,7 +1532,7 @@ func (fcomp *fcomp) plus(e *syntax.BinaryExpr) {
}
// addable reports whether e is a statically addable
-// expression: a [s]tring, [l]ist, or [t]uple.
+// expression: a [s]tring, [b]ytes, [l]ist, or [t]uple.
func addable(e syntax.Expr) rune {
switch e := e.(type) {
case *syntax.Literal:
@@ -1530,6 +1540,8 @@ func addable(e syntax.Expr) rune {
switch e.Token {
case syntax.STRING:
return 's'
+ case syntax.BYTES:
+ return 'b'
}
case *syntax.ListExpr:
return 'l'
@@ -1544,12 +1556,16 @@ func addable(e syntax.Expr) rune {
// The resulting syntax is degenerate, lacking position, etc.
func add(code rune, args []summand) syntax.Expr {
switch code {
- case 's':
- var buf bytes.Buffer
+ case 's', 'b':
+ var buf strings.Builder
for _, arg := range args {
buf.WriteString(arg.x.(*syntax.Literal).Value.(string))
}
- return &syntax.Literal{Token: syntax.STRING, Value: buf.String()}
+ tok := syntax.STRING
+ if code == 'b' {
+ tok = syntax.BYTES
+ }
+ return &syntax.Literal{Token: tok, Value: buf.String()}
case 'l':
var elems []syntax.Expr
for _, arg := range args {
diff --git a/internal/compile/serial.go b/internal/compile/serial.go
index 0107ef9..adadabf 100644
--- a/internal/compile/serial.go
+++ b/internal/compile/serial.go
@@ -51,9 +51,10 @@ package compile
//
// Constant: # type data
// type varint # 0=string string
-// data ... # 1=int varint
-// # 2=float varint (bits as uint64)
-// # 3=bigint string (decimal ASCII text)
+// data ... # 1=bytes string
+// # 2=int varint
+// # 3=float varint (bits as uint64)
+// # 4=bigint string (decimal ASCII text)
//
// The encoding starts with a four-byte magic number.
// The next four bytes are a little-endian uint32
@@ -109,14 +110,17 @@ func (prog *Program) Encode() []byte {
case string:
e.int(0)
e.string(c)
- case int64:
+ case Bytes:
e.int(1)
+ e.string(string(c))
+ case int64:
+ e.int(2)
e.int64(c)
case float64:
- e.int(2)
+ e.int(3)
e.uint64(math.Float64bits(c))
case *big.Int:
- e.int(3)
+ e.int(4)
e.string(c.Text(10))
}
}
@@ -249,10 +253,12 @@ func DecodeProgram(data []byte) (_ *Program, err error) {
case 0:
c = d.string()
case 1:
- c = d.int64()
+ c = Bytes(d.string())
case 2:
- c = math.Float64frombits(d.uint64())
+ c = d.int64()
case 3:
+ c = math.Float64frombits(d.uint64())
+ case 4:
c, _ = new(big.Int).SetString(d.string(), 10)
}
constants[i] = c
diff --git a/lib/proto/proto.go b/lib/proto/proto.go
index 84aa0d6..149162d 100644
--- a/lib/proto/proto.go
+++ b/lib/proto/proto.go
@@ -79,8 +79,6 @@
package proto
// TODO(adonovan): Go and Starlark API improvements:
-// - Contribute the 'bytes' data type to the core language.
-// See https://github.com/bazelbuild/starlark/issues/112.
// - Make Message and RepeatedField comparable.
// (NOTE: proto.Equal works only with generated message types.)
// - Support maps, oneof, any. But not messageset if we can avoid it.
@@ -234,7 +232,7 @@ func marshal(_ *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwar
if err != nil {
return nil, fmt.Errorf("%s: %v", fn.Name(), err)
}
- return Bytes(data), nil
+ return starlark.Bytes(data), nil
} else {
text, err := prototext.MarshalOptions{Indent: " "}.Marshal(m.Message())
if err != nil {
@@ -247,7 +245,7 @@ func marshal(_ *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwar
// unmarshal(msg) decodes a binary protocol message to a Message.
func unmarshal(thread *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
var desc MessageDescriptor
- var data Bytes
+ var data starlark.Bytes
if err := starlark.UnpackPositionalArgs(fn.Name(), args, kwargs, 2, &desc, &data); err != nil {
return nil, err
}
@@ -486,7 +484,7 @@ func toProto(fdesc protoreflect.FieldDescriptor, v starlark.Value) (protoreflect
case protoreflect.StringKind:
if s, ok := starlark.AsString(v); ok {
return protoreflect.ValueOfString(s), nil
- } else if b, ok := v.(Bytes); ok {
+ } else if b, ok := v.(starlark.Bytes); ok {
// TODO(adonovan): allow bytes for string? Not friendly to a Java port.
return protoreflect.ValueOfBytes([]byte(b)), nil
}
@@ -497,7 +495,7 @@ func toProto(fdesc protoreflect.FieldDescriptor, v starlark.Value) (protoreflect
// Instead provide b"..." literals in the core
// and a bytes(str) conversion.
return protoreflect.ValueOfBytes([]byte(s)), nil
- } else if b, ok := v.(Bytes); ok {
+ } else if b, ok := v.(starlark.Bytes); ok {
return protoreflect.ValueOfBytes([]byte(b)), nil
}
@@ -588,7 +586,7 @@ func toStarlark1(typ protoreflect.FieldDescriptor, x protoreflect.Value, frozen
return starlark.String(x.String())
case protoreflect.BytesKind:
- return Bytes(x.Bytes())
+ return starlark.Bytes(x.Bytes())
case protoreflect.DoubleKind, protoreflect.FloatKind:
return starlark.Float(x.Float())
@@ -1232,78 +1230,3 @@ func (x EnumValueDescriptor) CompareSameType(op syntax.Token, y_ starlark.Value,
return false, fmt.Errorf("%s %s %s not implemented", x.Type(), op, y_.Type())
}
}
-
-// A Bytes is an immutable sequence of bytes.
-// It is comparable, iterable, indexable, and sliceable.
-//
-// (In go.starlark.net, text Strings are also byte strings,
-// but we shouldn't rely on that.
-// See https://github.com/bazelbuild/starlark/issues/112.)
-type Bytes string
-
-var (
- _ starlark.Comparable = Bytes("")
- _ starlark.Iterable = Bytes("")
- _ starlark.Sliceable = Bytes("")
- _ starlark.Sequence = Bytes("")
-)
-
-func (b Bytes) String() string { return fmt.Sprintf("<%d bytes>", len(b)) }
-func (b Bytes) Type() string { return "bytes" }
-func (b Bytes) Freeze() {} // immutable
-func (b Bytes) Truth() starlark.Bool { return len(b) > 0 }
-func (b Bytes) Hash() (uint32, error) { return starlark.String(b).Hash() }
-func (b Bytes) Len() int { return len(b) }
-func (b Bytes) Index(i int) starlark.Value { return starlark.MakeInt(int(b[i])) }
-
-func (b Bytes) Slice(start, end, step int) starlark.Value {
- if step == 1 {
- return b[start:end]
- }
-
- sign := signum(step)
- var str []byte
- for i := start; signum(end-i) == sign; i += step {
- str = append(str, b[i])
- }
- return Bytes(str)
-}
-
-// From Hacker's Delight, section 2.8.
-func signum64(x int64) int { return int(uint64(x>>63) | uint64(-x)>>63) }
-func signum(x int) int { return signum64(int64(x)) }
-
-func (b Bytes) Iterate() starlark.Iterator { return &bytesIterator{string(b)} }
-
-type bytesIterator struct{ string }
-
-func (it *bytesIterator) Next(p *starlark.Value) bool {
- if it.string == "" {
- return false
- }
- *p = starlark.MakeInt(int(it.string[0]))
- it.string = it.string[1:]
- return true
-}
-
-func (it *bytesIterator) Done() {}
-
-func (x Bytes) CompareSameType(op syntax.Token, y_ starlark.Value, depth int) (bool, error) {
- y := y_.(Bytes)
- cmp := strings.Compare(string(x), string(y))
- switch op {
- case syntax.EQL:
- return cmp == 0, nil
- case syntax.NEQ:
- return cmp != 0, nil
- case syntax.LE:
- return cmp <= 0, nil
- case syntax.LT:
- return cmp < 0, nil
- case syntax.GE:
- return cmp >= 0, nil
- case syntax.GT:
- return cmp > 0, nil
- }
- panic(op)
-}
diff --git a/starlark/eval.go b/starlark/eval.go
index c9bbb67..d0ad91f 100644
--- a/starlark/eval.go
+++ b/starlark/eval.go
@@ -478,6 +478,8 @@ func makeToplevelFunction(prog *compile.Program, predeclared StringDict) *Functi
v = MakeBigInt(c)
case string:
v = String(c)
+ case compile.Bytes:
+ v = Bytes(c)
case float64:
v = Float(c)
default:
@@ -796,6 +798,8 @@ func Binary(op syntax.Token, x, y Value) (Value, error) {
return xf * y, nil
case String:
return stringRepeat(y, x)
+ case Bytes:
+ return bytesRepeat(y, x)
case *List:
elems, err := tupleRepeat(Tuple(y.elems), x)
if err != nil {
@@ -820,6 +824,10 @@ func Binary(op syntax.Token, x, y Value) (Value, error) {
if y, ok := y.(Int); ok {
return stringRepeat(x, y)
}
+ case Bytes:
+ if y, ok := y.(Int); ok {
+ return bytesRepeat(x, y)
+ }
case *List:
if y, ok := y.(Int); ok {
elems, err := tupleRepeat(Tuple(x.elems), y)
@@ -996,6 +1004,19 @@ func Binary(op syntax.Token, x, y Value) (Value, error) {
return nil, fmt.Errorf("'in <string>' requires string as left operand, not %s", x.Type())
}
return Bool(strings.Contains(string(y), string(needle))), nil
+ case Bytes:
+ switch needle := x.(type) {
+ case Bytes:
+ return Bool(strings.Contains(string(y), string(needle))), nil
+ case Int:
+ var b byte
+ if err := AsInt(needle, &b); err != nil {
+ return nil, fmt.Errorf("int in bytes: %s", err)
+ }
+ return Bool(strings.IndexByte(string(y), b) >= 0), nil
+ default:
+ return nil, fmt.Errorf("'in bytes' requires bytes or int as left operand, not %s", x.Type())
+ }
case rangeValue:
i, err := NumberToInt(x)
if err != nil {
@@ -1138,6 +1159,11 @@ func tupleRepeat(elems Tuple, n Int) (Tuple, error) {
return res, nil
}
+func bytesRepeat(b Bytes, n Int) (Bytes, error) {
+ res, err := stringRepeat(String(b), n)
+ return Bytes(res), err
+}
+
func stringRepeat(s String, n Int) (String, error) {
if s == "" {
return "", nil
diff --git a/starlark/eval_test.go b/starlark/eval_test.go
index 4ce08d3..9752fe8 100644
--- a/starlark/eval_test.go
+++ b/starlark/eval_test.go
@@ -115,6 +115,7 @@ func TestExecFile(t *testing.T) {
"testdata/assign.star",
"testdata/bool.star",
"testdata/builtins.star",
+ "testdata/bytes.star",
"testdata/control.star",
"testdata/dict.star",
"testdata/float.star",
diff --git a/starlark/hashtable.go b/starlark/hashtable.go
index d425019..27990b5 100644
--- a/starlark/hashtable.go
+++ b/starlark/hashtable.go
@@ -362,9 +362,9 @@ func hashString(s string) uint32 {
//go:linkname goStringHash runtime.stringHash
func goStringHash(s string, seed uintptr) uintptr
-// softHashString computes the FNV hash of s in software.
+// softHashString computes the 32-bit FNV-1a hash of s in software.
func softHashString(s string) uint32 {
- var h uint32
+ var h uint32 = 2166136261
for i := 0; i < len(s); i++ {
h ^= uint32(s[i])
h *= 16777619
diff --git a/starlark/library.go b/starlark/library.go
index 5645418..5620426 100644
--- a/starlark/library.go
+++ b/starlark/library.go
@@ -42,6 +42,7 @@ func init() {
"any": NewBuiltin("any", any),
"all": NewBuiltin("all", all),
"bool": NewBuiltin("bool", bool_),
+ "bytes": NewBuiltin("bytes", bytes_),
"chr": NewBuiltin("chr", chr),
"dict": NewBuiltin("dict", dict),
"dir": NewBuiltin("dir", dir),
@@ -73,6 +74,10 @@ func init() {
// methods of built-in types
// https://github.com/google/starlark-go/blob/master/doc/spec.md#built-in-methods
var (
+ bytesMethods = map[string]*Builtin{
+ "elems": NewBuiltin("elems", bytes_elems),
+ }
+
dictMethods = map[string]*Builtin{
"clear": NewBuiltin("clear", dict_clear),
"get": NewBuiltin("get", dict_get),
@@ -198,6 +203,45 @@ func bool_(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error
return x.Truth(), nil
}
+// https://github.com/google/starlark-go/blob/master/doc/spec.md#bytes
+func bytes_(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error) {
+ if len(kwargs) > 0 {
+ return nil, fmt.Errorf("bytes does not accept keyword arguments")
+ }
+ if len(args) != 1 {
+ return nil, fmt.Errorf("bytes: got %d arguments, want exactly 1", len(args))
+ }
+ switch x := args[0].(type) {
+ case Bytes:
+ return x, nil
+ case String:
+ // Invalid encodings are replaced by that of U+FFFD.
+ return Bytes(utf8Transcode(string(x))), nil
+ case Iterable:
+ // iterable of numeric byte values
+ var buf strings.Builder
+ if n := Len(x); n >= 0 {
+ // common case: known length
+ buf.Grow(n)
+ }
+ iter := x.Iterate()
+ defer iter.Done()
+ var elem Value
+ var b byte
+ for i := 0; iter.Next(&elem); i++ {
+ if err := AsInt(elem, &b); err != nil {
+ return nil, fmt.Errorf("bytes: at index %d, %s", i, err)
+ }
+ buf.WriteByte(b)
+ }
+ return Bytes(buf.String()), nil
+
+ default:
+ // Unlike string(foo), which stringifies it, bytes(foo) is an error.
+ return nil, fmt.Errorf("bytes: got %s, want string, bytes, or iterable of ints", x.Type())
+ }
+}
+
// https://github.com/google/starlark-go/blob/master/doc/spec.md#chr
func chr(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error) {
if len(kwargs) > 0 {
@@ -261,9 +305,6 @@ func enumerate(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, e
}
iter := iterable.Iterate()
- if iter == nil {
- return nil, fmt.Errorf("enumerate: got %s, want iterable", iterable.Type())
- }
defer iter.Done()
var pairs []Value
@@ -433,19 +474,27 @@ func hasattr(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, err
// https://github.com/google/starlark-go/blob/master/doc/spec.md#hash
func hash(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error) {
- var s string
- if err := UnpackPositionalArgs("hash", args, kwargs, 1, &s); err != nil {
+ var x Value
+ if err := UnpackPositionalArgs("hash", args, kwargs, 1, &x); err != nil {
return nil, err
}
- // The Starlark spec requires that the hash function be
- // deterministic across all runs, motivated by the need
- // for reproducibility of builds. Thus we cannot call
- // String.Hash, which uses the fastest implementation
- // available, because as varies across process restarts,
- // and may evolve with the implementation.
-
- return MakeInt(int(javaStringHash(s))), nil
+ var h int
+ switch x := x.(type) {
+ case String:
+ // The Starlark spec requires that the hash function be
+ // deterministic across all runs, motivated by the need
+ // for reproducibility of builds. Thus we cannot call
+ // String.Hash, which uses the fastest implementation
+ // available, because it varies across process restarts,
+ // and may evolve with the implementation.
+ h = int(javaStringHash(string(x)))
+ case Bytes:
+ h = int(softHashString(string(x))) // FNV32
+ default:
+ return nil, fmt.Errorf("hash: got %s, want string or bytes", x.Type())
+ }
+ return MakeInt(h), nil
}
// javaStringHash returns the same hash as would be produced by
@@ -691,16 +740,26 @@ func ord(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error)
if len(args) != 1 {
return nil, fmt.Errorf("ord: got %d arguments, want 1", len(args))
}
- s, ok := AsString(args[0])
- if !ok {
- return nil, fmt.Errorf("ord: got %s, want string", args[0].Type())
- }
- r, sz := utf8.DecodeRuneInString(s)
- if sz == 0 || sz != len(s) {
- n := utf8.RuneCountInString(s)
- return nil, fmt.Errorf("ord: string encodes %d Unicode code points, want 1", n)
+ switch x := args[0].(type) {
+ case String:
+ // ord(string) returns int value of sole rune.
+ s := string(x)
+ r, sz := utf8.DecodeRuneInString(s)
+ if sz == 0 || sz != len(s) {
+ n := utf8.RuneCountInString(s)
+ return nil, fmt.Errorf("ord: string encodes %d Unicode code points, want 1", n)
+ }
+ return MakeInt(int(r)), nil
+
+ case Bytes:
+ // ord(bytes) returns int value of sole byte.
+ if len(x) != 1 {
+ return nil, fmt.Errorf("ord: bytes has length %d, want 1", len(x))
+ }
+ return MakeInt(int(x[0])), nil
+ default:
+ return nil, fmt.Errorf("ord: got %s, want string or bytes", x.Type())
}
- return MakeInt(int(r)), nil
}
// https://github.com/google/starlark-go/blob/master/doc/spec.md#print
@@ -716,6 +775,8 @@ func print(thread *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value, error
}
if s, ok := AsString(v); ok {
buf.WriteString(s)
+ } else if b, ok := v.(Bytes); ok {
+ buf.WriteString(string(b))
} else {
writeValue(buf, v, nil)
}
@@ -993,11 +1054,29 @@ func str(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error)
if len(args) != 1 {
return nil, fmt.Errorf("str: got %d arguments, want exactly 1", len(args))
}
- x := args[0]
- if _, ok := AsString(x); !ok {
- x = String(x.String())
+ switch x := args[0].(type) {
+ case String:
+ return x, nil
+ case Bytes:
+ // Invalid encodings are replaced by that of U+FFFD.
+ return String(utf8Transcode(string(x))), nil
+ default:
+ return String(x.String()), nil
}
- return x, nil
+}
+
+// utf8Transcode returns the UTF-8-to-UTF-8 transcoding of s.
+// The effect is that each code unit that is part of an
+// invalid sequence is replaced by U+FFFD.
+func utf8Transcode(s string) string {
+ if utf8.ValidString(s) {
+ return s
+ }
+ var out strings.Builder
+ for _, r := range s {
+ out.WriteRune(r)
+ }
+ return out.String()
}
// https://github.com/google/starlark-go/blob/master/doc/spec.md#tuple
@@ -1374,13 +1453,51 @@ func string_iterable(_ *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value,
if err := UnpackPositionalArgs(b.Name(), args, kwargs, 0); err != nil {
return nil, err
}
- return stringIterable{
- s: b.Receiver().(String),
- ords: b.Name()[len(b.Name())-2] == 'd',
- codepoints: b.Name()[0] == 'c',
- }, nil
+ s := b.Receiver().(String)
+ ords := b.Name()[len(b.Name())-2] == 'd'
+ codepoints := b.Name()[0] == 'c'
+ if codepoints {
+ return stringCodepoints{s, ords}, nil
+ } else {
+ return stringElems{s, ords}, nil
+ }
+}
+
+// bytes_elems returns an unspecified iterable value whose
+// iterator yields the int values of successive elements.
+func bytes_elems(_ *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value, error) {
+ if err := UnpackPositionalArgs(b.Name(), args, kwargs, 0); err != nil {
+ return nil, err
+ }
+ return bytesIterable{b.Receiver().(Bytes)}, nil
+}
+
+// A bytesIterable is an iterable returned by bytes.elems(),
+// whose iterator yields a sequence of numeric byte values.
+type bytesIterable struct{ bytes Bytes }
+
+var _ Iterable = (*bytesIterable)(nil)
+
+func (bi bytesIterable) String() string { return bi.bytes.String() + ".elems()" }
+func (bi bytesIterable) Type() string { return "bytes.elems" }
+func (bi bytesIterable) Freeze() {} // immutable
+func (bi bytesIterable) Truth() Bool { return True }
+func (bi bytesIterable) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", bi.Type()) }
+func (bi bytesIterable) Iterate() Iterator { return &bytesIterator{bi.bytes} }
+
+type bytesIterator struct{ bytes Bytes }
+
+func (it *bytesIterator) Next(p *Value) bool {
+ if it.bytes == "" {
+ return false
+ }
+ *p = MakeInt(int(it.bytes[0]))
+ it.bytes = it.bytes[1:]
+ return true
}
+func (*bytesIterator) Done() {}
+
// https://github.com/google/starlark-go/blob/master/doc/spec.md#string·count
func string_count(_ *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value, error) {
var sub string
diff --git a/starlark/testdata/bytes.star b/starlark/testdata/bytes.star
new file mode 100644
index 0000000..d500403
--- /dev/null
+++ b/starlark/testdata/bytes.star
@@ -0,0 +1,159 @@
+# Tests of 'bytes' (immutable byte strings).
+
+load("assert.star", "assert")
+
+# bytes(string) -- UTF-k to UTF-8 transcoding with U+FFFD replacement
+hello = bytes("hello, 世界")
+goodbye = bytes("goodbye")
+empty = bytes("")
+nonprinting = bytes("\t\n\x7F\u200D") # TAB, NEWLINE, DEL, ZERO_WIDTH_JOINER
+assert.eq(bytes("hello, 世界"[:-1]), b"hello, 世��")
+
+# bytes(iterable of int) -- construct from numeric byte values
+assert.eq(bytes([65, 66, 67]), b"ABC")
+assert.eq(bytes((65, 66, 67)), b"ABC")
+assert.eq(bytes([0xf0, 0x9f, 0x98, 0xbf]), b"😿")
+assert.fails(lambda: bytes([300]),
+ "at index 0, 300 out of range .want value in unsigned 8-bit range")
+assert.fails(lambda: bytes([b"a"]),
+ "at index 0, got bytes, want int")
+assert.fails(lambda: bytes(1), "want string, bytes, or iterable of ints")
+
+# literals
+assert.eq(b"hello, 世界", hello)
+assert.eq(b"goodbye", goodbye)
+assert.eq(b"", empty)
+assert.eq(b"\t\n\x7F\u200D", nonprinting)
+assert.ne("abc", b"abc")
+assert.eq(b"\012\xff\u0400\U0001F63F", b"\n\xffЀ😿") # see scanner tests for more
+assert.eq(rb"\r\n\t", b"\\r\\n\\t") # raw
+
+# type
+assert.eq(type(hello), "bytes")
+
+# len
+assert.eq(len(hello), 13)
+assert.eq(len(goodbye), 7)
+assert.eq(len(empty), 0)
+assert.eq(len(b"A"), 1)
+assert.eq(len(b"Ѐ"), 2)
+assert.eq(len(b"世"), 3)
+assert.eq(len(b"😿"), 4)
+
+# truth
+assert.true(hello)
+assert.true(goodbye)
+assert.true(not empty)
+
+# str(bytes) does UTF-8 to UTF-k transcoding.
+# TODO(adonovan): specify.
+assert.eq(str(hello), "hello, 世界")
+assert.eq(str(hello[:-1]), "hello, 世��") # incomplete UTF-8 encoding => U+FFFD
+assert.eq(str(goodbye), "goodbye")
+assert.eq(str(empty), "")
+assert.eq(str(nonprinting), "\t\n\x7f\u200d")
+assert.eq(str(b"\xED\xB0\x80"), "���") # UTF-8 encoding of unpaired surrogate => U+FFFD x 3
+
+# repr
+assert.eq(repr(hello), r'b"hello, 世界"')
+assert.eq(repr(hello[:-1]), r'b"hello, 世\xe7\x95"') # (incomplete UTF-8 encoding )
+assert.eq(repr(goodbye), 'b"goodbye"')
+assert.eq(repr(empty), 'b""')
+assert.eq(repr(nonprinting), 'b"\\t\\n\\x7f\\u200d"')
+
+# equality
+assert.eq(hello, hello)
+assert.ne(hello, goodbye)
+assert.eq(b"goodbye", goodbye)
+
+# ordered comparison
+assert.lt(b"abc", b"abd")
+assert.lt(b"abc", b"abcd")
+assert.lt(b"\x7f", b"\x80") # bytes compare as uint8, not int8
+
+# bytes are dict-hashable
+dict = {hello: 1, goodbye: 2}
+dict[b"goodbye"] = 3
+assert.eq(len(dict), 2)
+assert.eq(dict[goodbye], 3)
+
+# hash(bytes) is 32-bit FNV-1a.
+assert.eq(hash(b""), 0x811c9dc5)
+assert.eq(hash(b"a"), 0xe40c292c)
+assert.eq(hash(b"ab"), 0x4d2505ca)
+assert.eq(hash(b"abc"), 0x1a47e90b)
+
+# indexing
+assert.eq(goodbye[0], b"g")
+assert.eq(goodbye[-1], b"e")
+assert.fails(lambda: goodbye[100], "out of range")
+
+# slicing
+assert.eq(goodbye[:4], b"good")
+assert.eq(goodbye[4:], b"bye")
+assert.eq(goodbye[::2], b"gobe")
+assert.eq(goodbye[3:4], b"d") # special case: len=1
+assert.eq(goodbye[4:4], b"") # special case: len=0
+
+# bytes in bytes
+assert.eq(b"bc" in b"abcd", True)
+assert.eq(b"bc" in b"dcab", False)
+assert.fails(lambda: "bc" in b"dcab", "requires bytes or int as left operand, not string")
+
+# int in bytes
+assert.eq(97 in b"abc", True) # 97='a'
+assert.eq(100 in b"abc", False) # 100='d'
+assert.fails(lambda: 256 in b"abc", "int in bytes: 256 out of range")
+assert.fails(lambda: -1 in b"abc", "int in bytes: -1 out of range")
+
+# ord TODO(adonovan): specify
+assert.eq(ord(b"a"), 97)
+assert.fails(lambda: ord(b"ab"), "ord: bytes has length 2, want 1")
+assert.fails(lambda: ord(b""), "ord: bytes has length 0, want 1")
+
+# repeat (bytes * int)
+assert.eq(goodbye * 3, b"goodbyegoodbyegoodbye")
+assert.eq(3 * goodbye, b"goodbyegoodbyegoodbye")
+
+# elems() returns an iterable value over the numeric values of successive bytes.
+assert.eq(type(hello.elems()), "bytes.elems")
+assert.eq(str(hello.elems()), "b\"hello, 世界\".elems()")
+assert.eq(list(hello.elems()), [104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140])
+assert.eq(bytes([104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140]), hello)
+assert.eq(list(goodbye.elems()), [103, 111, 111, 100, 98, 121, 101])
+assert.eq(list(empty.elems()), [])
+assert.eq(bytes(hello.elems()), hello) # bytes(iterable) is dual to bytes.elems()
+
+# x[i] = ...
+def f():
+ b"abc"[1] = b"B"
+
+assert.fails(f, "bytes.*does not support.*assignment")
+
+# TODO(adonovan): the specification is not finalized in many areas:
+# - chr, ord functions
+# - encoding/decoding bytes to string.
+# - methods: find, index, split, etc.
+#
+# Summary of string operations (put this in spec).
+#
+# string to number:
+# - bytes[i] returns numeric value of ith byte.
+# - ord(string) returns numeric value of sole code point in string.
+# - ord(string[i]) is not a useful operation: fails on non-ASCII; see below.
+# Q. Perhaps ord should return the first (not sole) code point? Then it becomes a UTF-8 decoder.
+# Perhaps ord(string, index=int) should apply the index and relax the len=1 check.
+# - string.codepoints() iterates over 1-codepoint substrings.
+# - string.codepoint_ords() iterates over numeric values of code points in string.
+# - string.elems() iterates over 1-element (UTF-k code) substrings.
+# - string.elem_ords() iterates over numeric UTF-k code values.
+# - string.elem_ords()[i] returns numeric value of ith element (UTF-k code).
+# - string.elems()[i] returns substring of a single element (UTF-k code).
+# - int(string) parses string as decimal (or other) numeric literal.
+#
+# number to string:
+# - chr(int) returns string, UTF-k encoding of Unicode code point (like Python).
+# Redundant with '%c' % int (which Python2 calls 'unichr'.)
+# - bytes(chr(int)) returns byte string containing UTF-8 encoding of one code point.
+# - bytes([int]) returns 1-byte string (with regrettable list allocation).
+# - str(int) - format number as decimal.
diff --git a/starlark/testdata/json.star b/starlark/testdata/json.star
index ef33d91..7c7b316 100644
--- a/starlark/testdata/json.star
+++ b/starlark/testdata/json.star
@@ -23,7 +23,7 @@ assert.eq(json.encode(range(3)), "[0,1,2]") # a built-in iterable
assert.eq(json.encode(dict(x = 1, y = "two")), '{"x":1,"y":"two"}')
assert.eq(json.encode(dict(y = "two", x = 1)), '{"x":1,"y":"two"}') # key, not insertion, order
assert.eq(json.encode(struct(x = 1, y = "two")), '{"x":1,"y":"two"}') # a user-defined HasAttrs
-assert.eq(json.encode("\x80"), '"\\ufffd"') # invalid UTF-8 -> replacement char
+assert.eq(json.encode("😹"[:1]), '"\\ufffd"') # invalid UTF-8 -> replacement char
def encode_error(expr, error):
assert.fails(lambda: json.encode(expr), error)
diff --git a/starlark/testdata/string.star b/starlark/testdata/string.star
index 84c6791..b317d1a 100644
--- a/starlark/testdata/string.star
+++ b/starlark/testdata/string.star
@@ -37,8 +37,9 @@ assert.eq(chr(1049), "Й") # 2-byte UTF-8 encoding
assert.eq(chr(0x1F63F), "😿") # 4-byte UTF-8 encoding
assert.fails(lambda: chr(-1), "Unicode code point -1 out of range \\(<0\\)")
assert.fails(lambda: chr(0x110000), "Unicode code point U\\+110000 out of range \\(>0x10FFFF\\)")
-assert.eq(ord("A"), 65)
-assert.eq(ord("Й"), 1049)
+assert.eq(ord("A"), 0x41)
+assert.eq(ord("Й"), 0x419)
+assert.eq(ord("世"), 0x4e16)
assert.eq(ord("😿"), 0x1F63F)
assert.eq(ord("Й"[1:]), 0xFFFD) # = Unicode replacement character
assert.fails(lambda: ord("abc"), "string encodes 3 Unicode code points, want 1")
@@ -46,42 +47,50 @@ assert.fails(lambda: ord(""), "string encodes 0 Unicode code points, want 1")
assert.fails(lambda: ord("😿"[1:]), "string encodes 3 Unicode code points, want 1") # 3 x 0xFFFD
# string.codepoint_ords
-assert.eq(type("abcЙ😿".codepoint_ords()), "codepoints")
+assert.eq(type("abcЙ😿".codepoint_ords()), "string.codepoints")
assert.eq(str("abcЙ😿".codepoint_ords()), '"abcЙ😿".codepoint_ords()')
assert.eq(list("abcЙ😿".codepoint_ords()), [97, 98, 99, 1049, 128575])
assert.eq(list(("A" + "😿Z"[1:]).codepoint_ords()), [ord("A"), 0xFFFD, 0xFFFD, 0xFFFD, ord("Z")])
assert.eq(list("".codepoint_ords()), [])
+assert.fails(lambda: "abcЙ😿".codepoint_ords()[2], "unhandled index") # not indexable
+assert.fails(lambda: len("abcЙ😿".codepoint_ords()), "no len") # unknown length
# string.codepoints
-assert.eq(type("abcЙ😿".codepoints()), "codepoints")
+assert.eq(type("abcЙ😿".codepoints()), "string.codepoints")
assert.eq(str("abcЙ😿".codepoints()), '"abcЙ😿".codepoints()')
assert.eq(list("abcЙ😿".codepoints()), ["a", "b", "c", "Й", "😿"])
-assert.eq(list(("A" + "😿Z"[1:]).codepoints()), ["A", "\x9f", "\x98", "\xbf", "Z"])
+assert.eq(list(("A" + "😿Z"[1:]).codepoints()), ["A", "�", "�", "�", "Z"])
assert.eq(list("".codepoints()), [])
+assert.fails(lambda: "abcЙ😿".codepoints()[2], "unhandled index") # not indexable
+assert.fails(lambda: len("abcЙ😿".codepoints()), "no len") # unknown length
# string.elem_ords
-assert.eq(type("abcЙ😿".elem_ords()), "elems")
+assert.eq(type("abcЙ😿".elem_ords()), "string.elems")
assert.eq(str("abcЙ😿".elem_ords()), '"abcЙ😿".elem_ords()')
assert.eq(list("abcЙ😿".elem_ords()), [97, 98, 99, 208, 153, 240, 159, 152, 191])
assert.eq(list(("A" + "😿Z"[1:]).elem_ords()), [65, 159, 152, 191, 90])
assert.eq(list("".elem_ords()), [])
+assert.eq("abcЙ😿".elem_ords()[2], 99) # indexable
+assert.eq(len("abcЙ😿".elem_ords()), 9) # known length
-# string.elems
-assert.eq(type("abcЙ😿".elems()), "elems")
+# string.elems (1-byte substrings, which are invalid text)
+assert.eq(type("abcЙ😿".elems()), "string.elems")
assert.eq(str("abcЙ😿".elems()), '"abcЙ😿".elems()')
assert.eq(
- list("abcЙ😿".elems()),
- ["a", "b", "c", "\xd0", "\x99", "\xf0", "\x9f", "\x98", "\xbf"],
+ repr(list("abcЙ😿".elems())),
+ r'["a", "b", "c", "\xd0", "\x99", "\xf0", "\x9f", "\x98", "\xbf"]',
)
assert.eq(
- list(("A" + "😿Z"[1:]).elems()),
- ["A", "\x9f", "\x98", "\xbf", "Z"],
+ repr(list(("A" + "😿Z"[1:]).elems())),
+ r'["A", "\x9f", "\x98", "\xbf", "Z"]',
)
assert.eq(list("".elems()), [])
+assert.eq("abcЙ😿".elems()[2], "c") # indexable
+assert.eq(len("abcЙ😿".elems()), 9) # known length
# indexing, x[i]
assert.eq("Hello, 世界!"[0], "H")
-assert.eq("Hello, 世界!"[7], "\xe4")
+assert.eq(repr("Hello, 世界!"[7]), r'"\xe4"') # (invalid text)
assert.eq("Hello, 世界!"[13], "!")
assert.fails(lambda: "abc"[-4], "out of range")
assert.eq("abc"[-3], "a")
@@ -93,10 +102,8 @@ assert.eq("abc"[2], "c")
assert.fails(lambda: "abc"[4], "out of range")
# x[i] = ...
-x2 = "abc"
-
def f():
- x2[1] = "B"
+ "abc"[1] = "B"
assert.fails(f, "string.*does not support.*assignment")
@@ -122,6 +129,7 @@ assert.eq("abc"[:3], "abc")
assert.eq("abc"[:4], "abc")
assert.eq("abc"[1:2], "b")
assert.eq("abc"[2:1], "")
+assert.eq(repr("😿"[:1]), r'"\xf0"') # (invalid text)
# non-unit strides
assert.eq("abcd"[0:4:1], "abcd")
diff --git a/starlark/value.go b/starlark/value.go
index bcec750..81e29ed 100644
--- a/starlark/value.go
+++ b/starlark/value.go
@@ -499,13 +499,20 @@ func (f Float) Unary(op syntax.Token) (Value, error) {
return nil, nil
}
-// String is the type of a Starlark string.
+// String is the type of a Starlark text string.
//
// A String encapsulates an an immutable sequence of bytes,
// but strings are not directly iterable. Instead, iterate
// over the result of calling one of these four methods:
// codepoints, codepoint_ords, elems, elem_ords.
//
+// Strings typically contain text; use Bytes for binary strings.
+// The Starlark spec defines text strings as sequences of UTF-k
+// codes that encode Unicode code points. In this Go implementation,
+// k=8, whereas in a Java implementation, k=16. For portability,
+// operations on strings should aim to avoid assumptions about
+// the value of k.
+//
// Warning: the contract of the Value interface's String method is that
// it returns the value printed in Starlark notation,
// so s.String() or fmt.Sprintf("%s", s) returns a quoted string.
@@ -513,7 +520,7 @@ func (f Float) Unary(op syntax.Token) (Value, error) {
// of a Starlark string as a Go string.
type String string
-func (s String) String() string { return strconv.Quote(string(s)) }
+func (s String) String() string { return syntax.Quote(string(s), false) }
func (s String) GoString() string { return string(s) }
func (s String) Type() string { return "string" }
func (s String) Freeze() {} // immutable
@@ -545,73 +552,106 @@ func (x String) CompareSameType(op syntax.Token, y_ Value, depth int) (bool, err
func AsString(x Value) (string, bool) { v, ok := x.(String); return string(v), ok }
-// A stringIterable is an iterable whose iterator yields a sequence of
-// either Unicode code points or elements (bytes),
-// either numerically or as successive substrings.
-type stringIterable struct {
- s String
- ords bool
- codepoints bool
+// A stringElems is an iterable whose iterator yields a sequence of
+// elements (bytes), either numerically or as successive substrings.
+// It is an indexable sequence.
+type stringElems struct {
+ s String
+ ords bool
}
-var _ Iterable = (*stringIterable)(nil)
+var (
+ _ Iterable = (*stringElems)(nil)
+ _ Indexable = (*stringElems)(nil)
+)
-func (si stringIterable) String() string {
- var etype string
- if si.codepoints {
- etype = "codepoint"
+func (si stringElems) String() string {
+ if si.ords {
+ return si.s.String() + ".elem_ords()"
} else {
- etype = "elem"
+ return si.s.String() + ".elems()"
}
+}
+func (si stringElems) Type() string { return "string.elems" }
+func (si stringElems) Freeze() {} // immutable
+func (si stringElems) Truth() Bool { return True }
+func (si stringElems) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", si.Type()) }
+func (si stringElems) Iterate() Iterator { return &stringElemsIterator{si, 0} }
+func (si stringElems) Len() int { return len(si.s) }
+func (si stringElems) Index(i int) Value {
if si.ords {
- return si.s.String() + "." + etype + "_ords()"
+ return MakeInt(int(si.s[i]))
} else {
- return si.s.String() + "." + etype + "s()"
+ // TODO(adonovan): opt: preallocate canonical 1-byte strings
+ // to avoid interface allocation.
+ return si.s[i : i+1]
+ }
+}
+
+type stringElemsIterator struct {
+ si stringElems
+ i int
+}
+
+func (it *stringElemsIterator) Next(p *Value) bool {
+ if it.i == len(it.si.s) {
+ return false
}
+ *p = it.si.Index(it.i)
+ it.i++
+ return true
+}
+
+func (*stringElemsIterator) Done() {}
+
+// A stringCodepoints is an iterable whose iterator yields a sequence of
+// Unicode code points, either numerically or as successive substrings.
+// It is not indexable.
+type stringCodepoints struct {
+ s String
+ ords bool
}
-func (si stringIterable) Type() string {
- if si.codepoints {
- return "codepoints"
+
+var _ Iterable = (*stringCodepoints)(nil)
+
+func (si stringCodepoints) String() string {
+ if si.ords {
+ return si.s.String() + ".codepoint_ords()"
} else {
- return "elems"
+ return si.s.String() + ".codepoints()"
}
}
-func (si stringIterable) Freeze() {} // immutable
-func (si stringIterable) Truth() Bool { return True }
-func (si stringIterable) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", si.Type()) }
-func (si stringIterable) Iterate() Iterator { return &stringIterator{si, 0} }
+func (si stringCodepoints) Type() string { return "string.codepoints" }
+func (si stringCodepoints) Freeze() {} // immutable
+func (si stringCodepoints) Truth() Bool { return True }
+func (si stringCodepoints) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", si.Type()) }
+func (si stringCodepoints) Iterate() Iterator { return &stringCodepointsIterator{si, 0} }
-type stringIterator struct {
- si stringIterable
+type stringCodepointsIterator struct {
+ si stringCodepoints
i int
}
-func (it *stringIterator) Next(p *Value) bool {
+func (it *stringCodepointsIterator) Next(p *Value) bool {
s := it.si.s[it.i:]
if s == "" {
return false
}
- if it.si.codepoints {
- r, sz := utf8.DecodeRuneInString(string(s))
- if !it.si.ords {
- *p = s[:sz]
+ r, sz := utf8.DecodeRuneInString(string(s))
+ if !it.si.ords {
+ if r == utf8.RuneError {
+ *p = String(r)
} else {
- *p = MakeInt(int(r))
+ *p = s[:sz]
}
- it.i += sz
} else {
- b := int(s[0])
- if !it.si.ords {
- *p = s[:1]
- } else {
- *p = MakeInt(b)
- }
- it.i += 1
+ *p = MakeInt(int(r))
}
+ it.i += sz
return true
}
-func (*stringIterator) Done() {}
+func (*stringCodepointsIterator) Done() {}
// A Function is a function defined by a Starlark def statement or lambda expression.
// The initialization behavior of a Starlark module is also represented by a Function.
@@ -1084,6 +1124,7 @@ func writeValue(out *strings.Builder, x Value, path []Value) {
case nil:
out.WriteString("<nil>") // indicates a bug
+ // These four cases are duplicates of T.String(), for efficiency.
case NoneType:
out.WriteString("None")
@@ -1098,7 +1139,7 @@ func writeValue(out *strings.Builder, x Value, path []Value) {
}
case String:
- fmt.Fprintf(out, "%q", string(x))
+ out.WriteString(syntax.Quote(string(x), false))
case *List:
out.WriteByte('[')
@@ -1318,6 +1359,8 @@ func Len(x Value) int {
switch x := x.(type) {
case String:
return x.Len()
+ case Indexable:
+ return x.Len()
case Sequence:
return x.Len()
}
@@ -1335,3 +1378,54 @@ func Iterate(x Value) Iterator {
}
return nil
}
+
+// Bytes is the type of a Starlark binary string.
+//
+// A Bytes encapsulates an immutable sequence of bytes.
+// It is comparable, indexable, and sliceable, but not directly iterable;
+// use bytes.elems() for an iterable view.
+//
+// In this Go implementation, the elements of 'string' and 'bytes' are
+// both bytes, but in other implementations, notably Java, the elements
+// of a 'string' are UTF-16 codes (Java chars). The spec abstracts text
+// strings as sequences of UTF-k codes that encode Unicode code points,
+// and operations that convert from text to binary incur UTF-k-to-UTF-8
+// transcoding; conversely, conversion from binary to text incurs
+// UTF-8-to-UTF-k transcoding. Because k=8 for Go, these operations
+// are the identity function, at least for valid encodings of text.
+type Bytes string
+
+var (
+ _ Comparable = Bytes("")
+ _ Sliceable = Bytes("")
+ _ Indexable = Bytes("")
+)
+
+func (b Bytes) String() string { return syntax.Quote(string(b), true) }
+func (b Bytes) Type() string { return "bytes" }
+func (b Bytes) Freeze() {} // immutable
+func (b Bytes) Truth() Bool { return len(b) > 0 }
+func (b Bytes) Hash() (uint32, error) { return String(b).Hash() }
+func (b Bytes) Len() int { return len(b) }
+func (b Bytes) Index(i int) Value { return b[i : i+1] }
+
+func (b Bytes) Attr(name string) (Value, error) { return builtinAttr(b, name, bytesMethods) }
+func (b Bytes) AttrNames() []string { return builtinAttrNames(bytesMethods) }
+
+func (b Bytes) Slice(start, end, step int) Value {
+ if step == 1 {
+ return b[start:end]
+ }
+
+ sign := signum(step)
+ var str []byte
+ for i := start; signum(end-i) == sign; i += step {
+ str = append(str, b[i])
+ }
+ return Bytes(str)
+}
+
+func (x Bytes) CompareSameType(op syntax.Token, y_ Value, depth int) (bool, error) {
+ y := y_.(Bytes)
+ return threeway(op, strings.Compare(string(x), string(y))), nil
+}
diff --git a/syntax/parse.go b/syntax/parse.go
index 50b8087..f4c8fff 100644
--- a/syntax/parse.go
+++ b/syntax/parse.go
@@ -771,8 +771,7 @@ func (p *parser) parseArgs() []Expr {
}
// primary = IDENT
-// | INT | FLOAT
-// | STRING
+// | INT | FLOAT | STRING | BYTES
// | '[' ... // list literal or comprehension
// | '{' ... // dict literal or comprehension
// | '(' ... // tuple or parenthesized expression
@@ -782,7 +781,7 @@ func (p *parser) parsePrimary() Expr {
case IDENT:
return p.parseIdent()
- case INT, FLOAT, STRING:
+ case INT, FLOAT, STRING, BYTES:
var val interface{}
tok := p.tok
switch tok {
@@ -794,7 +793,7 @@ func (p *parser) parsePrimary() Expr {
}
case FLOAT:
val = p.tokval.float
- case STRING:
+ case STRING, BYTES:
val = p.tokval.string
}
raw := p.tokval.raw
diff --git a/syntax/parse_test.go b/syntax/parse_test.go
index 6052e79..fedbb3e 100644
--- a/syntax/parse_test.go
+++ b/syntax/parse_test.go
@@ -361,9 +361,12 @@ func writeTree(out *bytes.Buffer, x reflect.Value) {
case reflect.Struct:
switch v := x.Interface().(type) {
case syntax.Literal:
- if v.Token == syntax.STRING {
+ switch v.Token {
+ case syntax.STRING:
fmt.Fprintf(out, "%q", v.Value)
- } else if v.Token == syntax.INT {
+ case syntax.BYTES:
+ fmt.Fprintf(out, "b%q", v.Value)
+ case syntax.INT:
fmt.Fprintf(out, "%d", v.Value)
}
return
diff --git a/syntax/quote.go b/syntax/quote.go
index 49cb259..741e106 100644
--- a/syntax/quote.go
+++ b/syntax/quote.go
@@ -10,6 +10,8 @@ import (
"fmt"
"strconv"
"strings"
+ "unicode"
+ "unicode/utf8"
)
// unesc maps single-letter chars following \ to their actual values.
@@ -41,15 +43,20 @@ var esc = [256]byte{
}
// unquote unquotes the quoted string, returning the actual
-// string value, whether the original was triple-quoted, and
-// an error describing invalid input.
-func unquote(quoted string) (s string, triple bool, err error) {
+// string value, whether the original was triple-quoted,
+// whether it was a byte string, and an error describing invalid input.
+func unquote(quoted string) (s string, triple, isByte bool, err error) {
// Check for raw prefix: means don't interpret the inner \.
raw := false
if strings.HasPrefix(quoted, "r") {
raw = true
quoted = quoted[1:]
}
+ // Check for bytes prefix.
+ if strings.HasPrefix(quoted, "b") {
+ isByte = true
+ quoted = quoted[1:]
+ }
if len(quoted) < 2 {
err = fmt.Errorf("string literal too short")
@@ -138,7 +145,7 @@ func unquote(quoted string) (s string, triple bool, err error) {
quoted = quoted[2:]
case '0', '1', '2', '3', '4', '5', '6', '7':
- // Octal escape, up to 3 digits.
+ // Octal escape, up to 3 digits, \OOO.
n := int(quoted[1] - '0')
quoted = quoted[2:]
for i := 1; i < 3; i++ {
@@ -148,6 +155,10 @@ func unquote(quoted string) (s string, triple bool, err error) {
n = n*8 + int(quoted[0]-'0')
quoted = quoted[1:]
}
+ if !isByte && n > 127 {
+ err = fmt.Errorf(`non-ASCII octal escape \%o (use \u%04X for the UTF-8 encoding of U+%04X)`, n, n, n)
+ return
+ }
if n >= 256 {
// NOTE: Python silently discards the high bit,
// so that '\541' == '\141' == 'a'.
@@ -158,7 +169,7 @@ func unquote(quoted string) (s string, triple bool, err error) {
buf.WriteByte(byte(n))
case 'x':
- // Hexadecimal escape, exactly 2 digits.
+ // Hexadecimal escape, exactly 2 digits, \xXX. [0-127]
if len(quoted) < 4 {
err = fmt.Errorf(`truncated escape sequence %s`, quoted)
return
@@ -168,8 +179,41 @@ func unquote(quoted string) (s string, triple bool, err error) {
err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
return
}
+ if !isByte && n > 127 {
+ err = fmt.Errorf(`non-ASCII hex escape %s (use \u%04X for the UTF-8 encoding of U+%04X)`,
+ quoted[:4], n, n)
+ return
+ }
buf.WriteByte(byte(n))
quoted = quoted[4:]
+
+ case 'u', 'U':
+ // Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits.
+ sz := 6
+ if quoted[1] == 'U' {
+ sz = 10
+ }
+ if len(quoted) < sz {
+ err = fmt.Errorf(`truncated escape sequence %s`, quoted)
+ return
+ }
+ n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0)
+ if err1 != nil {
+ err = fmt.Errorf(`invalid escape sequence %s`, quoted[:sz])
+ return
+ }
+ if n > unicode.MaxRune {
+ err = fmt.Errorf(`code point out of range: %s (max \U%08x)`,
+ quoted[:sz], n)
+ return
+ }
+ // As in Go, surrogates are disallowed.
+ if 0xD800 <= n && n < 0xE000 {
+ err = fmt.Errorf(`invalid Unicode code point U+%04X`, n)
+ return
+ }
+ buf.WriteRune(rune(n))
+ quoted = quoted[sz:]
}
}
@@ -187,67 +231,79 @@ func indexByte(s string, b byte) int {
return -1
}
-// hex is a list of the hexadecimal digits, for use in quoting.
-// We always print lower-case hexadecimal.
-const hex = "0123456789abcdef"
+// Quote returns a Starlark literal that denotes s.
+// If b, it returns a bytes literal.
+func Quote(s string, b bool) string {
+ const hex = "0123456789abcdef"
+ var runeTmp [utf8.UTFMax]byte
-// quote returns the quoted form of the string value "x".
-// If triple is true, quote uses the triple-quoted form """x""".
-func quote(unquoted string, triple bool) string {
- q := `"`
- if triple {
- q = `"""`
+ buf := make([]byte, 0, 3*len(s)/2)
+ if b {
+ buf = append(buf, 'b')
}
-
- buf := new(strings.Builder)
- buf.WriteString(q)
-
- for i := 0; i < len(unquoted); i++ {
- c := unquoted[i]
- if c == '"' && triple && (i+1 < len(unquoted) && unquoted[i+1] != '"' || i+2 < len(unquoted) && unquoted[i+2] != '"') {
- // Can pass up to two quotes through, because they are followed by a non-quote byte.
- buf.WriteByte(c)
- if i+1 < len(unquoted) && unquoted[i+1] == '"' {
- buf.WriteByte(c)
- i++
- }
- continue
+ buf = append(buf, '"')
+ for width := 0; len(s) > 0; s = s[width:] {
+ r := rune(s[0])
+ width = 1
+ if r >= utf8.RuneSelf {
+ r, width = utf8.DecodeRuneInString(s)
}
- if triple && c == '\n' {
- // Can allow newline in triple-quoted string.
- buf.WriteByte(c)
+ if width == 1 && r == utf8.RuneError {
+ // String (!b) literals accept \xXX escapes only for ASCII,
+ // but we must use them here to represent invalid bytes.
+ // The result is not a legal literal.
+ buf = append(buf, `\x`...)
+ buf = append(buf, hex[s[0]>>4])
+ buf = append(buf, hex[s[0]&0xF])
continue
}
- if c == '\'' {
- // Can allow ' since we always use ".
- buf.WriteByte(c)
+ if r == '"' || r == '\\' { // always backslashed
+ buf = append(buf, '\\')
+ buf = append(buf, byte(r))
continue
}
- if esc[c] != 0 {
- buf.WriteByte('\\')
- buf.WriteByte(esc[c])
+ if strconv.IsPrint(r) {
+ n := utf8.EncodeRune(runeTmp[:], r)
+ buf = append(buf, runeTmp[:n]...)
continue
}
- if c < 0x20 || c >= 0x80 {
- // BUILD files are supposed to be Latin-1, so escape all control and high bytes.
- // I'd prefer to use \x here, but Blaze does not implement
- // \x in quoted strings (b/7272572).
- buf.WriteByte('\\')
- buf.WriteByte(hex[c>>6]) // actually octal but reusing hex digits 0-7.
- buf.WriteByte(hex[(c>>3)&7])
- buf.WriteByte(hex[c&7])
- /*
- buf.WriteByte('\\')
- buf.WriteByte('x')
- buf.WriteByte(hex[c>>4])
- buf.WriteByte(hex[c&0xF])
- */
- continue
+ switch r {
+ case '\a':
+ buf = append(buf, `\a`...)
+ case '\b':
+ buf = append(buf, `\b`...)
+ case '\f':
+ buf = append(buf, `\f`...)
+ case '\n':
+ buf = append(buf, `\n`...)
+ case '\r':
+ buf = append(buf, `\r`...)
+ case '\t':
+ buf = append(buf, `\t`...)
+ case '\v':
+ buf = append(buf, `\v`...)
+ default:
+ switch {
+ case r < ' ' || r == 0x7f:
+ buf = append(buf, `\x`...)
+ buf = append(buf, hex[byte(r)>>4])
+ buf = append(buf, hex[byte(r)&0xF])
+ case r > utf8.MaxRune:
+ r = 0xFFFD
+ fallthrough
+ case r < 0x10000:
+ buf = append(buf, `\u`...)
+ for s := 12; s >= 0; s -= 4 {
+ buf = append(buf, hex[r>>uint(s)&0xF])
+ }
+ default:
+ buf = append(buf, `\U`...)
+ for s := 28; s >= 0; s -= 4 {
+ buf = append(buf, hex[r>>uint(s)&0xF])
+ }
+ }
}
- buf.WriteByte(c)
- continue
}
-
- buf.WriteString(q)
- return buf.String()
+ buf = append(buf, '"')
+ return string(buf)
}
diff --git a/syntax/quote_test.go b/syntax/quote_test.go
index f9068ee..be7498b 100644
--- a/syntax/quote_test.go
+++ b/syntax/quote_test.go
@@ -22,17 +22,14 @@ var quoteTests = []struct {
{`'quote"here'`, `quote"here`, false},
{`"quote'here"`, `quote'here`, true},
{`'quote\'here'`, `quote'here`, false},
- {`"""hello " ' world "" asdf ''' foo"""`, `hello " ' world "" asdf ''' foo`, true},
- {`"""hello
-world"""`, "hello\nworld", true},
- {`"\a\b\f\n\r\t\v\000\377"`, "\a\b\f\n\r\t\v\000\xFF", true},
- {`"\a\b\f\n\r\t\v\x00\xff"`, "\a\b\f\n\r\t\v\000\xFF", false},
- {`"\a\b\f\n\r\t\v\000\xFF"`, "\a\b\f\n\r\t\v\000\xFF", false},
- {`"\a\b\f\n\r\t\v\000\377\"'\\\003\200"`, "\a\b\f\n\r\t\v\x00\xFF\"'\\\x03\x80", true},
- {`"\a\b\f\n\r\t\v\x00\xff\"'\\\x03\x80"`, "\a\b\f\n\r\t\v\x00\xFF\"'\\\x03\x80", false},
- {`"\a\b\f\n\r\t\v\000\xFF\"'\\\x03\x80"`, "\a\b\f\n\r\t\v\x00\xFF\"'\\\x03\x80", false},
- {`"\a\b\f\n\r\t\v\000\xFF\"\\\x03\x80"`, "\a\b\f\n\r\t\v\x00\xFF\"\\\x03\x80", false},
+ {`"\a\b\f\n\r\t\v\x00\x7f"`, "\a\b\f\n\r\t\v\000\x7F", true},
+ {`"\a\b\f\n\r\t\v\x00\x7f"`, "\a\b\f\n\r\t\v\000\x7F", false},
+ {`"\a\b\f\n\r\t\v\x00\x7f"`, "\a\b\f\n\r\t\v\000\x7F", false},
+ {`"\a\b\f\n\r\t\v\x00\x7f\"'\\\x03"`, "\a\b\f\n\r\t\v\x00\x7F\"'\\\x03", true},
+ {`"\a\b\f\n\r\t\v\x00\x7f\"'\\\x03"`, "\a\b\f\n\r\t\v\x00\x7F\"'\\\x03", false},
+ {`"\a\b\f\n\r\t\v\x00\x7f\"'\\\x03"`, "\a\b\f\n\r\t\v\x00\x7F\"'\\\x03", false},
+ {`"\a\b\f\n\r\t\v\x00\x7f\"\\\x03"`, "\a\b\f\n\r\t\v\x00\x7F\"\\\x03", false},
{
`"cat $(SRCS) | grep '\\s*ip_block:' | sed -e 's/\\s*ip_block: \"\\([^ ]*\\)\"/ \x27\\1\x27,/g' >> $@; "`,
"cat $(SRCS) | grep '\\s*ip_block:' | sed -e 's/\\s*ip_block: \"\\([^ ]*\\)\"/ '\\1',/g' >> $@; ",
@@ -50,7 +47,7 @@ func TestQuote(t *testing.T) {
if !tt.std {
continue
}
- q := quote(tt.s, strings.HasPrefix(tt.q, `"""`))
+ q := Quote(tt.s, false)
if q != tt.q {
t.Errorf("quote(%#q) = %s, want %s", tt.s, q, tt.q)
}
@@ -59,7 +56,7 @@ func TestQuote(t *testing.T) {
func TestUnquote(t *testing.T) {
for _, tt := range quoteTests {
- s, triple, err := unquote(tt.q)
+ s, triple, _, err := unquote(tt.q)
wantTriple := strings.HasPrefix(tt.q, `"""`) || strings.HasPrefix(tt.q, `'''`)
if s != tt.s || triple != wantTriple || err != nil {
t.Errorf("unquote(%s) = %#q, %v, %v want %#q, %v, nil", tt.q, s, triple, err, tt.s, wantTriple)
diff --git a/syntax/scan.go b/syntax/scan.go
index a162264..bb4165e 100644
--- a/syntax/scan.go
+++ b/syntax/scan.go
@@ -35,6 +35,7 @@ const (
INT // 123
FLOAT // 1.23e45
STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"
+ BYTES // b"foo", etc
// Punctuation
PLUS // +
@@ -268,7 +269,7 @@ func newScanner(filename string, src interface{}, keepComments bool) (*scanner,
lineStart: true,
keepComments: keepComments,
}
- sc.readline, _ = src.(func() ([]byte, error)) // REPL only
+ sc.readline, _ = src.(func() ([]byte, error)) // ParseCompoundStmt (REPL) only
if sc.readline == nil {
data, err := readSource(filename, src)
if err != nil {
@@ -422,7 +423,7 @@ type tokenValue struct {
int int64 // decoded int
bigInt *big.Int // decoded integers > int64
float float64 // decoded float
- string string // decoded string
+ string string // decoded string or bytes
pos Position // start position of token
}
@@ -642,8 +643,15 @@ start:
// identifier or keyword
if isIdentStart(c) {
- // raw string literal
- if c == 'r' && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
+ if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
+ // r"..."
+ // b"..."
+ sc.readRune()
+ c = sc.peekRune()
+ return sc.scanString(val, c)
+ } else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') {
+ // rb"..."
+ sc.readRune()
sc.readRune()
c = sc.peekRune()
return sc.scanString(val, c)
@@ -887,12 +895,16 @@ func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
}
val.raw = raw.String()
- s, _, err := unquote(val.raw)
+ s, _, isByte, err := unquote(val.raw)
if err != nil {
sc.error(start, err.Error())
}
val.string = s
- return STRING
+ if isByte {
+ return BYTES
+ } else {
+ return STRING
+ }
}
func (sc *scanner) scanNumber(val *tokenValue, c rune) Token {
diff --git a/syntax/scan_test.go b/syntax/scan_test.go
index 0f2d9f2..9582bd7 100644
--- a/syntax/scan_test.go
+++ b/syntax/scan_test.go
@@ -10,6 +10,7 @@ import (
"go/build"
"io/ioutil"
"path/filepath"
+ "strings"
"testing"
)
@@ -42,8 +43,8 @@ func scan(src interface{}) (tokens string, err error) {
}
case FLOAT:
fmt.Fprintf(&buf, "%e", val.float)
- case STRING:
- fmt.Fprintf(&buf, "%q", val.string)
+ case STRING, BYTES:
+ buf.WriteString(Quote(val.string, tok == BYTES))
default:
buf.WriteString(tok.String())
}
@@ -189,9 +190,34 @@ pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated
{"i = 012934", `foo.star:1:5: invalid int literal`},
// octal escapes in string literals
{`"\037"`, `"\x1f" EOF`},
- {`"\377"`, `"\xff" EOF`},
- {`"\378"`, `"\x1f8" EOF`}, // = '\37' + '8'
- {`"\400"`, `foo.star:1:1: invalid escape sequence \400`}, // unlike Python 2 and 3
+ {`"\377"`, `foo.star:1:1: non-ASCII octal escape \377 (use \u00FF for the UTF-8 encoding of U+00FF)`},
+ {`"\378"`, `"\x1f8" EOF`}, // = '\37' + '8'
+ {`"\400"`, `foo.star:1:1: non-ASCII octal escape \400`}, // unlike Python 2 and 3
+ // hex escapes
+ {`"\x00\x20\x09\x41\x7e\x7f"`, `"\x00 \tA~\x7f" EOF`}, // DEL is non-printable
+ {`"\x80"`, `foo.star:1:1: non-ASCII hex escape`},
+ {`"\xff"`, `foo.star:1:1: non-ASCII hex escape`},
+ {`"\xFf"`, `foo.star:1:1: non-ASCII hex escape`},
+ {`"\xF"`, `foo.star:1:1: truncated escape sequence \xF`},
+ {`"\x"`, `foo.star:1:1: truncated escape sequence \x`},
+ {`"\xfg"`, `foo.star:1:1: invalid escape sequence \xfg`},
+ // Unicode escapes
+ // \uXXXX
+ {`"\u0400"`, `"Ѐ" EOF`},
+ {`"\u100"`, `foo.star:1:1: truncated escape sequence \u100`},
+ {`"\u04000"`, `"Ѐ0" EOF`}, // = U+0400 + '0'
+ {`"\u100g"`, `foo.star:1:1: invalid escape sequence \u100g`},
+ {`"\u4E16"`, `"世" EOF`},
+ {`"\udc00"`, `foo.star:1:1: invalid Unicode code point U+DC00`}, // surrogate
+ // \UXXXXXXXX
+ {`"\U00000400"`, `"Ѐ" EOF`},
+ {`"\U0000400"`, `foo.star:1:1: truncated escape sequence \U0000400`},
+ {`"\U000004000"`, `"Ѐ0" EOF`}, // = U+0400 + '0'
+ {`"\U1000000g"`, `foo.star:1:1: invalid escape sequence \U1000000g`},
+ {`"\U0010FFFF"`, `"\U0010ffff" EOF`},
+ {`"\U00110000"`, `foo.star:1:1: code point out of range: \U00110000 (max \U00110000)`},
+ {`"\U0001F63F"`, `"😿" EOF`},
+ {`"\U0000dc00"`, `foo.star:1:1: invalid Unicode code point U+DC00`}, // surrogate
// backslash escapes
// As in Go, a backslash must escape something.
@@ -218,6 +244,12 @@ pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated
{`r'\"'`, `"\\\"" EOF`},
{`'a\zb'`, `foo.star:1:1: invalid escape sequence \z`},
{`"\o123"`, `foo.star:1:1: invalid escape sequence \o`},
+ // bytes literals (where they differ from text strings)
+ {`b"AЀ世😿"`, `b"AЀ世😿`}, // 1-4 byte encodings, literal
+ {`b"\x41\u0400\u4e16\U0001F63F"`, `b"AЀ世😿"`}, // same, as escapes
+ {`b"\377\378\x80\xff\xFf"`, `b"\xff\x1f8\x80\xff\xff" EOF`}, // hex/oct escapes allow non-ASCII
+ {`b"\400"`, `foo.star:1:2: invalid escape sequence \400`},
+ {`b"\udc00"`, `foo.star:1:2: invalid Unicode code point U+DC00`}, // (same as string)
// floats starting with octal digits
{"012934.", `1.293400e+04 EOF`},
{"012934.1", `1.293410e+04 EOF`},
@@ -243,7 +275,9 @@ pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated
if err != nil {
got = err.(Error).Error()
}
- if test.want != got {
+ // Prefix match allows us to truncate errors in expectations.
+ // Success cases all end in EOF.
+ if !strings.HasPrefix(got, test.want) {
t.Errorf("scan `%s` = [%s], want [%s]", test.input, got, test.want)
}
}
diff --git a/syntax/syntax.go b/syntax/syntax.go
index 8bbf5c0..20b28bb 100644
--- a/syntax/syntax.go
+++ b/syntax/syntax.go
@@ -251,7 +251,7 @@ func (x *Ident) Span() (start, end Position) {
// A Literal represents a literal string or number.
type Literal struct {
commentsRef
- Token Token // = STRING | INT | FLOAT
+ Token Token // = STRING | BYTES | INT | FLOAT
TokenPos Position
Raw string // uninterpreted text
Value interface{} // = string | int64 | *big.Int | float64