aboutsummaryrefslogtreecommitdiff
path: root/syntax/quote.go
diff options
context:
space:
mode:
authorAlan Donovan <adonovan@google.com>2017-10-02 10:10:28 -0400
committerAlan Donovan <adonovan@google.com>2017-10-02 10:10:28 -0400
commit312d1a5b5a9c50204aee186aeca0b7dbbd3eaaa0 (patch)
treeb766f2d515a7a3abcb0ebc6da796e04ab9739a97 /syntax/quote.go
downloadstarlark-go-312d1a5b5a9c50204aee186aeca0b7dbbd3eaaa0.tar.gz
skylark: create GitHub repository from google3@170697745
Diffstat (limited to 'syntax/quote.go')
-rw-r--r--syntax/quote.go270
1 files changed, 270 insertions, 0 deletions
diff --git a/syntax/quote.go b/syntax/quote.go
new file mode 100644
index 0000000..0a8321a
--- /dev/null
+++ b/syntax/quote.go
@@ -0,0 +1,270 @@
+// Copyright 2017 The Bazel Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+// Skylark quoted string utilities.
+
+import (
+ "bytes"
+ "fmt"
+ "strconv"
+ "strings"
+)
+
// Escape translation tables, indexed by byte value.
// Any byte that is not listed below maps to zero.
var (
	// unesc translates the single letter that follows a backslash in a
	// quoted string into the byte that the escape sequence stands for.
	unesc = [256]byte{
		'a':  '\a',
		'b':  '\b',
		'f':  '\f',
		'n':  '\n',
		'r':  '\r',
		't':  '\t',
		'v':  '\v',
		'\\': '\\',
		'\'': '\'',
		'"':  '"',
	}

	// esc is the inverse of unesc: it translates a byte that needs
	// escaping into the letter that should be written after the backslash.
	esc = [256]byte{
		'\a': 'a',
		'\b': 'b',
		'\f': 'f',
		'\n': 'n',
		'\r': 'r',
		'\t': 't',
		'\v': 'v',
		'\\': '\\',
		'\'': '\'',
		'"':  '"',
	}
)
+
// notEsc lists the bytes that may directly follow a backslash in a string
// value without the backslash itself needing to be escaped. For example,
// because ( is listed, the Go string "foo\\(bar" is quoted as the Python
// literal "foo\(bar". Such sequences really do occur in BUILD files,
// particularly in shell arguments that contain regular expressions.
//
// The list is written in three pieces: space and punctuation, the
// upper-case letters, and the punctuation that sorts after the letters.
const notEsc = " !#$%&()*+,-./:;<=>?@" +
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
	"{|}~"
+
+// unquote unquotes the quoted string, returning the actual
+// string value, whether the original was triple-quoted, and
+// an error describing invalid input.
+func unquote(quoted string) (s string, triple bool, err error) {
+ // Check for raw prefix: means don't interpret the inner \.
+ raw := false
+ if strings.HasPrefix(quoted, "r") {
+ raw = true
+ quoted = quoted[1:]
+ }
+
+ if len(quoted) < 2 {
+ err = fmt.Errorf("string literal too short")
+ return
+ }
+
+ if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] {
+ err = fmt.Errorf("string literal has invalid quotes")
+ return
+ }
+
+ // Check for triple quoted string.
+ quote := quoted[0]
+ if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
+ triple = true
+ quoted = quoted[3 : len(quoted)-3]
+ } else {
+ quoted = quoted[1 : len(quoted)-1]
+ }
+
+ // Now quoted is the quoted data, but no quotes.
+ // If we're in raw mode or there are no escapes or
+ // carriage returns, we're done.
+ var unquoteChars string
+ if raw {
+ unquoteChars = "\r"
+ } else {
+ unquoteChars = "\\\r"
+ }
+ if !strings.ContainsAny(quoted, unquoteChars) {
+ s = quoted
+ return
+ }
+
+ // Otherwise process quoted string.
+ // Each iteration processes one escape sequence along with the
+ // plain text leading up to it.
+ var buf bytes.Buffer
+ for {
+ // Remove prefix before escape sequence.
+ i := strings.IndexAny(quoted, unquoteChars)
+ if i < 0 {
+ i = len(quoted)
+ }
+ buf.WriteString(quoted[:i])
+ quoted = quoted[i:]
+
+ if len(quoted) == 0 {
+ break
+ }
+
+ // Process carriage return.
+ if quoted[0] == '\r' {
+ buf.WriteByte('\n')
+ if len(quoted) > 1 && quoted[1] == '\n' {
+ quoted = quoted[2:]
+ } else {
+ quoted = quoted[1:]
+ }
+ continue
+ }
+
+ // Process escape sequence.
+ if len(quoted) == 1 {
+ err = fmt.Errorf(`truncated escape sequence \`)
+ return
+ }
+
+ switch quoted[1] {
+ default:
+ // In Python, if \z (for some byte z) is not a known escape sequence
+ // then it appears as literal text in the string.
+ buf.WriteString(quoted[:2])
+ quoted = quoted[2:]
+
+ case '\n':
+ // Ignore the escape and the line break.
+ quoted = quoted[2:]
+
+ case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
+ // One-char escape
+ buf.WriteByte(unesc[quoted[1]])
+ quoted = quoted[2:]
+
+ case '0', '1', '2', '3', '4', '5', '6', '7':
+ // Octal escape, up to 3 digits.
+ n := int(quoted[1] - '0')
+ quoted = quoted[2:]
+ for i := 1; i < 3; i++ {
+ if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
+ break
+ }
+ n = n*8 + int(quoted[0]-'0')
+ quoted = quoted[1:]
+ }
+ if n >= 256 {
+ // NOTE: Python silently discards the high bit,
+ // so that '\541' == '\141' == 'a'.
+ // Let's see if we can avoid doing that in BUILD files.
+ err = fmt.Errorf(`invalid escape sequence \%03o`, n)
+ return
+ }
+ buf.WriteByte(byte(n))
+
+ case 'x':
+ // Hexadecimal escape, exactly 2 digits.
+ if len(quoted) < 4 {
+ err = fmt.Errorf(`truncated escape sequence %s`, quoted)
+ return
+ }
+ n, err1 := strconv.ParseInt(quoted[2:4], 16, 0)
+ if err1 != nil {
+ err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
+ return
+ }
+ buf.WriteByte(byte(n))
+ quoted = quoted[4:]
+ }
+ }
+
+ s = buf.String()
+ return
+}
+
// indexByte returns the index of the first instance of b in s, or else -1.
// It delegates to strings.IndexByte from the standard library; the wrapper
// is retained so that existing callers of indexByte keep working.
func indexByte(s string, b byte) int {
	return strings.IndexByte(s, b)
}
+
+// hex is a list of the hexadecimal digits, for use in quoting.
+// We always print lower-case hexadecimal.
+const hex = "0123456789abcdef"
+
+// quote returns the quoted form of the string value "x".
+// If triple is true, quote uses the triple-quoted form """x""".
+func quote(unquoted string, triple bool) string {
+ q := `"`
+ if triple {
+ q = `"""`
+ }
+
+ var buf bytes.Buffer
+ buf.WriteString(q)
+
+ for i := 0; i < len(unquoted); i++ {
+ c := unquoted[i]
+ if c == '"' && triple && (i+1 < len(unquoted) && unquoted[i+1] != '"' || i+2 < len(unquoted) && unquoted[i+2] != '"') {
+ // Can pass up to two quotes through, because they are followed by a non-quote byte.
+ buf.WriteByte(c)
+ if i+1 < len(unquoted) && unquoted[i+1] == '"' {
+ buf.WriteByte(c)
+ i++
+ }
+ continue
+ }
+ if triple && c == '\n' {
+ // Can allow newline in triple-quoted string.
+ buf.WriteByte(c)
+ continue
+ }
+ if c == '\'' {
+ // Can allow ' since we always use ".
+ buf.WriteByte(c)
+ continue
+ }
+ if c == '\\' {
+ if i+1 < len(unquoted) && indexByte(notEsc, unquoted[i+1]) >= 0 {
+ // Can pass \ through when followed by a byte that
+ // known not to be a valid escape sequence and also
+ // that does not trigger an escape sequence of its own.
+ // Use this, because various BUILD files do.
+ buf.WriteByte('\\')
+ buf.WriteByte(unquoted[i+1])
+ i++
+ continue
+ }
+ }
+ if esc[c] != 0 {
+ buf.WriteByte('\\')
+ buf.WriteByte(esc[c])
+ continue
+ }
+ if c < 0x20 || c >= 0x80 {
+ // BUILD files are supposed to be Latin-1, so escape all control and high bytes.
+ // I'd prefer to use \x here, but Blaze does not implement
+ // \x in quoted strings (b/7272572).
+ buf.WriteByte('\\')
+ buf.WriteByte(hex[c>>6]) // actually octal but reusing hex digits 0-7.
+ buf.WriteByte(hex[(c>>3)&7])
+ buf.WriteByte(hex[c&7])
+ /*
+ buf.WriteByte('\\')
+ buf.WriteByte('x')
+ buf.WriteByte(hex[c>>4])
+ buf.WriteByte(hex[c&0xF])
+ */
+ continue
+ }
+ buf.WriteByte(c)
+ continue
+ }
+
+ buf.WriteString(q)
+ return buf.String()
+}