1 files changed, 270 insertions, 0 deletions
diff --git a/syntax/quote.go b/syntax/quote.go
new file mode 100644
index 0000000..0a8321a
--- /dev/null
+++ b/syntax/quote.go
@@ -0,0 +1,270 @@
+// Copyright 2017 The Bazel Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+// Skylark quoted string utilities.
+
+import (
+	"bytes"
+	"fmt"
+	"strconv"
+	"strings"
+)
+
+// unesc maps the byte after \ in a one-character escape to the byte it denotes.
+var unesc = [256]byte{
+	'\\': '\\',
+	'\'': '\'',
+	'"':  '"',
+	'a':  '\a',
+	'b':  '\b',
+	'f':  '\f',
+	'n':  '\n',
+	'r':  '\r',
+	't':  '\t',
+	'v':  '\v',
+}
+
+// esc maps a byte that has a one-character escape to the char written after \.
+var esc = [256]byte{
+	'"':  '"',
+	'\'': '\'',
+	'\\': '\\',
+	'\v': 'v',
+	'\t': 't',
+	'\r': 'r',
+	'\n': 'n',
+	'\f': 'f',
+	'\b': 'b',
+	'\a': 'a',
+}
+
+// notEsc lists the bytes that may follow a backslash in a string value
+// while the backslash itself is written unescaped. For example, because
+// ( is listed, the Go string "foo\\(bar" is quoted as "foo\(bar" rather
+// than "foo\\(bar". BUILD files rely on this, notably in shell arguments
+// that hold regular expressions.
+const notEsc = " !#$%&()*+,-./:;<=>?@" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "{|}~"
+
+// unquote decodes a Skylark string literal, given together with its
+// optional r prefix and its enclosing quotes, which may be ' or " and
+// either single or tripled.
+//
+// It returns the decoded value s and whether the literal was
+// triple-quoted. Backslash escapes are interpreted unless the literal is
+// raw; in both modes a carriage return, alone or followed by a newline,
+// is decoded as "\n". A backslash followed by a byte that starts no known
+// escape is kept, together with that byte, as literal text.
+//
+// A non-nil err reports a literal that is too short or wrongly quoted, a
+// truncated escape, an octal escape of 256 or more, or a \x escape not
+// followed by two hexadecimal digits; s is then empty.
+func unquote(quoted string) (s string, triple bool, err error) {
+	// A leading r marks a raw literal, whose backslashes stay as they are.
+	raw := strings.HasPrefix(quoted, "r")
+	if raw {
+		quoted = quoted[1:]
+	}
+
+	if len(quoted) < 2 {
+		return "", false, fmt.Errorf("string literal too short")
+	}
+
+	// The literal must open with ' or " and close with the same byte.
+	quote := quoted[0]
+	if quote != '"' && quote != '\'' || quote != quoted[len(quoted)-1] {
+		return "", false, fmt.Errorf("string literal has invalid quotes")
+	}
+
+	// Strip the delimiters, noting whether they were tripled.
+	if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
+		triple = true
+		quoted = quoted[3 : len(quoted)-3]
+	} else {
+		quoted = quoted[1 : len(quoted)-1]
+	}
+
+	// unquoteChars lists the bytes that need decoding: always carriage
+	// return, plus backslash unless the literal is raw. A body containing
+	// none of them is already the result.
+	unquoteChars := "\\\r"
+	if raw {
+		unquoteChars = "\r"
+	}
+	if !strings.ContainsAny(quoted, unquoteChars) {
+		return quoted, triple, nil
+	}
+
+	// Each iteration copies the plain text up to the next special byte
+	// and then decodes the carriage return or escape sequence found there.
+	var buf bytes.Buffer
+	for {
+		i := strings.IndexAny(quoted, unquoteChars)
+		if i < 0 {
+			i = len(quoted)
+		}
+		buf.WriteString(quoted[:i])
+		quoted = quoted[i:]
+
+		if len(quoted) == 0 {
+			break
+		}
+
+		// Carriage return: "\r" and "\r\n" both decode to "\n".
+		if quoted[0] == '\r' {
+			buf.WriteByte('\n')
+			if len(quoted) > 1 && quoted[1] == '\n' {
+				quoted = quoted[2:]
+			} else {
+				quoted = quoted[1:]
+			}
+			continue
+		}
+
+		// Otherwise quoted[0] is a backslash; it needs a following byte.
+		if len(quoted) == 1 {
+			return "", triple, fmt.Errorf(`truncated escape sequence \`)
+		}
+
+		switch quoted[1] {
+		default:
+			// In Python, if \z (for some byte z) is not a known escape sequence
+			// then it appears as literal text in the string.
+			buf.WriteString(quoted[:2])
+			quoted = quoted[2:]
+
+		case '\n':
+			// Ignore the escape and the line break.
+			quoted = quoted[2:]
+
+		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
+			// One-char escape
+			buf.WriteByte(unesc[quoted[1]])
+			quoted = quoted[2:]
+
+		case '0', '1', '2', '3', '4', '5', '6', '7':
+			// Octal escape, up to 3 digits.
+			n := int(quoted[1] - '0')
+			quoted = quoted[2:]
+			for digits := 1; digits < 3; digits++ {
+				if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
+					break
+				}
+				n = n*8 + int(quoted[0]-'0')
+				quoted = quoted[1:]
+			}
+			if n >= 256 {
+				// NOTE: Python silently discards the high bit,
+				// so that '\541' == '\141' == 'a'.
+				// Let's see if we can avoid doing that in BUILD files.
+				return "", triple, fmt.Errorf(`invalid escape sequence \%03o`, n)
+			}
+			buf.WriteByte(byte(n))
+
+		case 'x':
+			// Hexadecimal escape, exactly 2 digits.
+			if len(quoted) < 4 {
+				return "", triple, fmt.Errorf(`truncated escape sequence %s`, quoted)
+			}
+			// ParseUint rejects a leading sign, which ParseInt would accept;
+			// \x+1 and \x-1 are therefore invalid rather than bytes 0x01 and 0xff.
+			n, err1 := strconv.ParseUint(quoted[2:4], 16, 8)
+			if err1 != nil {
+				return "", triple, fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
+			}
+			buf.WriteByte(byte(n))
+			quoted = quoted[4:]
+		}
+	}
+
+	return buf.String(), triple, nil
+}
+
+// indexByte returns the index of the first instance of b in s, or else -1.
+//
+// It delegates to strings.IndexByte instead of scanning s by hand. The
+// wrapper is retained so that callers within this package need not
+// change; like the standard routine it compares single bytes, not
+// runes, and it yields -1 for an empty s.
+func indexByte(s string, b byte) int {
+	return strings.IndexByte(s, b)
+}
+
+// hex holds the lower-case hexadecimal digits, each at the index of its value.
+// quote indexes only entries 0-7, using them as octal digits.
+const hex = "0123456789" + "abcdef"
+
+// quote returns the quoted form of the string value unquoted, delimited
+// by " or, when triple is true, by """.
+//
+// Copied as is are the single quote, a backslash whose next byte is in
+// notEsc, and, in triple mode only, the newline and up to two double
+// quotes that are directly followed by a non-quote byte. Any other byte
+// with an entry in esc gets that one-character escape, remaining bytes
+// below 0x20 or at or above 0x80 become three-digit octal escapes, and
+// everything else is copied unchanged.
+func quote(unquoted string, triple bool) string {
+	delim := `"`
+	if triple {
+		delim = `"""`
+	}
+
+	var out bytes.Buffer
+	out.WriteString(delim)
+
+	n := len(unquoted)
+	for i := 0; i < n; i++ {
+		c := unquoted[i]
+		switch {
+		case triple && c == '"' && (i+1 < n && unquoted[i+1] != '"' || i+2 < n && unquoted[i+2] != '"'):
+			// Up to two quotes can pass through unescaped, because they
+			// are followed by a non-quote byte. Emit this quote and, if
+			// the next byte is a quote too, consume that one as well.
+			out.WriteByte(c)
+			if i+1 < n && unquoted[i+1] == '"' {
+				out.WriteByte(c)
+				i++
+			}
+
+		case triple && c == '\n':
+			// A triple-quoted string may contain a newline as is.
+			out.WriteByte(c)
+
+		case c == '\'':
+			// The delimiter is always ", so ' needs no escape.
+			out.WriteByte(c)
+
+		case c == '\\' && i+1 < n && indexByte(notEsc, unquoted[i+1]) >= 0:
+			// The next byte neither completes a known escape sequence
+			// nor needs escaping itself, so copy the pair unchanged.
+			// Various BUILD files are written this way.
+			out.WriteByte('\\')
+			out.WriteByte(unquoted[i+1])
+			i++
+
+		case esc[c] != 0:
+			// One-character escape.
+			out.WriteByte('\\')
+			out.WriteByte(esc[c])
+
+		case c < 0x20 || c >= 0x80:
+			// BUILD files are supposed to be Latin-1, so escape all
+			// control and high bytes. Octal is used instead of \x
+			// because Blaze does not implement \x in quoted strings
+			// (b/7272572). Only entries 0-7 of hex are indexed here,
+			// so it doubles as the table of octal digits.
+			out.WriteByte('\\')
+			out.WriteByte(hex[c>>6])
+			out.WriteByte(hex[(c>>3)&7])
+			out.WriteByte(hex[c&7])
+
+		default:
+			out.WriteByte(c)
+		}
+	}
+
+	out.WriteString(delim)
+	return out.String()
+}