Initial commit of license classifier.

author: Bill Wendling <morbo@google.com> 2017-04-09 20:49:36 -0700
committer: Bill Wendling <morbo@google.com> 2017-04-09 20:49:36 -0700
commit: 85ea7edc49509df714ac09a6166e514b05e30d6d (patch)
tree: 2bb8c36b5938ce5af17775a382e2a0e8608e2723 /internal
download: licenseclassifier-85ea7edc49509df714ac09a6166e514b05e30d6d.tar.gz
7 files changed, 1903 insertions, 0 deletions
diff --git a/internal/commentparser/comment_parser.go b/internal/commentparser/comment_parser.go
new file mode 100644
index 0000000..0cc3fe3
--- /dev/null
+++ b/internal/commentparser/comment_parser.go
@@ -0,0 +1,336 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package commentparser does a basic parse over a source file and returns all
+// of the comments from the code. This is useful for when you want to analyze
+// text written in comments (like copyright notices) but not in the code
+// itself.
+package commentparser
+
+import (
+	"bytes"
+	"log"
+	"strings"
+	"unicode/utf8"
+
+	"github.com/google/licenseclassifier/internal/commentparser/language"
+)
+
+const (
+	eofInString            = "%d:EOF in string"
+	eofInSingleLineComment = "%d:EOF in single line comment"
+	eofInMultilineComment  = "%d:EOF in multiline comment"
+)
+
+// Parse parses the input data and returns the comments.
+func Parse(contents []byte, lang language.Language) Comments {
+	if len(contents) == 0 {
+		return nil
+	}
+
+	c := string(contents)
+	if !strings.HasSuffix(c, "\n") {
+		// Force a terminating newline if one isn't present.
+		c += "\n"
+	}
+	i := &input{
+		s:      c,
+		lang:   lang,
+		offset: 0,
+		pos:    position{line: 1, lineRune: []int{0}},
+	}
+	i.lex()
+	return i.comments
+}
+
+// Comment is either a single line or multiline comment in a source code file.
+// A single line comment has StartLine equal to EndLine. The lines are 1-based.
+type Comment struct {
+	StartLine int
+	EndLine   int
+	Text      string
+}
+
+// Comments allows us to treat a slice of comments as a unit.
+type Comments []*Comment
+
+// ChunkIterator returns a read-only channel and generates the comments in a
+// goroutine, then closes the channel.
+func (c Comments) ChunkIterator() <-chan Comments {
+	ch := make(chan Comments)
+	go func() {
+		index := 0
+		prevLine := c.StartLine()
+
+		for ; index < len(c); index++ {
+			var chunk Comments
+			for ; index < len(c); index++ {
+				if c[index].StartLine > prevLine+2 {
+					break
+				}
+				chunk = append(chunk, c[index])
+				prevLine = c[index].StartLine
+			}
+			if len(chunk) == 0 {
+				break
+			}
+
+			ch <- chunk
+			if index >= len(c) {
+				break
+			}
+
+			prevLine = c[index].StartLine
+			index--
+		}
+
+		close(ch)
+	}()
+	return ch
+}
+
+// StartLine is the line number (1-based) the first part of the comment block
+// starts on.
+func (c Comments) StartLine() int {
+	if len(c) == 0 {
+		return 0
+	}
+	return c[0].StartLine
+}
+
+// String creates a string out of the text of the comments. Comment begin and
+// end markers are removed.
+func (c Comments) String() string {
+	var s []string
+	for _, cmt := range c {
+		s = append(s, cmt.Text)
+	}
+	return strings.Join(s, "\n")
+}
+
+// position records the location of a lexeme.
+type position struct {
+	line     int   // Line number of input: 1-based
+	lineRune []int // Rune offset from beginning of line: 0-based
+}
+
+// input holds the current state of the lexer.
+type input struct {
+	s        string            // Entire input.
+	lang     language.Language // Source code language.
+	offset   int               // Offset into input.
+	pos      position          // Current position in the input.
+	comments Comments          // Comments in the source file.
+}
+
+// lex is called to obtain the comments.
+func (i *input) lex() {
+	for {
+		c, ok := i.peekRune()
+		if !ok {
+			break
+		}
+
+		switch c {
+		case '"', '\'', '`': // String
+			// Ignore strings because they could contain comment
+			// start or end sequences which we need to ignore.
+			if i.lang == language.HTML {
+				// Quotes in HTML-like files aren't meaningful,
+				// because it's basically plain text
+				break
+			}
+
+			ok, hasEscape := i.lang.QuoteCharacter(c)
+			if !ok {
+				break
+			}
+
+			quote := string(c)
+			if i.lang == language.Python {
+				if c == '\'' && i.match("'''") {
+					quote = "'''"
+				} else if c == '"' && i.match(`"""`) {
+					quote = `"""`
+				} else {
+					i.readRune() // Eat quote.
+				}
+			} else {
+				i.readRune() // Eat quote.
+			}
+
+			startLine := i.pos.line
+			for {
+				c, ok = i.peekRune()
+				if !ok {
+					log.Printf(eofInString, startLine)
+					return
+				}
+				if hasEscape && c == '\\' {
+					i.readRune() // Eat escape.
+				} else if i.match(quote) {
+					break
+				} else if (i.lang == language.JavaScript || i.lang == language.Perl) && c == '\n' {
+					// JavaScript and Perl allow you to
+					// specify regexes without quotes, but
+					// which contain quotes. So treat the
+					// newline as terminating the string.
+					break
+				}
+				i.readRune() // Eat character.
+				if i.eof() {
+					log.Printf(eofInString, startLine)
+					return
+				}
+			}
+		default:
+			startLine := i.pos.line
+			var comment bytes.Buffer
+			if i.singleLineComment() { // Single line comment
+				for {
+					if i.eof() {
+						log.Printf(eofInSingleLineComment, i.pos.line)
+						return
+					}
+					c = i.readRune()
+					if c == '\n' {
+						i.unreadRune(c)
+						break
+					}
+					comment.WriteRune(c)
+				}
+				i.comments = append(i.comments, &Comment{
+					StartLine: startLine,
+					EndLine:   i.pos.line,
+					Text:      comment.String(),
+				})
+			} else if ok, start, end := i.multiLineComment(); ok { // Multiline comment
+				nesting := 0
+				startLine := i.pos.line
+				for {
+					if i.eof() {
+						log.Printf(eofInMultilineComment, startLine)
+						return
+					}
+					c := i.readRune()
+					comment.WriteRune(c)
+					if i.lang.NestedComments() && i.match(start) {
+						// Allows nested comments.
+						comment.WriteString(start)
+						nesting++
+					}
+					if i.match(end) {
+						if nesting > 0 {
+							comment.WriteString(end)
+							nesting--
+						} else {
+							break
+						}
+					}
+				}
+				i.comments = append(i.comments, &Comment{
+					StartLine: startLine,
+					EndLine:   i.pos.line,
+					Text:      comment.String(),
+				})
+			}
+		}
+
+		i.readRune() // Ignore non-comments.
+	}
+}
+
+// singleLineComment returns 'true' if we've run across a single line comment
+// in the given language.
+func (i *input) singleLineComment() bool {
+	return i.match(i.lang.SingleLineCommentStart())
+}
+
+// multiLineComment returns 'true' if we've run across a multiline comment in
+// the given language.
+func (i *input) multiLineComment() (bool, string, string) {
+	if s := i.lang.MultilineCommentStart(); i.match(s) {
+		return true, s, i.lang.MultilineCommentEnd()
+	}
+	return false, "", ""
+}
+
+// match returns 'true' if the next tokens in the stream match the given
+// string.
+func (i *input) match(s string) bool {
+	if s == "" {
+		return false
+	}
+	saved := s
+	var read []rune
+	for len(s) > 0 && !i.eof() {
+		r, size := utf8.DecodeRuneInString(s)
+		if c, ok := i.peekRune(); ok && c == r {
+			read = append(read, c)
+		} else {
+			// No match. Push the tokens we read back onto the stack.
+			for idx := len(read) - 1; idx >= 0; idx-- {
+				i.unreadRune(read[idx])
+			}
+			return false
+		}
+		s = s[size:]
+		i.readRune() // Eat token.
+	}
+	return string(read) == saved
+}
+
+// eof reports whether the input has reached the end of the file.
+func (i *input) eof() bool {
+	return len(i.s) <= i.offset
+}
+
+// peekRune returns the next rune in the input without consuming it.
+func (i *input) peekRune() (rune, bool) {
+	if i.eof() {
+		return rune(0), false
+	}
+	r, _ := utf8.DecodeRuneInString(i.s[i.offset:])
+	return r, true
+}
+
+// readRune consumes and returns the next rune in the input.
+func (i *input) readRune() rune {
+	r, size := utf8.DecodeRuneInString(i.s[i.offset:])
+	if r == '\n' {
+		i.pos.line++
+		i.pos.lineRune = append(i.pos.lineRune, 0)
+	} else {
+		i.pos.lineRune[len(i.pos.lineRune)-1]++
+	}
+	i.offset += size
+	return r
+}
+
+// unreadRune winds the lexer's state back to before the rune was read.
+func (i *input) unreadRune(c rune) {
+	p := make([]byte, utf8.UTFMax)
+	size := utf8.EncodeRune(p, c)
+	i.offset -= size
+	if c == '\n' {
+		i.pos.line--
+		if len(i.pos.lineRune) > 1 {
+			i.pos.lineRune = i.pos.lineRune[:len(i.pos.lineRune)-1]
+		} else {
+			i.pos.lineRune[len(i.pos.lineRune)-1] = 0
+		}
+	} else {
+		i.pos.lineRune[len(i.pos.lineRune)-1]--
+	}
+}
diff --git a/internal/commentparser/comment_parser_test.go b/internal/commentparser/comment_parser_test.go
new file mode 100644
index 0000000..86dd996
--- /dev/null
+++ b/internal/commentparser/comment_parser_test.go
@@ -0,0 +1,413 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package commentparser
+
+import (
+	"fmt"
+	"reflect"
+	"testing"
+
+	"github.com/google/licenseclassifier/internal/commentparser/language"
+)
+
+const (
+	singleLineText = "single line text"
+	multilineText  = `first line of text
+second line of text
+third line of text
+`
+)
+
+func TestCommentParser_Lex(t *testing.T) {
+	tests := []struct {
+		description string
+		lang        language.Language
+		source      string
+		want        Comments
+	}{
+		{
+			description: "BCPL Single Line Comments",
+			lang:        language.Go,
+			source:      fmt.Sprintf("//%s\n", singleLineText),
+			want: []*Comment{
+				{
+					StartLine: 1,
+					EndLine:   1,
+					Text:      singleLineText,
+				},
+			},
+		},
+		{
+			description: "Go Comment With Multiline String",
+			lang:        language.Go,
+			source:      fmt.Sprintf("var a = `A\nmultiline\\x20\nstring`\n//%s\n", singleLineText),
+			want: []*Comment{
+				{
+					StartLine: 4,
+					EndLine:   4,
+					Text:      singleLineText,
+				},
+			},
+		},
+		{
+			description: "Python Multiline String",
+			lang:        language.Python,
+			source:      fmt.Sprintf("#%s\n'''this is a multiline\nstring'''", singleLineText),
+			want: []*Comment{
+				{
+					StartLine: 1,
+					EndLine:   1,
+					Text:      singleLineText,
+				},
+			},
+		},
+		{
+			description: "TR Command String",
+			lang:        language.Python,
+			source: fmt.Sprintf(`#%s
+AUTH= \
+| tr '"\n' \
+| base64 -w
+`, singleLineText),
+			want: []*Comment{
+				{
+					StartLine: 1,
+					EndLine:   1,
+					Text:      singleLineText,
+				},
+			},
+		},
+		{
+			description: "Lisp Single Line Comments",
+			lang:        language.Clojure,
+			source:      fmt.Sprintf(";%s\n", singleLineText),
+			want: []*Comment{
+				{
+					StartLine: 1,
+					EndLine:   1,
+					Text:      singleLineText,
+				},
+			},
+		},
+		{
+			description: "Shell Single Line Comments",
+			lang:        language.Shell,
+			source:      fmt.Sprintf("#%s\n", singleLineText),
+			want: []*Comment{
+				{
+					StartLine: 1,
+					EndLine:   1,
+					Text:      singleLineText,
+				},
+			},
+		},
+		{
+			description: "BCPL Multiline Comments",
+			lang:        language.C,
+			source:      fmt.Sprintf("/*%s*/\n", multilineText),
+			want: []*Comment{
+				{
+					StartLine: 1,
+					EndLine:   4,
+					Text:      multilineText,
+				},
+			},
+		},
+		{
+			description: "BCPL Multiline Comments no terminating newline",
+			lang:        language.C,
+			source:      fmt.Sprintf("/*%s*/", multilineText),
+			want: []*Comment{
+				{
+					StartLine: 1,
+					EndLine:   4,
+					Text:      multilineText,
+				},
+			},
+		},
+		{
+			description: "Nested Multiline Comments",
+			lang:        language.Swift,
+			source:      "/*a /*\n  nested\n*/\n  comment\n*/\n",
+			want: []*Comment{
+				{
+					StartLine: 1,
+					EndLine:   5,
+					Text:      "a /*\n  nested\n*/\n  comment\n",
+				},
+			},
+		},
+		{
+			description: "Ruby Multiline Comments",
+			lang:        language.Ruby,
+			source:      fmt.Sprintf("=begin\n%s=end\n", multilineText),
+			want: []*Comment{
+				{
+					StartLine: 1,
+					EndLine:   5,
+					Text:      "\n" + multilineText,
+				},
+			},
+		},
+		{
+			description: "Multiple Single Line Comments",
+			lang:        language.Shell,
+			source: `# First line
+# Second line
+# Third line
+`,
+			want: []*Comment{
+				{
+					StartLine: 1,
+					EndLine:   1,
+					Text:      " First line",
+				},
+				{
+					StartLine: 2,
+					EndLine:   2,
+					Text:      " Second line",
+				},
+				{
+					StartLine: 3,
+					EndLine:   3,
+					Text:      " Third line",
+				},
+			},
+		},
+		{
+			description: "Mixed Multiline / Single Line Comments",
+			lang:        language.C,
+			source: `/*
+ * The first multiline line.
+ * The second multiline line.
+ */
+ // The first single line comment.
+ // The second single line comment.
+`,
+			want: []*Comment{
+				{
+					StartLine: 1,
+					EndLine:   4,
+					Text: `
+ * The first multiline line.
+ * The second multiline line.
+ `,
+				},
+				{
+					StartLine: 5,
+					EndLine:   5,
+					Text:      " The first single line comment.",
+				},
+				{
+					StartLine: 6,
+					EndLine:   6,
+					Text:      " The second single line comment.",
+				},
+			},
+		},
+		{
+			description: "Mixed Multiline / Single Line Comments",
+			lang:        language.C,
+			source: `/*
+ * The first multiline line.
+ * The second multiline line.
+ */
+ // The first single line comment.
+ // The second single line comment.
+`,
+			want: []*Comment{
+				{
+					StartLine: 1,
+					EndLine:   4,
+					Text: `
+ * The first multiline line.
+ * The second multiline line.
+ `,
+				},
+				{
+					StartLine: 5,
+					EndLine:   5,
+					Text:      " The first single line comment.",
+				},
+				{
+					StartLine: 6,
+					EndLine:   6,
+					Text:      " The second single line comment.",
+				},
+			},
+		},
+		{
+			description: "HTML-like comments and quotes",
+			lang:        language.HTML,
+			source: `# This is an important topic
+I don't want to go on all day here! <-- notice the quote in there!
+<!-- Well, maybe I do... -->
+`,
+			want: []*Comment{
+				{
+					StartLine: 3,
+					EndLine:   3,
+					Text:      " Well, maybe I do... ",
+				},
+			},
+		},
+		{
+			description: "JavaScript regex",
+			lang:        language.JavaScript,
+			source: `var re = /hello"world/;
+// the comment
+`,
+			want: []*Comment{
+				{
+					StartLine: 2,
+					EndLine:   2,
+					Text:      " the comment",
+				},
+			},
+		},
+		{
+			description: "Perl regex",
+			lang:        language.Perl,
+			source: `if (/hello"world/) {
+  # the comment
+  print "Yo!"
+}
+`,
+			want: []*Comment{
+				{
+					StartLine: 2,
+					EndLine:   2,
+					Text:      " the comment",
+				},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		got := Parse([]byte(tt.source), tt.lang)
+		if !reflect.DeepEqual(got, tt.want) {
+			t.Errorf("Mismatch(%q) = %+v, want %+v", tt.description, got, tt.want)
+		}
+	}
+}
+
+func TestCommentParser_ChunkIterator(t *testing.T) {
+	tests := []struct {
+		description string
+		comments    Comments
+		want        []Comments
+	}{
+		{
+			description: "Empty Comments",
+			comments:    Comments{},
+			want:        nil,
+		},
+		{
+			description: "Single Line Comment Chunk",
+			comments: Comments{
+				{StartLine: 1, EndLine: 1, Text: "Block 1 line 1"},
+				{StartLine: 2, EndLine: 2, Text: "Block 1 line 2"},
+			},
+			want: []Comments{
+				Comments{
+					{StartLine: 1, EndLine: 1, Text: "Block 1 line 1"},
+					{StartLine: 2, EndLine: 2, Text: "Block 1 line 2"},
+				},
+			},
+		},
+		{
+			description: "Multiline Comment Chunk",
+			comments: Comments{
+				{StartLine: 1, EndLine: 3, Text: "Multiline 1\n2\n3"},
+			},
+			want: []Comments{
+				Comments{{StartLine: 1, EndLine: 3, Text: "Multiline 1\n2\n3"}},
+			},
+		},
+		{
+			description: "Multiple Single Line Comment Chunks",
+			comments: Comments{
+				{StartLine: 1, EndLine: 1, Text: "Block 1 line 1"},
+				{StartLine: 2, EndLine: 2, Text: "Block 1 line 2"},
+				{StartLine: 5, EndLine: 5, Text: "Block 2 line 1"},
+				{StartLine: 6, EndLine: 6, Text: "Block 2 line 2"},
+				{StartLine: 10, EndLine: 10, Text: "Block 3 line 1"},
+				{StartLine: 11, EndLine: 11, Text: "Block 3 line 2"},
+				{StartLine: 13, EndLine: 13, Text: "Block 3 line 3"},
+			},
+			want: []Comments{
+				Comments{
+					{StartLine: 1, EndLine: 1, Text: "Block 1 line 1"},
+					{StartLine: 2, EndLine: 2, Text: "Block 1 line 2"},
+				},
+				Comments{
+					{StartLine: 5, EndLine: 5, Text: "Block 2 line 1"},
+					{StartLine: 6, EndLine: 6, Text: "Block 2 line 2"},
+				},
+				Comments{
+					{StartLine: 10, EndLine: 10, Text: "Block 3 line 1"},
+					{StartLine: 11, EndLine: 11, Text: "Block 3 line 2"},
+					{StartLine: 13, EndLine: 13, Text: "Block 3 line 3"},
+				},
+			},
+		},
+		{
+			description: "Multiline Comment Chunk",
+			comments: Comments{
+				{StartLine: 1, EndLine: 3, Text: "Multiline 1\n2\n3"},
+				{StartLine: 4, EndLine: 6, Text: "Multiline 1\n2\n3"},
+			},
+			want: []Comments{
+				Comments{{StartLine: 1, EndLine: 3, Text: "Multiline 1\n2\n3"}},
+				Comments{{StartLine: 4, EndLine: 6, Text: "Multiline 1\n2\n3"}},
+			},
+		},
+		{
+			description: "Multiline and Single Line Comment Chunks",
+			comments: Comments{
+				{StartLine: 1, EndLine: 3, Text: "Multiline 1\n2\n3"},
+				{StartLine: 4, EndLine: 4, Text: "Block 2 line 1"},
+				{StartLine: 5, EndLine: 5, Text: "Block 2 line 2"},
+			},
+			want: []Comments{
+				Comments{
+					{StartLine: 1, EndLine: 3, Text: "Multiline 1\n2\n3"},
+				},
+				Comments{
+					{StartLine: 4, EndLine: 4, Text: "Block 2 line 1"},
+					{StartLine: 5, EndLine: 5, Text: "Block 2 line 2"},
+				},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		i := 0
+		for got := range tt.comments.ChunkIterator() {
+			if i >= len(tt.want) {
+				t.Errorf("Mismatch(%q) more comment chunks than expected = %v, want %v",
+					tt.description, i+1, len(tt.want))
+				break
+			}
+			if !reflect.DeepEqual(got, tt.want[i]) {
+				t.Errorf("Mismatch(%q) = %+v, want %+v", tt.description, got, tt.want[i])
+			}
+			i++
+		}
+		if i != len(tt.want) {
+			t.Errorf("Mismatch(%q) not enough comment chunks = %v, want %v",
+				tt.description, i, len(tt.want))
+		}
+	}
+}
diff --git a/internal/commentparser/language/language.go b/internal/commentparser/language/language.go
new file mode 100644
index 0000000..36e04eb
--- /dev/null
+++ b/internal/commentparser/language/language.go
@@ -0,0 +1,268 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package language contains methods and information about the different
+// programming languages the comment parser supports.
+package language
+
+import (
+	"path/filepath"
+	"strings"
+)
+
+// Language is the progamming language we're grabbing the comments from.
+type Language int
+
+// Languages we can retrieve comments from.
+const (
+	Unknown Language = iota
+	AppleScript
+	Assembly
+	Batch
+	C
+	Clif
+	Clojure
+	CMake
+	CSharp
+	Dart
+	Fortran
+	GLSLF // OpenGL Shading Language
+	Go
+	Haskell
+	HTML
+	Java
+	JavaScript
+	Flex
+	Lisp
+	NinjaBuild
+	ObjectiveC
+	Perl
+	Python
+	R
+	Ruby
+	Rust
+	Shader
+	Shell
+	SQL
+	Swift
+	SWIG
+	TypeScript
+	Yacc
+	Yaml
+)
+
+// style is the comment styles that a language uses.
+type style int
+
+// Comment styles.
+const (
+	unknown     style = iota
+	applescript       // -- ... and (* ... *)
+	batch             // @REM
+	bcpl              // // ... and /* ... */
+	cmake             // # ... and #[[ ... ]]
+	fortran           // ! ...
+	haskell           // -- ... and {- ... -}
+	html              // <!-- ... -->
+	lisp              // ;;
+	ruby              // # ... and =begin ... =end
+	shell             // # ...
+)
+
+// ClassifyLanguage determines what language the source code was written in. It
+// does this by looking at the file's extension.
+func ClassifyLanguage(filename string) Language {
+	ext := strings.ToLower(filepath.Ext(filename))
+	if len(ext) == 0 || ext[0] != '.' {
+		return Unknown
+	}
+
+	switch ext[1:] { // Skip the '.'.
+	case "applescript":
+		return AppleScript
+	case "bat":
+		return Batch
+	case "c", "cc", "cpp", "c++", "h", "hh", "hpp":
+		return C
+	case "clif":
+		return Clif
+	case "cmake":
+		return CMake
+	case "cs":
+		return CSharp
+	case "dart":
+		return Dart
+	case "f", "f90", "f95":
+		return Fortran
+	case "glslf":
+		return GLSLF
+	case "go":
+		return Go
+	case "hs":
+		return Haskell
+	case "html", "htm", "md", "ng", "sgml":
+		return HTML
+	case "java":
+		return Java
+	case "js":
+		return JavaScript
+	case "l":
+		return Flex
+	case "lisp", "el", "clj":
+		return Lisp
+	case "m", "mm":
+		return ObjectiveC
+	case "gn":
+		return NinjaBuild
+	case "pl", "pm":
+		return Perl
+	case "py", "pi":
+		return Python
+	case "r":
+		return R
+	case "rb":
+		return Ruby
+	case "rs":
+		return Rust
+	case "s":
+		return Assembly
+	case "sh":
+		return Shell
+	case "shader":
+		return Shader
+	case "sql":
+		return SQL
+	case "swift":
+		return Swift
+	case "swig":
+		return SWIG
+	case "ts", "tsx":
+		return TypeScript
+	case "y":
+		return Yacc
+	case "yaml":
+		return Yaml
+	}
+	return Unknown
+}
+
+// commentStyle returns the language's comment style.
+func (lang Language) commentStyle() style {
+	switch lang {
+	case Assembly, C, CSharp, Dart, GLSLF, Go, Java, JavaScript, Flex, ObjectiveC, Rust, Shader, Swift, SWIG, TypeScript, Yacc:
+		return bcpl
+	case Batch:
+		return batch
+	case CMake:
+		return cmake
+	case Fortran:
+		return fortran
+	case Haskell, SQL:
+		return haskell
+	case HTML:
+		return html
+	case Clojure, Lisp:
+		return lisp
+	case Ruby:
+		return ruby
+	case Clif, NinjaBuild, Perl, Python, R, Shell, Yaml:
+		return shell
+	}
+	return unknown
+}
+
+// SingleLineCommentStart returns the starting string of a single line comment
+// for the given language. There is no equivalent "End" method, because it's
+// the end of line.
+func (lang Language) SingleLineCommentStart() string {
+	switch lang.commentStyle() {
+	case applescript, haskell:
+		return "--"
+	case batch:
+		return "@REM"
+	case bcpl:
+		return "//"
+	case fortran:
+		return "!"
+	case lisp:
+		return ";"
+	case shell, ruby, cmake:
+		return "#"
+	}
+	return ""
+}
+
+// MultilineCommentStart returns the starting string of a multiline comment for
+// the given language.
+func (lang Language) MultilineCommentStart() string {
+	switch lang.commentStyle() {
+	case applescript:
+		return "(*"
+	case bcpl:
+		if lang != Rust {
+			return "/*"
+		}
+	case cmake:
+		return "#[["
+	case haskell:
+		return "{-"
+	case html:
+		return "<!--"
+	case ruby:
+		return "=begin"
+	}
+	return ""
+}
+
+// MultilineCommentEnd returns the ending string of a multiline comment for the
+// given langauge.
+func (lang Language) MultilineCommentEnd() string {
+	switch lang.commentStyle() {
+	case applescript:
+		return "*)"
+	case bcpl:
+		if lang != Rust {
+			return "*/"
+		}
+	case cmake:
+		return "]]"
+	case haskell:
+		return "-}"
+	case html:
+		return "-->"
+	case ruby:
+		return "=end"
+	}
+	return ""
+}
+
+// QuoteCharacter returns 'true' if the character is considered the beginning
+// of a string in the given language. The second return value is true if the
+// string allows for escaping.
+func (lang Language) QuoteCharacter(quote rune) (ok bool, escape bool) {
+	switch quote {
+	case '"', '\'':
+		return true, true
+	case '`':
+		if lang == Go {
+			return true, false
+		}
+	}
+	return false, false
+}
+
+// NestedComments returns true if the language allows for nested multiline comments.
+func (lang Language) NestedComments() bool {
+	return lang == Swift
+}
diff --git a/internal/sets/sets.go b/internal/sets/sets.go
new file mode 100644
index 0000000..f34ae5b
--- /dev/null
+++ b/internal/sets/sets.go
@@ -0,0 +1,20 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sets provides sets for storing collections of unique elements.
+package sets
+
+// present is an empty struct used as the "value" in the map[int], since
+// empty structs consume zero bytes (unlike 1 unnecessary byte per bool).
+type present struct{}
diff --git a/internal/sets/sets_benchmark_test.go b/internal/sets/sets_benchmark_test.go
new file mode 100644
index 0000000..017dca6
--- /dev/null
+++ b/internal/sets/sets_benchmark_test.go
@@ -0,0 +1,213 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package sets
+
+import (
+	"strings"
+	"testing"
+)
+
+const (
+	postmodernThesisCollapse = `1. Expressions of collapse
+
+If one examines postcultural Marxism, one is faced with a choice: either
+reject capitalist submodern theory or conclude that the purpose of the reader
+is significant form. Bataille uses the term ‘capitalist construction’ to denote
+not, in fact, discourse, but prediscourse.
+
+Therefore, in Stardust, Gaiman analyses postcultural Marxism; in
+The Books of Magic, although, he denies capitalist submodern theory. If
+capitalist construction holds, we have to choose between capitalist submodern
+theory and Baudrillardist simulacra.
+
+However, conceptualist socialism implies that narrativity may be used to
+oppress the proletariat, given that sexuality is distinct from art. The subject
+is interpolated into a capitalist construction that includes language as a
+paradox.
+`
+	postmodernThesisNarratives = `1. Narratives of failure
+
+The main theme of the works of Joyce is the defining characteristic, and some
+would say the economy, of neocultural class. But Bataille promotes the use of
+socialist realism to deconstruct sexual identity.
+
+The subject is interpolated into a Baudrillardist simulation that includes
+consciousness as a whole. Thus, the primary theme of Pickett's[1] model of
+socialist realism is the role of the reader as artist.
+
+The subject is contextualised into a postcapitalist discourse that includes
+language as a paradox. It could be said that if Baudrillardist simulation
+holds, the works of Gibson are postmodern. The characteristic theme of the
+works of Gibson is the common ground between society and narrativity. However,
+Sartre uses the term 'postcapitalist discourse' to denote not, in fact,
+narrative, but postnarrative.
+`
+)
+
+var (
+	// Word lists:
+	stringsA = strings.Fields(postmodernThesisCollapse)
+	stringsB = strings.Fields(postmodernThesisNarratives)
+)
+
+func BenchmarkStringSets_NewStringSet(b *testing.B) {
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		NewStringSet(stringsA...)
+	}
+}
+
+func BenchmarkStringSets_Copy(b *testing.B) {
+	s := NewStringSet(stringsA...)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.Copy()
+	}
+}
+
+func BenchmarkStringSets_Insert(b *testing.B) {
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s := NewStringSet()
+		s.Insert(stringsA...)
+		s.Insert(stringsB...)
+	}
+}
+
+func BenchmarkStringSets_Delete(b *testing.B) {
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s := NewStringSet(stringsA...)
+		s.Delete(stringsB...)
+	}
+}
+
+func BenchmarkStringSets_Intersect(b *testing.B) {
+	s := NewStringSet(stringsA...)
+	t := NewStringSet(stringsB...)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.Intersect(t)
+	}
+}
+
+func BenchmarkStringSets_Disjoint(b *testing.B) {
+	s := NewStringSet(stringsA...)
+	t := NewStringSet(stringsB...)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.Disjoint(t)
+	}
+}
+
+func BenchmarkStringSets_Difference(b *testing.B) {
+	s := NewStringSet(stringsA...)
+	t := NewStringSet(stringsB...)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.Difference(t)
+	}
+}
+
+func BenchmarkStringSets_Unique(b *testing.B) {
+	s := NewStringSet(stringsA...)
+	t := NewStringSet(stringsB...)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.Unique(t)
+	}
+}
+
+func BenchmarkStringSets_Equal(b *testing.B) {
+	s := NewStringSet(stringsA...)
+	t := NewStringSet(stringsB...)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.Equal(t)
+	}
+}
+
+func BenchmarkStringSets_Union(b *testing.B) {
+	s := NewStringSet(stringsA...)
+	t := NewStringSet(stringsB...)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.Union(t)
+	}
+}
+
+func BenchmarkStringSets_Contains(b *testing.B) {
+	s := NewStringSet(stringsA...)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for _, w := range stringsB {
+			s.Contains(w)
+		}
+	}
+}
+
+func BenchmarkStringSets_Len(b *testing.B) {
+	s := NewStringSet(stringsA...)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.Len()
+	}
+}
+
+func BenchmarkStringSets_Empty(b *testing.B) {
+	s := NewStringSet(stringsA...)
+	t := NewStringSet()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.Empty()
+		t.Empty()
+	}
+}
+
+func BenchmarkStringSets_Elements(b *testing.B) {
+	s := NewStringSet(stringsA...)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.Elements()
+	}
+}
+
+func BenchmarkStringSets_Sorted(b *testing.B) {
+	s := NewStringSet(stringsA...)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.Sorted()
+	}
+}
+
+func BenchmarkStringSets_String(b *testing.B) {
+	s := NewStringSet(stringsA...)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		s.String()
+	}
+}
diff --git a/internal/sets/stringset.go b/internal/sets/stringset.go
new file mode 100644
index 0000000..9f93450
--- /dev/null
+++ b/internal/sets/stringset.go
@@ -0,0 +1,228 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package sets
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+)
+
+// StringSet stores a set of unique string elements.
+type StringSet struct {
+	set map[string]present
+}
+
+// NewStringSet creates a StringSet containing the supplied initial string elements.
+func NewStringSet(elements ...string) *StringSet {
+	s := &StringSet{}
+	s.set = make(map[string]present)
+	s.Insert(elements...)
+	return s
+}
+
+// Copy returns a newly allocated copy of the supplied StringSet.
+func (s *StringSet) Copy() *StringSet {
+	c := NewStringSet()
+	if s != nil {
+		for e := range s.set {
+			c.set[e] = present{}
+		}
+	}
+	return c
+}
+
+// Insert zero or more string elements into the StringSet.
+// As expected for a Set, elements already present in the StringSet are
+// simply ignored.
+func (s *StringSet) Insert(elements ...string) {
+	for _, e := range elements {
+		s.set[e] = present{}
+	}
+}
+
+// Delete zero or more string elements from the StringSet.
+// Any elements not present in the StringSet are simply ignored.
+func (s *StringSet) Delete(elements ...string) {
+	for _, e := range elements {
+		delete(s.set, e)
+	}
+}
+
+// Intersect returns a new StringSet containing the intersection of the
+// receiver and argument StringSets.  Returns an empty set if the argument is nil.
+func (s *StringSet) Intersect(other *StringSet) *StringSet {
+	if other == nil {
+		return NewStringSet()
+	}
+
+	// Point a and b to the maps, setting a to the smaller of the two.
+	a, b := s.set, other.set
+	if len(b) < len(a) {
+		a, b = b, a
+	}
+
+	// Perform the intersection.
+	intersect := NewStringSet()
+	for e := range a {
+		if _, ok := b[e]; ok {
+			intersect.set[e] = present{}
+		}
+	}
+	return intersect
+}
+
+// Disjoint returns true if the intersection of the receiver and the argument
+// StringSets is the empty set.  Returns true if the argument is nil or either
+// StringSet is the empty set.
+func (s *StringSet) Disjoint(other *StringSet) bool {
+	if other == nil || len(other.set) == 0 || len(s.set) == 0 {
+		return true
+	}
+
+	// Point a and b to the maps, setting a to the smaller of the two.
+	a, b := s.set, other.set
+	if len(b) < len(a) {
+		a, b = b, a
+	}
+
+	// Check for non-empty intersection.
+	for e := range a {
+		if _, ok := b[e]; ok {
+			return false // Early-exit because intersecting.
+		}
+	}
+	return true
+}
+
+// Difference returns a new StringSet containing the elements in the receiver
+// that are not present in the argument StringSet. Returns a copy of the
+// receiver if the argument is nil.
+func (s *StringSet) Difference(other *StringSet) *StringSet {
+	if other == nil {
+		return s.Copy()
+	}
+
+	// Insert only the elements in the receiver that are not present in the
+	// argument StringSet.
+	diff := NewStringSet()
+	for e := range s.set {
+		if _, ok := other.set[e]; !ok {
+			diff.set[e] = present{}
+		}
+	}
+	return diff
+}
+
+// Unique returns a new StringSet containing the elements in the receiver
+// that are not present in the argument StringSet *and* the elements in the
+// argument StringSet that are not in the receiver (which is the union of two
+// disjoint sets). Returns a copy of the
+// receiver if the argument is nil.
+func (s *StringSet) Unique(other *StringSet) *StringSet {
+	if other == nil {
+		return s.Copy()
+	}
+
+	sNotInOther := s.Difference(other)
+	otherNotInS := other.Difference(s)
+
+	// Duplicate Union implementation here to avoid extra Copy, since both
+	// sNotInOther and otherNotInS are already copies.
+	unique := sNotInOther
+	for e := range otherNotInS.set {
+		unique.set[e] = present{}
+	}
+	return unique
+}
+
+// Equal returns true if the receiver and the argument StringSet contain
+// exactly the same elements. Returns false if the argument is nil.
+func (s *StringSet) Equal(other *StringSet) bool {
+	if s == nil || other == nil {
+		return s == nil && other == nil
+	}
+
+	// Two sets of different length cannot have the exact same unique elements.
+	if len(s.set) != len(other.set) {
+		return false
+	}
+
+	// Only one loop is needed. If the two sets are known to be of equal
+	// length, then the two sets are equal only if exactly all of the elements
+	// in the first set are found in the second.
+	for e := range s.set {
+		if _, ok := other.set[e]; !ok {
+			return false
+		}
+	}
+
+	return true
+}
+
+// Union returns a new StringSet containing the union of the receiver and
+// argument StringSets.  Returns a copy of the receiver if the argument is nil.
+func (s *StringSet) Union(other *StringSet) *StringSet {
+	union := s.Copy()
+	if other != nil {
+		for e := range other.set {
+			union.set[e] = present{}
+		}
+	}
+	return union
+}
+
+// Contains returns true if element is in the StringSet.
+func (s *StringSet) Contains(element string) bool {
+	_, in := s.set[element]
+	return in
+}
+
+// Len returns the number of unique elements in the StringSet.
+func (s *StringSet) Len() int {
+	return len(s.set)
+}
+
+// Empty returns true if the receiver is the empty set.
+func (s *StringSet) Empty() bool {
+	return len(s.set) == 0
+}
+
+// Elements returns a []string of the elements in the StringSet, in no
+// particular (or consistent) order.
+func (s *StringSet) Elements() []string {
+	elements := []string{} // Return at least an empty slice rather than nil.
+	for e := range s.set {
+		elements = append(elements, e)
+	}
+	return elements
+}
+
+// Sorted returns a sorted []string of the elements in the StringSet.
+func (s *StringSet) Sorted() []string {
+	elements := s.Elements()
+	sort.Strings(elements)
+	return elements
+}
+
+// String formats the StringSet elements as sorted strings, representing them
+// in "array initializer" syntax.
+func (s *StringSet) String() string {
+	elements := s.Sorted()
+	var quoted []string
+	for _, e := range elements {
+		quoted = append(quoted, fmt.Sprintf("%q", e))
+	}
+	return fmt.Sprintf("{%s}", strings.Join(quoted, ", "))
+}
diff --git a/internal/sets/stringset_test.go b/internal/sets/stringset_test.go
new file mode 100644
index 0000000..e37efaa
--- /dev/null
+++ b/internal/sets/stringset_test.go
@@ -0,0 +1,425 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package sets
+
+import (
+	"sort"
+	"testing"
+)
+
+func checkSameStringSet(t *testing.T, set *StringSet, unique []string) {
+	// Check that lengths are the same.
+	want := len(unique)
+	got := set.Len()
+
+	if got != want {
+		t.Errorf("NewStringSet(%v) want length %v, got %v", unique, want, got)
+	}
+
+	// Check that all strings are present in set.
+	for _, s := range unique {
+		want := true
+		got := set.Contains(s)
+
+		if got != want {
+			t.Errorf("Contains(%v) want %v, got %v", s, want, got)
+		}
+	}
+
+	// Check that all elements are present in strings.
+	sort.Strings(unique)
+
+	for i, got := range set.Sorted() {
+		want := unique[i]
+
+		if got != want {
+			t.Errorf("Sorted(%d) want %v, got %v", i, want, got)
+		}
+	}
+}
+
+func TestNewStringSet(t *testing.T) {
+	empty := NewStringSet()
+	want := 0
+	got := empty.Len()
+
+	if got != want {
+		t.Errorf("NewStringSet() want length %v, got %v", want, got)
+	}
+
+	unique := []string{"a", "b", "c"}
+	set := NewStringSet(unique...)
+	checkSameStringSet(t, set, unique)
+
+	// Append an already-present element.
+	nonUnique := append(unique, unique[0])
+	set = NewStringSet(nonUnique...)
+
+	// Non-unique unique should collapse to one.
+	want = len(unique)
+	got = set.Len()
+
+	if got != want {
+		t.Errorf("NewStringSet(%v) want length %v, got %v", nonUnique, want, got)
+	}
+}
+
+func TestStringSet_Copy(t *testing.T) {
+	// Check both copies represent the same set.
+	base := []string{"a", "b", "c"}
+	orig := NewStringSet(base...)
+	cpy := orig.Copy()
+	checkSameStringSet(t, orig, base)
+	checkSameStringSet(t, cpy, base)
+
+	// Check the two copies are independent.
+	more := []string{"d"}
+	orig.Insert(more...)
+	more = append(base, more...)
+	checkSameStringSet(t, orig, more)
+	checkSameStringSet(t, cpy, base)
+}
+
+func TestStringSet_Insert(t *testing.T) {
+	unique := []string{"a", "b", "c"}
+	set := NewStringSet(unique...)
+
+	// Insert existing element, which should basically be a no-op.
+	set.Insert(unique[0])
+	checkSameStringSet(t, set, unique)
+
+	// Actually insert new unique elements.
+	additional := []string{"d", "e"}
+	longer := append(unique, additional...)
+	set.Insert(additional...)
+	checkSameStringSet(t, set, longer)
+}
+
+func TestStringSet_Delete(t *testing.T) {
+	unique := []string{"a", "b", "c"}
+	set := NewStringSet(unique...)
+
+	// Delete non-existent element, which should basically be a no-op.
+	set.Delete("z")
+	checkSameStringSet(t, set, unique)
+
+	// Actually delete existing elements.
+	set.Delete(unique[1:]...)
+	checkSameStringSet(t, set, unique[:1])
+}
+
+func TestStringSet_Intersect(t *testing.T) {
+	input1 := []string{"a", "c", "d", "e", "f"}
+	input2 := []string{"b", "c", "e"}
+
+	// Check Intersect(nil) returns an empty set.
+	setA := NewStringSet(input1...)
+	got := setA.Intersect(nil)
+	checkSameStringSet(t, got, []string{})
+	// Check that the receiver is unchanged.
+	checkSameStringSet(t, setA, input1)
+
+	// Check Intersect returns the correct result.
+	setB := NewStringSet(input2...)
+	got = setA.Intersect(setB)
+	want := []string{"c", "e"}
+	checkSameStringSet(t, got, want)
+	// Also check the sources are unchanged.
+	checkSameStringSet(t, setA, input1)
+	checkSameStringSet(t, setB, input2)
+
+	// Reverse the inputs and verify Intersect produces the same results.
+	setA = NewStringSet(input2...)
+	setB = NewStringSet(input1...)
+	got = setA.Intersect(setB)
+	checkSameStringSet(t, got, want)
+	// Check the sources are again unchanged.
+	checkSameStringSet(t, setA, input2)
+	checkSameStringSet(t, setB, input1)
+}
+
+func TestStringSet_Disjoint(t *testing.T) {
+	input1 := []string{"a", "c", "d", "e", "f"}
+	input2 := []string{"b", "c", "e"}
+	input3 := []string{"x", "y", "z"}
+
+	// Check that sets are always disjoint with the empty set or nil
+	setA := NewStringSet(input1...)
+	emptySet := NewStringSet()
+
+	if disjoint := setA.Disjoint(nil); !disjoint {
+		t.Errorf("Disjoint(%s, %v) want %v, got %v", setA, nil, true, disjoint)
+	}
+
+	if disjoint := setA.Disjoint(emptySet); !disjoint {
+		t.Errorf("Disjoint(%s, %s) want %v, got %v", setA, emptySet, true, disjoint)
+	}
+
+	if disjoint := emptySet.Disjoint(setA); !disjoint {
+		t.Errorf("Disjoint(%s, %s) want %v, got %v", emptySet, setA, true, disjoint)
+	}
+
+	if disjoint := emptySet.Disjoint(emptySet); !disjoint {
+		t.Errorf("Disjoint(%s, %s) want %v, got %v", emptySet, emptySet, true, disjoint)
+	}
+
+	// Also check the sources are unchanged.
+	checkSameStringSet(t, setA, input1)
+	checkSameStringSet(t, emptySet, []string{})
+
+	// Check two non-empty, non-nil disjoint sets.
+	setC := NewStringSet(input3...)
+
+	if disjoint := setA.Disjoint(setC); !disjoint {
+		t.Errorf("Disjoint(%s, %s) want %v, got %v", setA, setC, true, disjoint)
+	}
+
+	// Also check the sources are unchanged.
+	checkSameStringSet(t, setA, input1)
+	checkSameStringSet(t, setC, input3)
+
+	// Check that two intersecting sets are not Disjoint.
+	setB := NewStringSet(input2...)
+
+	if disjoint := setA.Disjoint(setB); disjoint {
+		t.Errorf("Disjoint(%s, %s) want %v, got %v", setA, setB, false, disjoint)
+	}
+
+	// Also check the sources are unchanged.
+	checkSameStringSet(t, setA, input1)
+	checkSameStringSet(t, setB, input2)
+}
+
+func TestStringSet_Difference(t *testing.T) {
+	input1 := []string{"a", "c", "d", "e", "f"}
+	input2 := []string{"b", "c", "e"}
+	input3 := []string{"x", "y", "z"}
+
+	// Check Difference(nil) returns a copy of the receiver.
+	setA := NewStringSet(input1...)
+	got := setA.Difference(nil)
+	checkSameStringSet(t, got, input1)
+	// Check that the receiver is unchanged.
+	checkSameStringSet(t, setA, input1)
+
+	// Check A - A returns the empty set.
+	got = setA.Difference(setA)
+
+	if !got.Empty() {
+		t.Errorf("Difference(%s, %s).Empty() want %v, got %v",
+			setA, setA, true, false)
+	}
+
+	checkSameStringSet(t, got, []string{})
+	// Check that the receiver is unchanged.
+	checkSameStringSet(t, setA, input1)
+
+	// Check A - C simply returns elements in A if A and C are disjoint.
+	setC := NewStringSet(input3...)
+	got = setA.Difference(setC)
+	checkSameStringSet(t, got, input1)
+	// Also check the sources are unchanged.
+	checkSameStringSet(t, setA, input1)
+	checkSameStringSet(t, setC, input3)
+
+	// Check A - B returns elements in A not in B.
+	setB := NewStringSet(input2...)
+	got = setA.Difference(setB)
+	want := []string{"a", "d", "f"}
+	checkSameStringSet(t, got, want)
+
+	// Also check the sources are unchanged.
+	checkSameStringSet(t, setA, input1)
+	checkSameStringSet(t, setB, input2)
+
+	// Check B - A returns elements in B not in A.
+	got = setB.Difference(setA)
+	want = []string{"b"}
+	checkSameStringSet(t, got, want)
+	// Also check the sources are unchanged.
+	checkSameStringSet(t, setA, input1)
+	checkSameStringSet(t, setB, input2)
+}
+
+func TestStringSet_Unique(t *testing.T) {
+	input1 := []string{"a", "c", "d", "e", "f"}
+	input2 := []string{"b", "c", "e"}
+	input3 := []string{"x", "y", "z"}
+
+	// Check Unique(nil) returns a copy of the receiver.
+	setA := NewStringSet(input1...)
+	got := setA.Unique(nil)
+	checkSameStringSet(t, got, input1)
+	// Check that the receiver is unchanged.
+	checkSameStringSet(t, setA, input1)
+
+	// Check Unique returns only elements in A and B not in both A and B.
+	setB := NewStringSet(input2...)
+	got = setA.Unique(setB)
+	want := []string{"a", "b", "d", "f"}
+	checkSameStringSet(t, got, want)
+	// Also check the sources are unchanged.
+	checkSameStringSet(t, setA, input1)
+	checkSameStringSet(t, setB, input2)
+
+	// Check Unique of two disjoint sets is the Union of those sets.
+	setC := NewStringSet(input3...)
+	got = setA.Unique(setC)
+	union := setA.Union(setC)
+
+	if equal := union.Equal(got); !equal {
+		t.Errorf("Union of disjoint Equal(%s, %s) want %v, got %v",
+			union, got, true, equal)
+	}
+
+	// Check Unique is the Union of A - B and B - A.
+	aNotInB := setA.Difference(setB)
+	bNotInA := setB.Difference(setA)
+	union = aNotInB.Union(bNotInA)
+	want = []string{"a", "b", "d", "f"}
+	checkSameStringSet(t, union, want)
+	got = setA.Unique(setB)
+
+	if equal := union.Equal(got); !equal {
+		t.Errorf("Union of differences Equal(%s, %s) want %v, got %v",
+			union, got, true, equal)
+	}
+
+	// Also check the sources are unchanged.
+	checkSameStringSet(t, setA, input1)
+	checkSameStringSet(t, setB, input2)
+}
+
+func TestStringSet_Equal(t *testing.T) {
+	input1 := []string{"a", "c", "d", "e", "f"}
+	input2 := []string{"b", "c", "e"}
+	input3 := []string{"a", "c", "d", "e", "g"}
+
+	// Check Equal(nil) returns false.
+	setA := NewStringSet(input1...)
+
+	if equal := setA.Equal(nil); equal {
+		t.Errorf("Equal(%s, %v) want %v, got %v", setA, nil, false, true)
+	}
+
+	// Check that the receiver is unchanged.
+	checkSameStringSet(t, setA, input1)
+
+	// Check Equal returns true for a set and itself.
+	if equal := setA.Equal(setA); !equal {
+		t.Errorf("Equal(%s, %s) want %v, got %v", setA, setA, true, false)
+	}
+
+	// Check that the receiver is unchanged.
+	checkSameStringSet(t, setA, input1)
+
+	// Check Equal returns false for sets of non-equal length.
+	setB := NewStringSet(input2...)
+
+	if equal := setA.Equal(setB); equal {
+		t.Errorf("Equal(%s, %s) want %v, got %v", setA, setB, false, true)
+	}
+
+	// Also check the sources are unchanged.
+	checkSameStringSet(t, setA, input1)
+	checkSameStringSet(t, setB, input2)
+
+	// Check Equal returns false for equal-length sets with different elements.
+	setC := NewStringSet(input3...)
+
+	if equal := setA.Equal(setC); equal {
+		t.Errorf("Equal(%s, %s) want %v, got %v", setA, setC, false, true)
+	}
+
+	if equal := setC.Equal(setA); equal {
+		t.Errorf("Equal(%s, %s) want %v, got %v", setC, setA, false, true)
+	}
+
+	// Also check the sources are unchanged.
+	checkSameStringSet(t, setA, input1)
+	checkSameStringSet(t, setC, input3)
+
+	// Check Equal returns true for a set with itself.
+	if equal := setA.Equal(setA); !equal {
+		t.Errorf("Equal(%s, %s) want %v, got %v", setA, setA, true, false)
+	}
+
+	// Also check the source is unchanged.
+	checkSameStringSet(t, setA, input1)
+
+	// Check Equal returns true for two separate equal sets.
+	anotherA := NewStringSet(input1...)
+
+	if equal := setA.Equal(anotherA); !equal {
+		t.Errorf("Equal(%s, %s) want %v, got %v", setA, anotherA, true, false)
+	}
+
+	// Also check the sources are unchanged.
+	checkSameStringSet(t, setA, input1)
+	checkSameStringSet(t, anotherA, input1)
+
+	// Check for equality comparing to nil struct.
+	var nilSet *StringSet
+	if equal := nilSet.Equal(setA); equal {
+		t.Errorf("Equal(%s, %s) want %v, got %v", nilSet, setA, false, true)
+	}
+	if equal := setA.Equal(nilSet); equal {
+		t.Errorf("Equal(%s, %s) want %v, got %v", setA, nilSet, false, true)
+	}
+	if equal := nilSet.Equal(nilSet); !equal {
+		t.Errorf("Equal(%s, %s) want %v, got %v", nilSet, nilSet, true, false)
+	}
+
+	// Edge case: consider the empty set to be different than the nil set.
+	emptySet := NewStringSet()
+	if equal := nilSet.Equal(emptySet); equal {
+		t.Errorf("Equal(%s, %s) want %v, got %v", nilSet, emptySet, false, true)
+	}
+	if equal := emptySet.Equal(nilSet); equal {
+		t.Errorf("Equal(%s, %s) want %v, got %v", emptySet, nilSet, false, true)
+	}
+	if equal := emptySet.Equal(emptySet); !equal {
+		t.Errorf("Equal(%s, %s) want %v, got %v", emptySet, emptySet, true, false)
+	}
+}
+
+func TestStringSet_Union(t *testing.T) {
+	input1 := []string{"a", "c", "d", "e", "f"}
+	input2 := []string{"b", "c", "e"}
+
+	// Check Union(nil) returns a copy of the receiver.
+	setA := NewStringSet(input1...)
+	got := setA.Union(nil)
+	checkSameStringSet(t, got, input1)
+	// Check that the receiver is unchanged.
+	checkSameStringSet(t, setA, input1)
+
+	// Check Union returns the correct result.
+	setB := NewStringSet(input2...)
+	got = setA.Union(setB)
+	want := []string{"a", "b", "c", "d", "e", "f"}
+	checkSameStringSet(t, got, want)
+	// Also check the sources are unchanged.
+	checkSameStringSet(t, setA, input1)
+	checkSameStringSet(t, setB, input2)
+
+	// Reverse the inputs and verify Union produces the same results.
+	setA = NewStringSet(input2...)
+	setB = NewStringSet(input1...)
+	got = setA.Union(setB)
+	checkSameStringSet(t, got, want)
+	// Check the sources are again unchanged.
+	checkSameStringSet(t, setA, input2)
+	checkSameStringSet(t, setB, input1)
+}
author	Bill Wendling <morbo@google.com>	2017-04-09 20:49:36 -0700
committer	Bill Wendling <morbo@google.com>	2017-04-09 20:49:36 -0700
commit	85ea7edc49509df714ac09a6166e514b05e30d6d (patch)
tree	2bb8c36b5938ce5af17775a382e2a0e8608e2723 /internal
download	licenseclassifier-85ea7edc49509df714ac09a6166e514b05e30d6d.tar.gz