aboutsummaryrefslogtreecommitdiff
path: root/internal
diff options
context:
space:
mode:
authorBill Wendling <morbo@google.com>2017-04-09 20:49:36 -0700
committerBill Wendling <morbo@google.com>2017-04-09 20:49:36 -0700
commit85ea7edc49509df714ac09a6166e514b05e30d6d (patch)
tree2bb8c36b5938ce5af17775a382e2a0e8608e2723 /internal
downloadlicenseclassifier-85ea7edc49509df714ac09a6166e514b05e30d6d.tar.gz
Initial commit of license classifier.
Diffstat (limited to 'internal')
-rw-r--r--internal/commentparser/comment_parser.go336
-rw-r--r--internal/commentparser/comment_parser_test.go413
-rw-r--r--internal/commentparser/language/language.go268
-rw-r--r--internal/sets/sets.go20
-rw-r--r--internal/sets/sets_benchmark_test.go213
-rw-r--r--internal/sets/stringset.go228
-rw-r--r--internal/sets/stringset_test.go425
7 files changed, 1903 insertions, 0 deletions
diff --git a/internal/commentparser/comment_parser.go b/internal/commentparser/comment_parser.go
new file mode 100644
index 0000000..0cc3fe3
--- /dev/null
+++ b/internal/commentparser/comment_parser.go
@@ -0,0 +1,336 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package commentparser does a basic parse over a source file and returns all
+// of the comments from the code. This is useful for when you want to analyze
+// text written in comments (like copyright notices) but not in the code
+// itself.
+package commentparser
+
+import (
+ "bytes"
+ "log"
+ "strings"
+ "unicode/utf8"
+
+ "github.com/google/licenseclassifier/internal/commentparser/language"
+)
+
+const (
+ eofInString = "%d:EOF in string"
+ eofInSingleLineComment = "%d:EOF in single line comment"
+ eofInMultilineComment = "%d:EOF in multiline comment"
+)
+
+// Parse parses the input data and returns the comments.
+func Parse(contents []byte, lang language.Language) Comments {
+ if len(contents) == 0 {
+ return nil
+ }
+
+ c := string(contents)
+ if !strings.HasSuffix(c, "\n") {
+ // Force a terminating newline if one isn't present.
+ c += "\n"
+ }
+ i := &input{
+ s: c,
+ lang: lang,
+ offset: 0,
+ pos: position{line: 1, lineRune: []int{0}},
+ }
+ i.lex()
+ return i.comments
+}
+
+// Comment is either a single line or multiline comment in a source code file.
+// A single line comment has StartLine equal to EndLine. The lines are 1-based.
+type Comment struct {
+ StartLine int
+ EndLine int
+ Text string
+}
+
+// Comments allows us to treat a slice of comments as a unit.
+type Comments []*Comment
+
+// ChunkIterator returns a read-only channel and generates the comments in a
+// goroutine, then closes the channel.
+func (c Comments) ChunkIterator() <-chan Comments {
+ ch := make(chan Comments)
+ go func() {
+ index := 0
+ prevLine := c.StartLine()
+
+ for ; index < len(c); index++ {
+ var chunk Comments
+ for ; index < len(c); index++ {
+ if c[index].StartLine > prevLine+2 {
+ break
+ }
+ chunk = append(chunk, c[index])
+ prevLine = c[index].StartLine
+ }
+ if len(chunk) == 0 {
+ break
+ }
+
+ ch <- chunk
+ if index >= len(c) {
+ break
+ }
+
+ prevLine = c[index].StartLine
+ index--
+ }
+
+ close(ch)
+ }()
+ return ch
+}
+
+// StartLine is the line number (1-based) the first part of the comment block
+// starts on.
+func (c Comments) StartLine() int {
+ if len(c) == 0 {
+ return 0
+ }
+ return c[0].StartLine
+}
+
+// String creates a string out of the text of the comments. Comment begin and
+// end markers are removed.
+func (c Comments) String() string {
+ var s []string
+ for _, cmt := range c {
+ s = append(s, cmt.Text)
+ }
+ return strings.Join(s, "\n")
+}
+
+// position records the location of a lexeme.
+type position struct {
+ line int // Line number of input: 1-based
+ lineRune []int // Rune offset from beginning of line: 0-based
+}
+
+// input holds the current state of the lexer.
+type input struct {
+ s string // Entire input.
+ lang language.Language // Source code language.
+ offset int // Offset into input.
+ pos position // Current position in the input.
+ comments Comments // Comments in the source file.
+}
+
+// lex is called to obtain the comments.
+func (i *input) lex() {
+ for {
+ c, ok := i.peekRune()
+ if !ok {
+ break
+ }
+
+ switch c {
+ case '"', '\'', '`': // String
+ // Ignore strings because they could contain comment
+ // start or end sequences which we need to ignore.
+ if i.lang == language.HTML {
+ // Quotes in HTML-like files aren't meaningful,
+ // because it's basically plain text
+ break
+ }
+
+ ok, hasEscape := i.lang.QuoteCharacter(c)
+ if !ok {
+ break
+ }
+
+ quote := string(c)
+ if i.lang == language.Python {
+ if c == '\'' && i.match("'''") {
+ quote = "'''"
+ } else if c == '"' && i.match(`"""`) {
+ quote = `"""`
+ } else {
+ i.readRune() // Eat quote.
+ }
+ } else {
+ i.readRune() // Eat quote.
+ }
+
+ startLine := i.pos.line
+ for {
+ c, ok = i.peekRune()
+ if !ok {
+ log.Printf(eofInString, startLine)
+ return
+ }
+ if hasEscape && c == '\\' {
+ i.readRune() // Eat escape.
+ } else if i.match(quote) {
+ break
+ } else if (i.lang == language.JavaScript || i.lang == language.Perl) && c == '\n' {
+ // JavaScript and Perl allow you to
+ // specify regexes without quotes, but
+ // which contain quotes. So treat the
+ // newline as terminating the string.
+ break
+ }
+ i.readRune() // Eat character.
+ if i.eof() {
+ log.Printf(eofInString, startLine)
+ return
+ }
+ }
+ default:
+ startLine := i.pos.line
+ var comment bytes.Buffer
+ if i.singleLineComment() { // Single line comment
+ for {
+ if i.eof() {
+ log.Printf(eofInSingleLineComment, i.pos.line)
+ return
+ }
+ c = i.readRune()
+ if c == '\n' {
+ i.unreadRune(c)
+ break
+ }
+ comment.WriteRune(c)
+ }
+ i.comments = append(i.comments, &Comment{
+ StartLine: startLine,
+ EndLine: i.pos.line,
+ Text: comment.String(),
+ })
+ } else if ok, start, end := i.multiLineComment(); ok { // Multiline comment
+ nesting := 0
+ startLine := i.pos.line
+ for {
+ if i.eof() {
+ log.Printf(eofInMultilineComment, startLine)
+ return
+ }
+ c := i.readRune()
+ comment.WriteRune(c)
+ if i.lang.NestedComments() && i.match(start) {
+ // Allows nested comments.
+ comment.WriteString(start)
+ nesting++
+ }
+ if i.match(end) {
+ if nesting > 0 {
+ comment.WriteString(end)
+ nesting--
+ } else {
+ break
+ }
+ }
+ }
+ i.comments = append(i.comments, &Comment{
+ StartLine: startLine,
+ EndLine: i.pos.line,
+ Text: comment.String(),
+ })
+ }
+ }
+
+ i.readRune() // Ignore non-comments.
+ }
+}
+
+// singleLineComment returns 'true' if we've run across a single line comment
+// in the given language.
+func (i *input) singleLineComment() bool {
+ return i.match(i.lang.SingleLineCommentStart())
+}
+
+// multiLineComment returns 'true' if we've run across a multiline comment in
+// the given language.
+func (i *input) multiLineComment() (bool, string, string) {
+ if s := i.lang.MultilineCommentStart(); i.match(s) {
+ return true, s, i.lang.MultilineCommentEnd()
+ }
+ return false, "", ""
+}
+
+// match returns 'true' if the next tokens in the stream match the given
+// string.
+func (i *input) match(s string) bool {
+ if s == "" {
+ return false
+ }
+ saved := s
+ var read []rune
+ for len(s) > 0 && !i.eof() {
+ r, size := utf8.DecodeRuneInString(s)
+ if c, ok := i.peekRune(); ok && c == r {
+ read = append(read, c)
+ } else {
+ // No match. Push the tokens we read back onto the stack.
+ for idx := len(read) - 1; idx >= 0; idx-- {
+ i.unreadRune(read[idx])
+ }
+ return false
+ }
+ s = s[size:]
+ i.readRune() // Eat token.
+ }
+ return string(read) == saved
+}
+
+// eof reports whether the input has reached the end of the file.
+func (i *input) eof() bool {
+ return len(i.s) <= i.offset
+}
+
+// peekRune returns the next rune in the input without consuming it.
+func (i *input) peekRune() (rune, bool) {
+ if i.eof() {
+ return rune(0), false
+ }
+ r, _ := utf8.DecodeRuneInString(i.s[i.offset:])
+ return r, true
+}
+
+// readRune consumes and returns the next rune in the input.
+func (i *input) readRune() rune {
+ r, size := utf8.DecodeRuneInString(i.s[i.offset:])
+ if r == '\n' {
+ i.pos.line++
+ i.pos.lineRune = append(i.pos.lineRune, 0)
+ } else {
+ i.pos.lineRune[len(i.pos.lineRune)-1]++
+ }
+ i.offset += size
+ return r
+}
+
+// unreadRune winds the lexer's state back to before the rune was read.
+func (i *input) unreadRune(c rune) {
+ p := make([]byte, utf8.UTFMax)
+ size := utf8.EncodeRune(p, c)
+ i.offset -= size
+ if c == '\n' {
+ i.pos.line--
+ if len(i.pos.lineRune) > 1 {
+ i.pos.lineRune = i.pos.lineRune[:len(i.pos.lineRune)-1]
+ } else {
+ i.pos.lineRune[len(i.pos.lineRune)-1] = 0
+ }
+ } else {
+ i.pos.lineRune[len(i.pos.lineRune)-1]--
+ }
+}
diff --git a/internal/commentparser/comment_parser_test.go b/internal/commentparser/comment_parser_test.go
new file mode 100644
index 0000000..86dd996
--- /dev/null
+++ b/internal/commentparser/comment_parser_test.go
@@ -0,0 +1,413 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package commentparser
+
+import (
+ "fmt"
+ "reflect"
+ "testing"
+
+ "github.com/google/licenseclassifier/internal/commentparser/language"
+)
+
+const (
+ singleLineText = "single line text"
+ multilineText = `first line of text
+second line of text
+third line of text
+`
+)
+
+func TestCommentParser_Lex(t *testing.T) {
+ tests := []struct {
+ description string
+ lang language.Language
+ source string
+ want Comments
+ }{
+ {
+ description: "BCPL Single Line Comments",
+ lang: language.Go,
+ source: fmt.Sprintf("//%s\n", singleLineText),
+ want: []*Comment{
+ {
+ StartLine: 1,
+ EndLine: 1,
+ Text: singleLineText,
+ },
+ },
+ },
+ {
+ description: "Go Comment With Multiline String",
+ lang: language.Go,
+ source: fmt.Sprintf("var a = `A\nmultiline\\x20\nstring`\n//%s\n", singleLineText),
+ want: []*Comment{
+ {
+ StartLine: 4,
+ EndLine: 4,
+ Text: singleLineText,
+ },
+ },
+ },
+ {
+ description: "Python Multiline String",
+ lang: language.Python,
+ source: fmt.Sprintf("#%s\n'''this is a multiline\nstring'''", singleLineText),
+ want: []*Comment{
+ {
+ StartLine: 1,
+ EndLine: 1,
+ Text: singleLineText,
+ },
+ },
+ },
+ {
+ description: "TR Command String",
+ lang: language.Python,
+ source: fmt.Sprintf(`#%s
+AUTH= \
+| tr '"\n' \
+| base64 -w
+`, singleLineText),
+ want: []*Comment{
+ {
+ StartLine: 1,
+ EndLine: 1,
+ Text: singleLineText,
+ },
+ },
+ },
+ {
+ description: "Lisp Single Line Comments",
+ lang: language.Clojure,
+ source: fmt.Sprintf(";%s\n", singleLineText),
+ want: []*Comment{
+ {
+ StartLine: 1,
+ EndLine: 1,
+ Text: singleLineText,
+ },
+ },
+ },
+ {
+ description: "Shell Single Line Comments",
+ lang: language.Shell,
+ source: fmt.Sprintf("#%s\n", singleLineText),
+ want: []*Comment{
+ {
+ StartLine: 1,
+ EndLine: 1,
+ Text: singleLineText,
+ },
+ },
+ },
+ {
+ description: "BCPL Multiline Comments",
+ lang: language.C,
+ source: fmt.Sprintf("/*%s*/\n", multilineText),
+ want: []*Comment{
+ {
+ StartLine: 1,
+ EndLine: 4,
+ Text: multilineText,
+ },
+ },
+ },
+ {
+ description: "BCPL Multiline Comments no terminating newline",
+ lang: language.C,
+ source: fmt.Sprintf("/*%s*/", multilineText),
+ want: []*Comment{
+ {
+ StartLine: 1,
+ EndLine: 4,
+ Text: multilineText,
+ },
+ },
+ },
+ {
+ description: "Nested Multiline Comments",
+ lang: language.Swift,
+ source: "/*a /*\n nested\n*/\n comment\n*/\n",
+ want: []*Comment{
+ {
+ StartLine: 1,
+ EndLine: 5,
+ Text: "a /*\n nested\n*/\n comment\n",
+ },
+ },
+ },
+ {
+ description: "Ruby Multiline Comments",
+ lang: language.Ruby,
+ source: fmt.Sprintf("=begin\n%s=end\n", multilineText),
+ want: []*Comment{
+ {
+ StartLine: 1,
+ EndLine: 5,
+ Text: "\n" + multilineText,
+ },
+ },
+ },
+ {
+ description: "Multiple Single Line Comments",
+ lang: language.Shell,
+ source: `# First line
+# Second line
+# Third line
+`,
+ want: []*Comment{
+ {
+ StartLine: 1,
+ EndLine: 1,
+ Text: " First line",
+ },
+ {
+ StartLine: 2,
+ EndLine: 2,
+ Text: " Second line",
+ },
+ {
+ StartLine: 3,
+ EndLine: 3,
+ Text: " Third line",
+ },
+ },
+ },
+ {
+ description: "Mixed Multiline / Single Line Comments",
+ lang: language.C,
+ source: `/*
+ * The first multiline line.
+ * The second multiline line.
+ */
+ // The first single line comment.
+ // The second single line comment.
+`,
+ want: []*Comment{
+ {
+ StartLine: 1,
+ EndLine: 4,
+ Text: `
+ * The first multiline line.
+ * The second multiline line.
+ `,
+ },
+ {
+ StartLine: 5,
+ EndLine: 5,
+ Text: " The first single line comment.",
+ },
+ {
+ StartLine: 6,
+ EndLine: 6,
+ Text: " The second single line comment.",
+ },
+ },
+ },
+ {
+ description: "Mixed Multiline / Single Line Comments",
+ lang: language.C,
+ source: `/*
+ * The first multiline line.
+ * The second multiline line.
+ */
+ // The first single line comment.
+ // The second single line comment.
+`,
+ want: []*Comment{
+ {
+ StartLine: 1,
+ EndLine: 4,
+ Text: `
+ * The first multiline line.
+ * The second multiline line.
+ `,
+ },
+ {
+ StartLine: 5,
+ EndLine: 5,
+ Text: " The first single line comment.",
+ },
+ {
+ StartLine: 6,
+ EndLine: 6,
+ Text: " The second single line comment.",
+ },
+ },
+ },
+ {
+ description: "HTML-like comments and quotes",
+ lang: language.HTML,
+ source: `# This is an important topic
+I don't want to go on all day here! <-- notice the quote in there!
+<!-- Well, maybe I do... -->
+`,
+ want: []*Comment{
+ {
+ StartLine: 3,
+ EndLine: 3,
+ Text: " Well, maybe I do... ",
+ },
+ },
+ },
+ {
+ description: "JavaScript regex",
+ lang: language.JavaScript,
+ source: `var re = /hello"world/;
+// the comment
+`,
+ want: []*Comment{
+ {
+ StartLine: 2,
+ EndLine: 2,
+ Text: " the comment",
+ },
+ },
+ },
+ {
+ description: "Perl regex",
+ lang: language.Perl,
+ source: `if (/hello"world/) {
+ # the comment
+ print "Yo!"
+}
+`,
+ want: []*Comment{
+ {
+ StartLine: 2,
+ EndLine: 2,
+ Text: " the comment",
+ },
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ got := Parse([]byte(tt.source), tt.lang)
+ if !reflect.DeepEqual(got, tt.want) {
+ t.Errorf("Mismatch(%q) = %+v, want %+v", tt.description, got, tt.want)
+ }
+ }
+}
+
+func TestCommentParser_ChunkIterator(t *testing.T) {
+ tests := []struct {
+ description string
+ comments Comments
+ want []Comments
+ }{
+ {
+ description: "Empty Comments",
+ comments: Comments{},
+ want: nil,
+ },
+ {
+ description: "Single Line Comment Chunk",
+ comments: Comments{
+ {StartLine: 1, EndLine: 1, Text: "Block 1 line 1"},
+ {StartLine: 2, EndLine: 2, Text: "Block 1 line 2"},
+ },
+ want: []Comments{
+ Comments{
+ {StartLine: 1, EndLine: 1, Text: "Block 1 line 1"},
+ {StartLine: 2, EndLine: 2, Text: "Block 1 line 2"},
+ },
+ },
+ },
+ {
+ description: "Multiline Comment Chunk",
+ comments: Comments{
+ {StartLine: 1, EndLine: 3, Text: "Multiline 1\n2\n3"},
+ },
+ want: []Comments{
+ Comments{{StartLine: 1, EndLine: 3, Text: "Multiline 1\n2\n3"}},
+ },
+ },
+ {
+ description: "Multiple Single Line Comment Chunks",
+ comments: Comments{
+ {StartLine: 1, EndLine: 1, Text: "Block 1 line 1"},
+ {StartLine: 2, EndLine: 2, Text: "Block 1 line 2"},
+ {StartLine: 5, EndLine: 5, Text: "Block 2 line 1"},
+ {StartLine: 6, EndLine: 6, Text: "Block 2 line 2"},
+ {StartLine: 10, EndLine: 10, Text: "Block 3 line 1"},
+ {StartLine: 11, EndLine: 11, Text: "Block 3 line 2"},
+ {StartLine: 13, EndLine: 13, Text: "Block 3 line 3"},
+ },
+ want: []Comments{
+ Comments{
+ {StartLine: 1, EndLine: 1, Text: "Block 1 line 1"},
+ {StartLine: 2, EndLine: 2, Text: "Block 1 line 2"},
+ },
+ Comments{
+ {StartLine: 5, EndLine: 5, Text: "Block 2 line 1"},
+ {StartLine: 6, EndLine: 6, Text: "Block 2 line 2"},
+ },
+ Comments{
+ {StartLine: 10, EndLine: 10, Text: "Block 3 line 1"},
+ {StartLine: 11, EndLine: 11, Text: "Block 3 line 2"},
+ {StartLine: 13, EndLine: 13, Text: "Block 3 line 3"},
+ },
+ },
+ },
+ {
+ description: "Multiline Comment Chunk",
+ comments: Comments{
+ {StartLine: 1, EndLine: 3, Text: "Multiline 1\n2\n3"},
+ {StartLine: 4, EndLine: 6, Text: "Multiline 1\n2\n3"},
+ },
+ want: []Comments{
+ Comments{{StartLine: 1, EndLine: 3, Text: "Multiline 1\n2\n3"}},
+ Comments{{StartLine: 4, EndLine: 6, Text: "Multiline 1\n2\n3"}},
+ },
+ },
+ {
+ description: "Multiline and Single Line Comment Chunks",
+ comments: Comments{
+ {StartLine: 1, EndLine: 3, Text: "Multiline 1\n2\n3"},
+ {StartLine: 4, EndLine: 4, Text: "Block 2 line 1"},
+ {StartLine: 5, EndLine: 5, Text: "Block 2 line 2"},
+ },
+ want: []Comments{
+ Comments{
+ {StartLine: 1, EndLine: 3, Text: "Multiline 1\n2\n3"},
+ },
+ Comments{
+ {StartLine: 4, EndLine: 4, Text: "Block 2 line 1"},
+ {StartLine: 5, EndLine: 5, Text: "Block 2 line 2"},
+ },
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ i := 0
+ for got := range tt.comments.ChunkIterator() {
+ if i >= len(tt.want) {
+ t.Errorf("Mismatch(%q) more comment chunks than expected = %v, want %v",
+ tt.description, i+1, len(tt.want))
+ break
+ }
+ if !reflect.DeepEqual(got, tt.want[i]) {
+ t.Errorf("Mismatch(%q) = %+v, want %+v", tt.description, got, tt.want[i])
+ }
+ i++
+ }
+ if i != len(tt.want) {
+ t.Errorf("Mismatch(%q) not enough comment chunks = %v, want %v",
+ tt.description, i, len(tt.want))
+ }
+ }
+}
diff --git a/internal/commentparser/language/language.go b/internal/commentparser/language/language.go
new file mode 100644
index 0000000..36e04eb
--- /dev/null
+++ b/internal/commentparser/language/language.go
@@ -0,0 +1,268 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package language contains methods and information about the different
+// programming languages the comment parser supports.
+package language
+
+import (
+ "path/filepath"
+ "strings"
+)
+
+// Language is the progamming language we're grabbing the comments from.
+type Language int
+
+// Languages we can retrieve comments from.
+const (
+ Unknown Language = iota
+ AppleScript
+ Assembly
+ Batch
+ C
+ Clif
+ Clojure
+ CMake
+ CSharp
+ Dart
+ Fortran
+ GLSLF // OpenGL Shading Language
+ Go
+ Haskell
+ HTML
+ Java
+ JavaScript
+ Flex
+ Lisp
+ NinjaBuild
+ ObjectiveC
+ Perl
+ Python
+ R
+ Ruby
+ Rust
+ Shader
+ Shell
+ SQL
+ Swift
+ SWIG
+ TypeScript
+ Yacc
+ Yaml
+)
+
+// style is the comment styles that a language uses.
+type style int
+
+// Comment styles.
+const (
+ unknown style = iota
+ applescript // -- ... and (* ... *)
+ batch // @REM
+ bcpl // // ... and /* ... */
+ cmake // # ... and #[[ ... ]]
+ fortran // ! ...
+ haskell // -- ... and {- ... -}
+ html // <!-- ... -->
+ lisp // ;;
+ ruby // # ... and =begin ... =end
+ shell // # ...
+)
+
+// ClassifyLanguage determines what language the source code was written in. It
+// does this by looking at the file's extension.
+func ClassifyLanguage(filename string) Language {
+ ext := strings.ToLower(filepath.Ext(filename))
+ if len(ext) == 0 || ext[0] != '.' {
+ return Unknown
+ }
+
+ switch ext[1:] { // Skip the '.'.
+ case "applescript":
+ return AppleScript
+ case "bat":
+ return Batch
+ case "c", "cc", "cpp", "c++", "h", "hh", "hpp":
+ return C
+ case "clif":
+ return Clif
+ case "cmake":
+ return CMake
+ case "cs":
+ return CSharp
+ case "dart":
+ return Dart
+ case "f", "f90", "f95":
+ return Fortran
+ case "glslf":
+ return GLSLF
+ case "go":
+ return Go
+ case "hs":
+ return Haskell
+ case "html", "htm", "md", "ng", "sgml":
+ return HTML
+ case "java":
+ return Java
+ case "js":
+ return JavaScript
+ case "l":
+ return Flex
+ case "lisp", "el", "clj":
+ return Lisp
+ case "m", "mm":
+ return ObjectiveC
+ case "gn":
+ return NinjaBuild
+ case "pl", "pm":
+ return Perl
+ case "py", "pi":
+ return Python
+ case "r":
+ return R
+ case "rb":
+ return Ruby
+ case "rs":
+ return Rust
+ case "s":
+ return Assembly
+ case "sh":
+ return Shell
+ case "shader":
+ return Shader
+ case "sql":
+ return SQL
+ case "swift":
+ return Swift
+ case "swig":
+ return SWIG
+ case "ts", "tsx":
+ return TypeScript
+ case "y":
+ return Yacc
+ case "yaml":
+ return Yaml
+ }
+ return Unknown
+}
+
+// commentStyle returns the language's comment style.
+func (lang Language) commentStyle() style {
+ switch lang {
+ case Assembly, C, CSharp, Dart, GLSLF, Go, Java, JavaScript, Flex, ObjectiveC, Rust, Shader, Swift, SWIG, TypeScript, Yacc:
+ return bcpl
+ case Batch:
+ return batch
+ case CMake:
+ return cmake
+ case Fortran:
+ return fortran
+ case Haskell, SQL:
+ return haskell
+ case HTML:
+ return html
+ case Clojure, Lisp:
+ return lisp
+ case Ruby:
+ return ruby
+ case Clif, NinjaBuild, Perl, Python, R, Shell, Yaml:
+ return shell
+ }
+ return unknown
+}
+
+// SingleLineCommentStart returns the starting string of a single line comment
+// for the given language. There is no equivalent "End" method, because it's
+// the end of line.
+func (lang Language) SingleLineCommentStart() string {
+ switch lang.commentStyle() {
+ case applescript, haskell:
+ return "--"
+ case batch:
+ return "@REM"
+ case bcpl:
+ return "//"
+ case fortran:
+ return "!"
+ case lisp:
+ return ";"
+ case shell, ruby, cmake:
+ return "#"
+ }
+ return ""
+}
+
+// MultilineCommentStart returns the starting string of a multiline comment for
+// the given language.
+func (lang Language) MultilineCommentStart() string {
+ switch lang.commentStyle() {
+ case applescript:
+ return "(*"
+ case bcpl:
+ if lang != Rust {
+ return "/*"
+ }
+ case cmake:
+ return "#[["
+ case haskell:
+ return "{-"
+ case html:
+ return "<!--"
+ case ruby:
+ return "=begin"
+ }
+ return ""
+}
+
+// MultilineCommentEnd returns the ending string of a multiline comment for the
+// given langauge.
+func (lang Language) MultilineCommentEnd() string {
+ switch lang.commentStyle() {
+ case applescript:
+ return "*)"
+ case bcpl:
+ if lang != Rust {
+ return "*/"
+ }
+ case cmake:
+ return "]]"
+ case haskell:
+ return "-}"
+ case html:
+ return "-->"
+ case ruby:
+ return "=end"
+ }
+ return ""
+}
+
+// QuoteCharacter returns 'true' if the character is considered the beginning
+// of a string in the given language. The second return value is true if the
+// string allows for escaping.
+func (lang Language) QuoteCharacter(quote rune) (ok bool, escape bool) {
+ switch quote {
+ case '"', '\'':
+ return true, true
+ case '`':
+ if lang == Go {
+ return true, false
+ }
+ }
+ return false, false
+}
+
+// NestedComments returns true if the language allows for nested multiline comments.
+func (lang Language) NestedComments() bool {
+ return lang == Swift
+}
diff --git a/internal/sets/sets.go b/internal/sets/sets.go
new file mode 100644
index 0000000..f34ae5b
--- /dev/null
+++ b/internal/sets/sets.go
@@ -0,0 +1,20 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sets provides sets for storing collections of unique elements.
+package sets
+
+// present is an empty struct used as the "value" in the map[int], since
+// empty structs consume zero bytes (unlike 1 unnecessary byte per bool).
+type present struct{}
diff --git a/internal/sets/sets_benchmark_test.go b/internal/sets/sets_benchmark_test.go
new file mode 100644
index 0000000..017dca6
--- /dev/null
+++ b/internal/sets/sets_benchmark_test.go
@@ -0,0 +1,213 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package sets
+
+import (
+ "strings"
+ "testing"
+)
+
+const (
+ postmodernThesisCollapse = `1. Expressions of collapse
+
+If one examines postcultural Marxism, one is faced with a choice: either
+reject capitalist submodern theory or conclude that the purpose of the reader
+is significant form. Bataille uses the term ‘capitalist construction’ to denote
+not, in fact, discourse, but prediscourse.
+
+Therefore, in Stardust, Gaiman analyses postcultural Marxism; in
+The Books of Magic, although, he denies capitalist submodern theory. If
+capitalist construction holds, we have to choose between capitalist submodern
+theory and Baudrillardist simulacra.
+
+However, conceptualist socialism implies that narrativity may be used to
+oppress the proletariat, given that sexuality is distinct from art. The subject
+is interpolated into a capitalist construction that includes language as a
+paradox.
+`
+ postmodernThesisNarratives = `1. Narratives of failure
+
+The main theme of the works of Joyce is the defining characteristic, and some
+would say the economy, of neocultural class. But Bataille promotes the use of
+socialist realism to deconstruct sexual identity.
+
+The subject is interpolated into a Baudrillardist simulation that includes
+consciousness as a whole. Thus, the primary theme of Pickett's[1] model of
+socialist realism is the role of the reader as artist.
+
+The subject is contextualised into a postcapitalist discourse that includes
+language as a paradox. It could be said that if Baudrillardist simulation
+holds, the works of Gibson are postmodern. The characteristic theme of the
+works of Gibson is the common ground between society and narrativity. However,
+Sartre uses the term 'postcapitalist discourse' to denote not, in fact,
+narrative, but postnarrative.
+`
+)
+
+var (
+ // Word lists:
+ stringsA = strings.Fields(postmodernThesisCollapse)
+ stringsB = strings.Fields(postmodernThesisNarratives)
+)
+
+func BenchmarkStringSets_NewStringSet(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ NewStringSet(stringsA...)
+ }
+}
+
+func BenchmarkStringSets_Copy(b *testing.B) {
+ s := NewStringSet(stringsA...)
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ s.Copy()
+ }
+}
+
+func BenchmarkStringSets_Insert(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ s := NewStringSet()
+ s.Insert(stringsA...)
+ s.Insert(stringsB...)
+ }
+}
+
+func BenchmarkStringSets_Delete(b *testing.B) {
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ s := NewStringSet(stringsA...)
+ s.Delete(stringsB...)
+ }
+}
+
+func BenchmarkStringSets_Intersect(b *testing.B) {
+ s := NewStringSet(stringsA...)
+ t := NewStringSet(stringsB...)
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ s.Intersect(t)
+ }
+}
+
+func BenchmarkStringSets_Disjoint(b *testing.B) {
+ s := NewStringSet(stringsA...)
+ t := NewStringSet(stringsB...)
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ s.Disjoint(t)
+ }
+}
+
+func BenchmarkStringSets_Difference(b *testing.B) {
+ s := NewStringSet(stringsA...)
+ t := NewStringSet(stringsB...)
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ s.Difference(t)
+ }
+}
+
+func BenchmarkStringSets_Unique(b *testing.B) {
+ s := NewStringSet(stringsA...)
+ t := NewStringSet(stringsB...)
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ s.Unique(t)
+ }
+}
+
+func BenchmarkStringSets_Equal(b *testing.B) {
+ s := NewStringSet(stringsA...)
+ t := NewStringSet(stringsB...)
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ s.Equal(t)
+ }
+}
+
+func BenchmarkStringSets_Union(b *testing.B) {
+ s := NewStringSet(stringsA...)
+ t := NewStringSet(stringsB...)
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ s.Union(t)
+ }
+}
+
+func BenchmarkStringSets_Contains(b *testing.B) {
+ s := NewStringSet(stringsA...)
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ for _, w := range stringsB {
+ s.Contains(w)
+ }
+ }
+}
+
+func BenchmarkStringSets_Len(b *testing.B) {
+ s := NewStringSet(stringsA...)
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ s.Len()
+ }
+}
+
+func BenchmarkStringSets_Empty(b *testing.B) {
+ s := NewStringSet(stringsA...)
+ t := NewStringSet()
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ s.Empty()
+ t.Empty()
+ }
+}
+
+func BenchmarkStringSets_Elements(b *testing.B) {
+ s := NewStringSet(stringsA...)
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ s.Elements()
+ }
+}
+
+func BenchmarkStringSets_Sorted(b *testing.B) {
+ s := NewStringSet(stringsA...)
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ s.Sorted()
+ }
+}
+
+func BenchmarkStringSets_String(b *testing.B) {
+ s := NewStringSet(stringsA...)
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ s.String()
+ }
+}
diff --git a/internal/sets/stringset.go b/internal/sets/stringset.go
new file mode 100644
index 0000000..9f93450
--- /dev/null
+++ b/internal/sets/stringset.go
@@ -0,0 +1,228 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package sets
+
+import (
+ "fmt"
+ "sort"
+ "strings"
+)
+
+// StringSet stores a set of unique string elements.
+type StringSet struct {
+ set map[string]present
+}
+
+// NewStringSet creates a StringSet containing the supplied initial string elements.
+func NewStringSet(elements ...string) *StringSet {
+ s := &StringSet{}
+ s.set = make(map[string]present)
+ s.Insert(elements...)
+ return s
+}
+
+// Copy returns a newly allocated copy of the supplied StringSet.
+func (s *StringSet) Copy() *StringSet {
+ c := NewStringSet()
+ if s != nil {
+ for e := range s.set {
+ c.set[e] = present{}
+ }
+ }
+ return c
+}
+
+// Insert zero or more string elements into the StringSet.
+// As expected for a Set, elements already present in the StringSet are
+// simply ignored.
+func (s *StringSet) Insert(elements ...string) {
+ for _, e := range elements {
+ s.set[e] = present{}
+ }
+}
+
+// Delete zero or more string elements from the StringSet.
+// Any elements not present in the StringSet are simply ignored.
+func (s *StringSet) Delete(elements ...string) {
+ for _, e := range elements {
+ delete(s.set, e)
+ }
+}
+
+// Intersect returns a new StringSet containing the intersection of the
+// receiver and argument StringSets. Returns an empty set if the argument is nil.
+func (s *StringSet) Intersect(other *StringSet) *StringSet {
+ if other == nil {
+ return NewStringSet()
+ }
+
+ // Point a and b to the maps, setting a to the smaller of the two.
+ a, b := s.set, other.set
+ if len(b) < len(a) {
+ a, b = b, a
+ }
+
+ // Perform the intersection.
+ intersect := NewStringSet()
+ for e := range a {
+ if _, ok := b[e]; ok {
+ intersect.set[e] = present{}
+ }
+ }
+ return intersect
+}
+
+// Disjoint returns true if the intersection of the receiver and the argument
+// StringSets is the empty set. Returns true if the argument is nil or either
+// StringSet is the empty set.
+func (s *StringSet) Disjoint(other *StringSet) bool {
+ if other == nil || len(other.set) == 0 || len(s.set) == 0 {
+ return true
+ }
+
+ // Point a and b to the maps, setting a to the smaller of the two.
+ a, b := s.set, other.set
+ if len(b) < len(a) {
+ a, b = b, a
+ }
+
+ // Check for non-empty intersection.
+ for e := range a {
+ if _, ok := b[e]; ok {
+ return false // Early-exit because intersecting.
+ }
+ }
+ return true
+}
+
+// Difference returns a new StringSet containing the elements in the receiver
+// that are not present in the argument StringSet. Returns a copy of the
+// receiver if the argument is nil.
+func (s *StringSet) Difference(other *StringSet) *StringSet {
+ if other == nil {
+ return s.Copy()
+ }
+
+ // Insert only the elements in the receiver that are not present in the
+ // argument StringSet.
+ diff := NewStringSet()
+ for e := range s.set {
+ if _, ok := other.set[e]; !ok {
+ diff.set[e] = present{}
+ }
+ }
+ return diff
+}
+
+// Unique returns a new StringSet containing the elements in the receiver
+// that are not present in the argument StringSet *and* the elements in the
+// argument StringSet that are not in the receiver (which is the union of two
+// disjoint sets). Returns a copy of the
+// receiver if the argument is nil.
+func (s *StringSet) Unique(other *StringSet) *StringSet {
+ if other == nil {
+ return s.Copy()
+ }
+
+ sNotInOther := s.Difference(other)
+ otherNotInS := other.Difference(s)
+
+ // Duplicate Union implementation here to avoid extra Copy, since both
+ // sNotInOther and otherNotInS are already copies.
+ unique := sNotInOther
+ for e := range otherNotInS.set {
+ unique.set[e] = present{}
+ }
+ return unique
+}
+
+// Equal returns true if the receiver and the argument StringSet contain
+// exactly the same elements. Returns false if the argument is nil.
+func (s *StringSet) Equal(other *StringSet) bool {
+ if s == nil || other == nil {
+ return s == nil && other == nil
+ }
+
+ // Two sets of different length cannot have the exact same unique elements.
+ if len(s.set) != len(other.set) {
+ return false
+ }
+
+ // Only one loop is needed. If the two sets are known to be of equal
+ // length, then the two sets are equal only if exactly all of the elements
+ // in the first set are found in the second.
+ for e := range s.set {
+ if _, ok := other.set[e]; !ok {
+ return false
+ }
+ }
+
+ return true
+}
+
+// Union returns a new StringSet containing the union of the receiver and
+// argument StringSets. Returns a copy of the receiver if the argument is nil.
+func (s *StringSet) Union(other *StringSet) *StringSet {
+ union := s.Copy()
+ if other != nil {
+ for e := range other.set {
+ union.set[e] = present{}
+ }
+ }
+ return union
+}
+
+// Contains returns true if element is in the StringSet.
+func (s *StringSet) Contains(element string) bool {
+ _, in := s.set[element]
+ return in
+}
+
+// Len returns the number of unique elements in the StringSet.
+func (s *StringSet) Len() int {
+ return len(s.set)
+}
+
+// Empty returns true if the receiver is the empty set.
+func (s *StringSet) Empty() bool {
+ return len(s.set) == 0
+}
+
+// Elements returns a []string of the elements in the StringSet, in no
+// particular (or consistent) order.
+func (s *StringSet) Elements() []string {
+ elements := []string{} // Return at least an empty slice rather than nil.
+ for e := range s.set {
+ elements = append(elements, e)
+ }
+ return elements
+}
+
+// Sorted returns a sorted []string of the elements in the StringSet.
+func (s *StringSet) Sorted() []string {
+ elements := s.Elements()
+ sort.Strings(elements)
+ return elements
+}
+
+// String formats the StringSet elements as sorted strings, representing them
+// in "array initializer" syntax.
+func (s *StringSet) String() string {
+ elements := s.Sorted()
+ var quoted []string
+ for _, e := range elements {
+ quoted = append(quoted, fmt.Sprintf("%q", e))
+ }
+ return fmt.Sprintf("{%s}", strings.Join(quoted, ", "))
+}
diff --git a/internal/sets/stringset_test.go b/internal/sets/stringset_test.go
new file mode 100644
index 0000000..e37efaa
--- /dev/null
+++ b/internal/sets/stringset_test.go
@@ -0,0 +1,425 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package sets
+
+import (
+ "sort"
+ "testing"
+)
+
+func checkSameStringSet(t *testing.T, set *StringSet, unique []string) {
+ // Check that lengths are the same.
+ want := len(unique)
+ got := set.Len()
+
+ if got != want {
+ t.Errorf("NewStringSet(%v) want length %v, got %v", unique, want, got)
+ }
+
+ // Check that all strings are present in set.
+ for _, s := range unique {
+ want := true
+ got := set.Contains(s)
+
+ if got != want {
+ t.Errorf("Contains(%v) want %v, got %v", s, want, got)
+ }
+ }
+
+ // Check that all elements are present in strings.
+ sort.Strings(unique)
+
+ for i, got := range set.Sorted() {
+ want := unique[i]
+
+ if got != want {
+ t.Errorf("Sorted(%d) want %v, got %v", i, want, got)
+ }
+ }
+}
+
+func TestNewStringSet(t *testing.T) {
+ empty := NewStringSet()
+ want := 0
+ got := empty.Len()
+
+ if got != want {
+ t.Errorf("NewStringSet() want length %v, got %v", want, got)
+ }
+
+ unique := []string{"a", "b", "c"}
+ set := NewStringSet(unique...)
+ checkSameStringSet(t, set, unique)
+
+ // Append an already-present element.
+ nonUnique := append(unique, unique[0])
+ set = NewStringSet(nonUnique...)
+
+ // Non-unique unique should collapse to one.
+ want = len(unique)
+ got = set.Len()
+
+ if got != want {
+ t.Errorf("NewStringSet(%v) want length %v, got %v", nonUnique, want, got)
+ }
+}
+
+func TestStringSet_Copy(t *testing.T) {
+ // Check both copies represent the same set.
+ base := []string{"a", "b", "c"}
+ orig := NewStringSet(base...)
+ cpy := orig.Copy()
+ checkSameStringSet(t, orig, base)
+ checkSameStringSet(t, cpy, base)
+
+ // Check the two copies are independent.
+ more := []string{"d"}
+ orig.Insert(more...)
+ more = append(base, more...)
+ checkSameStringSet(t, orig, more)
+ checkSameStringSet(t, cpy, base)
+}
+
+func TestStringSet_Insert(t *testing.T) {
+ unique := []string{"a", "b", "c"}
+ set := NewStringSet(unique...)
+
+ // Insert existing element, which should basically be a no-op.
+ set.Insert(unique[0])
+ checkSameStringSet(t, set, unique)
+
+ // Actually insert new unique elements.
+ additional := []string{"d", "e"}
+ longer := append(unique, additional...)
+ set.Insert(additional...)
+ checkSameStringSet(t, set, longer)
+}
+
+func TestStringSet_Delete(t *testing.T) {
+ unique := []string{"a", "b", "c"}
+ set := NewStringSet(unique...)
+
+ // Delete non-existent element, which should basically be a no-op.
+ set.Delete("z")
+ checkSameStringSet(t, set, unique)
+
+ // Actually delete existing elements.
+ set.Delete(unique[1:]...)
+ checkSameStringSet(t, set, unique[:1])
+}
+
+func TestStringSet_Intersect(t *testing.T) {
+ input1 := []string{"a", "c", "d", "e", "f"}
+ input2 := []string{"b", "c", "e"}
+
+ // Check Intersect(nil) returns an empty set.
+ setA := NewStringSet(input1...)
+ got := setA.Intersect(nil)
+ checkSameStringSet(t, got, []string{})
+ // Check that the receiver is unchanged.
+ checkSameStringSet(t, setA, input1)
+
+ // Check Intersect returns the correct result.
+ setB := NewStringSet(input2...)
+ got = setA.Intersect(setB)
+ want := []string{"c", "e"}
+ checkSameStringSet(t, got, want)
+ // Also check the sources are unchanged.
+ checkSameStringSet(t, setA, input1)
+ checkSameStringSet(t, setB, input2)
+
+ // Reverse the inputs and verify Intersect produces the same results.
+ setA = NewStringSet(input2...)
+ setB = NewStringSet(input1...)
+ got = setA.Intersect(setB)
+ checkSameStringSet(t, got, want)
+ // Check the sources are again unchanged.
+ checkSameStringSet(t, setA, input2)
+ checkSameStringSet(t, setB, input1)
+}
+
+func TestStringSet_Disjoint(t *testing.T) {
+ input1 := []string{"a", "c", "d", "e", "f"}
+ input2 := []string{"b", "c", "e"}
+ input3 := []string{"x", "y", "z"}
+
+ // Check that sets are always disjoint with the empty set or nil
+ setA := NewStringSet(input1...)
+ emptySet := NewStringSet()
+
+ if disjoint := setA.Disjoint(nil); !disjoint {
+ t.Errorf("Disjoint(%s, %v) want %v, got %v", setA, nil, true, disjoint)
+ }
+
+ if disjoint := setA.Disjoint(emptySet); !disjoint {
+ t.Errorf("Disjoint(%s, %s) want %v, got %v", setA, emptySet, true, disjoint)
+ }
+
+ if disjoint := emptySet.Disjoint(setA); !disjoint {
+ t.Errorf("Disjoint(%s, %s) want %v, got %v", emptySet, setA, true, disjoint)
+ }
+
+ if disjoint := emptySet.Disjoint(emptySet); !disjoint {
+ t.Errorf("Disjoint(%s, %s) want %v, got %v", emptySet, emptySet, true, disjoint)
+ }
+
+ // Also check the sources are unchanged.
+ checkSameStringSet(t, setA, input1)
+ checkSameStringSet(t, emptySet, []string{})
+
+ // Check two non-empty, non-nil disjoint sets.
+ setC := NewStringSet(input3...)
+
+ if disjoint := setA.Disjoint(setC); !disjoint {
+ t.Errorf("Disjoint(%s, %s) want %v, got %v", setA, setC, true, disjoint)
+ }
+
+ // Also check the sources are unchanged.
+ checkSameStringSet(t, setA, input1)
+ checkSameStringSet(t, setC, input3)
+
+ // Check that two intersecting sets are not Disjoint.
+ setB := NewStringSet(input2...)
+
+ if disjoint := setA.Disjoint(setB); disjoint {
+ t.Errorf("Disjoint(%s, %s) want %v, got %v", setA, setB, false, disjoint)
+ }
+
+ // Also check the sources are unchanged.
+ checkSameStringSet(t, setA, input1)
+ checkSameStringSet(t, setB, input2)
+}
+
+func TestStringSet_Difference(t *testing.T) {
+ input1 := []string{"a", "c", "d", "e", "f"}
+ input2 := []string{"b", "c", "e"}
+ input3 := []string{"x", "y", "z"}
+
+ // Check Difference(nil) returns a copy of the receiver.
+ setA := NewStringSet(input1...)
+ got := setA.Difference(nil)
+ checkSameStringSet(t, got, input1)
+ // Check that the receiver is unchanged.
+ checkSameStringSet(t, setA, input1)
+
+ // Check A - A returns the empty set.
+ got = setA.Difference(setA)
+
+ if !got.Empty() {
+ t.Errorf("Difference(%s, %s).Empty() want %v, got %v",
+ setA, setA, true, false)
+ }
+
+ checkSameStringSet(t, got, []string{})
+ // Check that the receiver is unchanged.
+ checkSameStringSet(t, setA, input1)
+
+ // Check A - C simply returns elements in A if A and C are disjoint.
+ setC := NewStringSet(input3...)
+ got = setA.Difference(setC)
+ checkSameStringSet(t, got, input1)
+ // Also check the sources are unchanged.
+ checkSameStringSet(t, setA, input1)
+ checkSameStringSet(t, setC, input3)
+
+ // Check A - B returns elements in A not in B.
+ setB := NewStringSet(input2...)
+ got = setA.Difference(setB)
+ want := []string{"a", "d", "f"}
+ checkSameStringSet(t, got, want)
+
+ // Also check the sources are unchanged.
+ checkSameStringSet(t, setA, input1)
+ checkSameStringSet(t, setB, input2)
+
+ // Check B - A returns elements in B not in A.
+ got = setB.Difference(setA)
+ want = []string{"b"}
+ checkSameStringSet(t, got, want)
+ // Also check the sources are unchanged.
+ checkSameStringSet(t, setA, input1)
+ checkSameStringSet(t, setB, input2)
+}
+
+func TestStringSet_Unique(t *testing.T) {
+ input1 := []string{"a", "c", "d", "e", "f"}
+ input2 := []string{"b", "c", "e"}
+ input3 := []string{"x", "y", "z"}
+
+ // Check Unique(nil) returns a copy of the receiver.
+ setA := NewStringSet(input1...)
+ got := setA.Unique(nil)
+ checkSameStringSet(t, got, input1)
+ // Check that the receiver is unchanged.
+ checkSameStringSet(t, setA, input1)
+
+ // Check Unique returns only elements in A and B not in both A and B.
+ setB := NewStringSet(input2...)
+ got = setA.Unique(setB)
+ want := []string{"a", "b", "d", "f"}
+ checkSameStringSet(t, got, want)
+ // Also check the sources are unchanged.
+ checkSameStringSet(t, setA, input1)
+ checkSameStringSet(t, setB, input2)
+
+ // Check Unique of two disjoint sets is the Union of those sets.
+ setC := NewStringSet(input3...)
+ got = setA.Unique(setC)
+ union := setA.Union(setC)
+
+ if equal := union.Equal(got); !equal {
+ t.Errorf("Union of disjoint Equal(%s, %s) want %v, got %v",
+ union, got, true, equal)
+ }
+
+ // Check Unique is the Union of A - B and B - A.
+ aNotInB := setA.Difference(setB)
+ bNotInA := setB.Difference(setA)
+ union = aNotInB.Union(bNotInA)
+ want = []string{"a", "b", "d", "f"}
+ checkSameStringSet(t, union, want)
+ got = setA.Unique(setB)
+
+ if equal := union.Equal(got); !equal {
+ t.Errorf("Union of differences Equal(%s, %s) want %v, got %v",
+ union, got, true, equal)
+ }
+
+ // Also check the sources are unchanged.
+ checkSameStringSet(t, setA, input1)
+ checkSameStringSet(t, setB, input2)
+}
+
+func TestStringSet_Equal(t *testing.T) {
+ input1 := []string{"a", "c", "d", "e", "f"}
+ input2 := []string{"b", "c", "e"}
+ input3 := []string{"a", "c", "d", "e", "g"}
+
+ // Check Equal(nil) returns false.
+ setA := NewStringSet(input1...)
+
+ if equal := setA.Equal(nil); equal {
+ t.Errorf("Equal(%s, %v) want %v, got %v", setA, nil, false, true)
+ }
+
+ // Check that the receiver is unchanged.
+ checkSameStringSet(t, setA, input1)
+
+ // Check Equal returns true for a set and itself.
+ if equal := setA.Equal(setA); !equal {
+ t.Errorf("Equal(%s, %s) want %v, got %v", setA, setA, true, false)
+ }
+
+ // Check that the receiver is unchanged.
+ checkSameStringSet(t, setA, input1)
+
+ // Check Equal returns false for sets of non-equal length.
+ setB := NewStringSet(input2...)
+
+ if equal := setA.Equal(setB); equal {
+ t.Errorf("Equal(%s, %s) want %v, got %v", setA, setB, false, true)
+ }
+
+ // Also check the sources are unchanged.
+ checkSameStringSet(t, setA, input1)
+ checkSameStringSet(t, setB, input2)
+
+ // Check Equal returns false for equal-length sets with different elements.
+ setC := NewStringSet(input3...)
+
+ if equal := setA.Equal(setC); equal {
+ t.Errorf("Equal(%s, %s) want %v, got %v", setA, setC, false, true)
+ }
+
+ if equal := setC.Equal(setA); equal {
+ t.Errorf("Equal(%s, %s) want %v, got %v", setC, setA, false, true)
+ }
+
+ // Also check the sources are unchanged.
+ checkSameStringSet(t, setA, input1)
+ checkSameStringSet(t, setC, input3)
+
+ // Check Equal returns true for a set with itself.
+ if equal := setA.Equal(setA); !equal {
+ t.Errorf("Equal(%s, %s) want %v, got %v", setA, setA, true, false)
+ }
+
+ // Also check the source is unchanged.
+ checkSameStringSet(t, setA, input1)
+
+ // Check Equal returns true for two separate equal sets.
+ anotherA := NewStringSet(input1...)
+
+ if equal := setA.Equal(anotherA); !equal {
+ t.Errorf("Equal(%s, %s) want %v, got %v", setA, anotherA, true, false)
+ }
+
+ // Also check the sources are unchanged.
+ checkSameStringSet(t, setA, input1)
+ checkSameStringSet(t, anotherA, input1)
+
+ // Check for equality comparing to nil struct.
+ var nilSet *StringSet
+ if equal := nilSet.Equal(setA); equal {
+ t.Errorf("Equal(%s, %s) want %v, got %v", nilSet, setA, false, true)
+ }
+ if equal := setA.Equal(nilSet); equal {
+ t.Errorf("Equal(%s, %s) want %v, got %v", setA, nilSet, false, true)
+ }
+ if equal := nilSet.Equal(nilSet); !equal {
+ t.Errorf("Equal(%s, %s) want %v, got %v", nilSet, nilSet, true, false)
+ }
+
+ // Edge case: consider the empty set to be different than the nil set.
+ emptySet := NewStringSet()
+ if equal := nilSet.Equal(emptySet); equal {
+ t.Errorf("Equal(%s, %s) want %v, got %v", nilSet, emptySet, false, true)
+ }
+ if equal := emptySet.Equal(nilSet); equal {
+ t.Errorf("Equal(%s, %s) want %v, got %v", emptySet, nilSet, false, true)
+ }
+ if equal := emptySet.Equal(emptySet); !equal {
+ t.Errorf("Equal(%s, %s) want %v, got %v", emptySet, emptySet, true, false)
+ }
+}
+
+func TestStringSet_Union(t *testing.T) {
+ input1 := []string{"a", "c", "d", "e", "f"}
+ input2 := []string{"b", "c", "e"}
+
+ // Check Union(nil) returns a copy of the receiver.
+ setA := NewStringSet(input1...)
+ got := setA.Union(nil)
+ checkSameStringSet(t, got, input1)
+ // Check that the receiver is unchanged.
+ checkSameStringSet(t, setA, input1)
+
+ // Check Union returns the correct result.
+ setB := NewStringSet(input2...)
+ got = setA.Union(setB)
+ want := []string{"a", "b", "c", "d", "e", "f"}
+ checkSameStringSet(t, got, want)
+ // Also check the sources are unchanged.
+ checkSameStringSet(t, setA, input1)
+ checkSameStringSet(t, setB, input2)
+
+ // Reverse the inputs and verify Union produces the same results.
+ setA = NewStringSet(input2...)
+ setB = NewStringSet(input1...)
+ got = setA.Union(setB)
+ checkSameStringSet(t, got, want)
+ // Check the sources are again unchanged.
+ checkSameStringSet(t, setA, input2)
+ checkSameStringSet(t, setB, input1)
+}