author    Bill Neubauer <wcn@google.com>    2021-11-05 11:59:49 -0700
committer Bill Neubauer <wcn@google.com>    2022-03-16 15:34:44 -0700
commit    96b685b82f5466e36faa53de2f3cd35196376e60 (patch)
tree      6865558a386de0e71f6a84a2bfe4861ef90002c4
parent    a856fae32bc46162436e1bfbc3cbcb0f2b6a9ef3 (diff)
API implementation for the Normalize method.
This method is used to help applications render diffs of input files
against reference license docs. Normalize may need a few more tests based
on what we learn from building diffs against it. As currently implemented,
the contract is pretty simple, resulting in simple tests, but I anticipate
that may change.

PiperOrigin-RevId: 407875955
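
A rough usage sketch of the call pattern this API enables (not part of this
change; the threshold, corpus directory, and input file below are placeholder
assumptions, and the actual diff rendering is left to the application):

    package main

    import (
        "io/ioutil"
        "log"

        classifier "github.com/google/licenseclassifier/v2"
    )

    func main() {
        // Placeholder threshold and corpus directory.
        c := classifier.NewClassifier(0.8)
        if err := c.LoadLicenses("./licenses"); err != nil {
            log.Fatalf("loading license corpus: %v", err)
        }

        in, err := ioutil.ReadFile("COPYING") // placeholder input file
        if err != nil {
            log.Fatal(err)
        }

        // Positions reported by the classifier are relative to the
        // normalized text, so compute diffs against a reference license
        // over this value rather than over the raw input.
        norm := c.Normalize(in)
        log.Printf("normalized %d bytes to %d bytes", len(in), len(norm))
    }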
-rw-r--r--  v2/classifier.go       44
-rw-r--r--  v2/classifier_test.go  34
-rw-r--r--  v2/tokenizer.go        58
3 files changed, 109 insertions(+), 27 deletions(-)
diff --git a/v2/classifier.go b/v2/classifier.go
index e146968..0668254 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -15,6 +15,7 @@
package classifier
import (
+ "bytes"
"fmt"
"io"
"io/ioutil"
@@ -200,6 +201,49 @@ func NewClassifier(threshold float64) *Classifier {
return classifier
}
+// Normalize takes input content and applies the following transforms to aid in
+// identifying license content. The return value of this function is
+// line-separated text which is the basis for position values returned by the
+// classifier.
+//
+// 1. Breaks up long lines of text. This helps with detecting licenses like in
+// TODO(wcn):URL reference
+//
+// 2. Certain ignorable texts are removed to aid matching blocks of text.
+// Introductory lines such as "The MIT License" are removed. Copyright notices
+// are removed since the parties are variable and shouldn't impact matching.
+//
+// It is NOT necessary to call this function to simply identify licenses in a
+// file. It should only be called to aid presenting this information to the user
+// in context (for example, creating diffs of differences to canonical
+// licenses).
+//
+// It is an invariant of the classifier that calling Match(Normalize(in)) will
+// return the same results as Match(in).
+func (c *Classifier) Normalize(in []byte) []byte {
+ text := normalizeDoc(in, false)
+ doc := extractDoc(text)
+
+ var buf bytes.Buffer
+
+ switch len(doc.Tokens) {
+ case 0:
+ return nil
+ case 1:
+ buf.WriteString(doc.Tokens[0].Text)
+ return buf.Bytes()
+ }
+
+ buf.WriteString(doc.Tokens[0].Text)
+
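+ // Join the remaining tokens, separated by single spaces.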
+ for _, t := range doc.Tokens[1:] {
+ buf.WriteString(" ")
+ buf.WriteString(t.Text)
+ }
+ return buf.Bytes()
+}
+
// LoadLicenses adds the contents of the supplied directory to the corpus of the
// classifier.
func (c *Classifier) LoadLicenses(dir string) error {
diff --git a/v2/classifier_test.go b/v2/classifier_test.go
index e4cab30..cb613e5 100644
--- a/v2/classifier_test.go
+++ b/v2/classifier_test.go
@@ -309,3 +309,37 @@ func TestLicenseName(t *testing.T) {
})
}
}
+
+func TestNormalize(t *testing.T) {
+ tests := []struct {
+ input string
+ want string
+ }{
+ {
+ input: "Words With Extra Spaces are flattened out, preserving case",
+ want: "Words With Extra Spaces are flattened out preserving case",
+ },
+ {
+ input: "",
+ want: "",
+ },
+ {
+ input: " License ",
+ want: "License",
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.input, func(t *testing.T) {
+ c, err := classifier()
+ if err != nil {
+ t.Fatalf("couldn't instantiate standard Google classifier: %v", err)
+ }
+
+ got := c.Normalize([]byte(tt.input))
+ if diff := cmp.Diff(tt.want, string(got)); diff != "" {
+ t.Errorf("Unexpected result; diff %v", diff)
+ }
+ })
+ }
+}
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index d20c410..eaa0479 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -59,47 +59,56 @@ func cleanupToken(in string) string {
// Remove internal hyphenization or URL constructs to better normalize
// strings for matching.
for _, c := range in {
- if c >= 'a' && c <= 'z' {
+ if unicode.IsLetter(c) {
out.WriteRune(c)
}
}
return out.String()
}
-// tokenize produces a document from the input content.
-func tokenize(in []byte) *document {
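+// normalizeDoc applies the document-level normalization passes shared by
+// tokenize and Normalize. Word normalization (lowercasing and remapping of
+// interchangeable words) only runs when normWords is true, since Normalize
+// must preserve the case of the input.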
+func normalizeDoc(in []byte, normWords bool) string {
// Apply the global transforms described in SPDX
- norm := strings.ToLower(string(in))
+ norm := string(in)
norm = html.UnescapeString(norm)
norm = normalizePunctuation(norm)
- norm = normalizeEquivalentWords(norm)
norm = removeIgnorableTexts(norm)
+ if normWords {
+ norm = normalizeWords(norm)
+ }
+ return norm
+}
+
+// tokenize produces a document from the input content.
+func tokenize(in []byte) *document {
+ text := normalizeDoc(in, true)
+ return extractDoc(text)
+}
+
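+// extractDoc tokenizes previously normalized text into a document, tracking
+// the line number of each token.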
+func extractDoc(text string) *document {
var doc document
// Iterate on a line-by-line basis.
-
- line := norm
i := 0
pos := 0
for {
- // Scan the line for the first likely textual content. The scan ignores punctuation
+ // Scan the text for the first likely textual content. The scan ignores punctuation
// artifacts that include visual boxes for layout as well as comment characters in
// source files.
firstInLine := true
var wid int
var r rune
- if pos == len(line) {
+ if pos == len(text) {
break
}
next := func() {
- r, wid = utf8.DecodeRuneInString(line[pos:])
+ r, wid = utf8.DecodeRuneInString(text[pos:])
pos += wid
}
- for pos < len(line) {
+ for pos < len(text) {
start := pos
next()
@@ -115,7 +124,7 @@ func tokenize(in []byte) *document {
}
// We're at a word/number character.
- for pos < len(line) {
+ for pos < len(text) {
next()
if unicode.IsSpace(r) {
pos -= wid // Will skip this in outer loop
@@ -124,7 +133,7 @@ func tokenize(in []byte) *document {
}
if pos > start {
- if start >= 2 && line[start-2] == '.' && line[start-1] == ' ' {
+ if start >= 2 && text[start-2] == '.' && text[start-1] == ' ' {
// Insert a "soft EOL" that helps detect header-looking entries that
// follow this text. This resolves problems with licenses that are a
// very long line of text, motivated by
@@ -135,12 +144,12 @@ func tokenize(in []byte) *document {
}
tok := token{
- Text: line[start:pos],
+ Text: text[start:pos],
Line: i + 1,
}
if firstInLine {
// Store the prefix material, it is useful to discern some corner cases
- tok.Previous = line[0:start]
+ tok.Previous = text[0:start]
}
doc.Tokens = append(doc.Tokens, &tok)
firstInLine = false
@@ -276,8 +285,10 @@ var interchangeableWords = []struct {
{regexp.MustCompile("per cent"), "percent"},
}
-// normalizeEquivalentWords normalizes equivalent words that are interchangeable.
-func normalizeEquivalentWords(s string) string {
+// normalizeWords remaps equivalent words that are interchangeable and lowercases
+// the word to allow for exact matching.
+func normalizeWords(s string) string {
+ s = strings.ToLower(s)
for _, iw := range interchangeableWords {
s = iw.interchangeable.ReplaceAllString(s, iw.substitute)
}
@@ -324,16 +335,9 @@ var listMarker = func() map[string]bool {
// ignorableTexts is a list of lines at the start of the string we can remove
// to get a cleaner match.
var ignorableTexts = []*regexp.Regexp{
- regexp.MustCompile(`(?i)^(?:the )?mit license(?: \(mit\))?$`),
- regexp.MustCompile(`(?i)^(?:new )?bsd license$`),
- regexp.MustCompile(`(?i)^copyright and permission notice$`),
- regexp.MustCompile(`^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
- regexp.MustCompile(`^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
- regexp.MustCompile(`^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
- regexp.MustCompile(`^\d{4}-[a-z]{3}-\d{2}$`),
- regexp.MustCompile(`(?i)^(all|some) rights reserved\.?$`),
- regexp.MustCompile(`(?i)^@license$`),
- regexp.MustCompile(`^\s*$`),
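+ // These patterns now run before lowercasing, so they match case-insensitively.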
+ regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
+ regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
+ regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}
// removeIgnorableTexts removes common text, which is not important for