author    Bill Neubauer <wcn@google.com>    2021-11-05 11:59:49 -0700
committer Bill Neubauer <wcn@google.com>    2022-03-16 15:34:44 -0700
commit    96b685b82f5466e36faa53de2f3cd35196376e60 (patch)
tree      6865558a386de0e71f6a84a2bfe4861ef90002c4
parent    a856fae32bc46162436e1bfbc3cbcb0f2b6a9ef3 (diff)
API implementation for the Normalize method.
This method is used to help applications render diffs of input files
against reference license docs. Normalize may need a few more tests based
on what we learn from building diffs against it. As currently implemented,
the contract is pretty simple, resulting in simple tests, but I anticipate
that may change.

PiperOrigin-RevId: 407875955
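
A rough usage sketch of the call pattern this API enables (not part of this
change; the threshold, corpus directory, and input file below are placeholder
assumptions, and the actual diff rendering is left to the application):

    package main

    import (
        "io/ioutil"
        "log"

        classifier "github.com/google/licenseclassifier/v2"
    )

    func main() {
        // Placeholder threshold and corpus directory.
        c := classifier.NewClassifier(0.8)
        if err := c.LoadLicenses("./licenses"); err != nil {
            log.Fatalf("loading license corpus: %v", err)
        }

        in, err := ioutil.ReadFile("COPYING") // placeholder input file
        if err != nil {
            log.Fatal(err)
        }

        // Positions reported by the classifier are relative to the
        // normalized text, so compute diffs against a reference license
        // over this value rather than over the raw input.
        norm := c.Normalize(in)
        log.Printf("normalized %d bytes to %d bytes", len(in), len(norm))
    }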
-rw-r--r--  v2/classifier.go       44
-rw-r--r--  v2/classifier_test.go  34
-rw-r--r--  v2/tokenizer.go        58
3 files changed, 109 insertions(+), 27 deletions(-)
diff --git a/v2/classifier.go b/v2/classifier.go
index e146968..0668254 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -15,6 +15,7 @@
package classifier
import (
+ "bytes"
"fmt"
"io"
"io/ioutil"
@@ -200,6 +201,49 @@ func NewClassifier(threshold float64) *Classifier {
return classifier
}
+// Normalize takes input content and applies the following transforms to aid in
+// identifying license content. The return value of this function is
+// line-separated text which is the basis for position values returned by the
+// classifier.
+//
+// 1. Breaks up long lines of text. This helps with detecting licenses like in
+// TODO(wcn):URL reference
+//
+// 2. Certain ignorable texts are removed to aid matching blocks of text.
+// Introductory lines such as "The MIT License" are removed. Copyright notices
+// are removed since the parties are variable and shouldn't impact matching.
+//
+// It is NOT necessary to call this function to simply identify licenses in a
+// file. It should only be called to aid presenting this information to the user
+// in context (for example, creating diffs of differences to canonical
+// licenses).
+//
+// It is an invariant of the classifier that calling Match(Normalize(in)) will
+// return the same results as Match(in).
+func (c *Classifier) Normalize(in []byte) []byte {
+ text := normalizeDoc(in, false)
+ doc := extractDoc(text)
+
+ var buf bytes.Buffer
+
+ switch len(doc.Tokens) {
+ case 0:
+ return nil
+ case 1:
+ buf.WriteString(doc.Tokens[0].Text)
+ return buf.Bytes()
+ }
+
+ buf.WriteString(doc.Tokens[0].Text)
+
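+ // Join the remaining tokens, separated by single spaces.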
+ for _, t := range doc.Tokens[1:] {
+ buf.WriteString(" ")
+ buf.WriteString(t.Text)
+ }
+ return buf.Bytes()
+}
+
// LoadLicenses adds the contents of the supplied directory to the corpus of the
// classifier.
func (c *Classifier) LoadLicenses(dir string) error {
diff --git a/v2/classifier_test.go b/v2/classifier_test.go
index e4cab30..cb613e5 100644
--- a/v2/classifier_test.go
+++ b/v2/classifier_test.go
@@ -309,3 +309,37 @@ func TestLicenseName(t *testing.T) {
})
}
}
+
+func TestNormalize(t *testing.T) {
+ tests := []struct {
+ input string
+ want string
+ }{
+ {
+ input: "Words With Extra Spaces are flattened out, preserving case",
+ want: "Words With Extra Spaces are flattened out preserving case",
+ },
+ {
+ input: "",
+ want: "",
+ },
+ {
+ input: " License ",
+ want: "License",
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.input, func(t *testing.T) {
+ c, err := classifier()
+ if err != nil {
+ t.Fatalf("couldn't instantiate standard Google classifier: %v", err)
+ }
+
+ got := c.Normalize([]byte(tt.input))
+ if diff := cmp.Diff(tt.want, string(got)); diff != "" {
+ t.Errorf("Unexpected result; diff %v", diff)
+ }
+ })
+ }
+}
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index d20c410..eaa0479 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -59,47 +59,56 @@ func cleanupToken(in string) string {
// Remove internal hyphenization or URL constructs to better normalize
// strings for matching.
for _, c := range in {
- if c >= 'a' && c <= 'z' {
+ if unicode.IsLetter(c) {
out.WriteRune(c)
}
}
return out.String()
}
-// tokenize produces a document from the input content.
-func tokenize(in []byte) *document {
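+// normalizeDoc applies the document-level normalization passes shared by
+// tokenize and Normalize. Word normalization (lowercasing and remapping of
+// interchangeable words) only runs when normWords is true, since Normalize
+// must preserve the case of the input.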
+func normalizeDoc(in []byte, normWords bool) string {
// Apply the global transforms described in SPDX
- norm := strings.ToLower(string(in))
+ norm := string(in)
norm = html.UnescapeString(norm)
norm = normalizePunctuation(norm)
- norm = normalizeEquivalentWords(norm)
norm = removeIgnorableTexts(norm)
+ if normWords {
+ norm = normalizeWords(norm)
+ }
+ return norm
+}
+
+// tokenize produces a document from the input content.
+func tokenize(in []byte) *document {
+ text := normalizeDoc(in, true)
+ return extractDoc(text)
+}
+
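+// extractDoc tokenizes previously normalized text into a document, tracking
+// the line number of each token.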
+func extractDoc(text string) *document {
var doc document
// Iterate on a line-by-line basis.
-
- line := norm
i := 0
pos := 0
for {
- // Scan the line for the first likely textual content. The scan ignores punctuation
+ // Scan the text for the first likely textual content. The scan ignores punctuation
// artifacts that include visual boxes for layout as well as comment characters in
// source files.
firstInLine := true
var wid int
var r rune
- if pos == len(line) {
+ if pos == len(text) {
break
}
next := func() {
- r, wid = utf8.DecodeRuneInString(line[pos:])
+ r, wid = utf8.DecodeRuneInString(text[pos:])
pos += wid
}
- for pos < len(line) {
+ for pos < len(text) {
start := pos
next()
@@ -115,7 +124,7 @@ func tokenize(in []byte) *document {
}
// We're at a word/number character.
- for pos < len(line) {
+ for pos < len(text) {
next()
if unicode.IsSpace(r) {
pos -= wid // Will skip this in outer loop
@@ -124,7 +133,7 @@ func tokenize(in []byte) *document {
}
if pos > start {
- if start >= 2 && line[start-2] == '.' && line[start-1] == ' ' {
+ if start >= 2 && text[start-2] == '.' && text[start-1] == ' ' {
// Insert a "soft EOL" that helps detect header-looking entries that
// follow this text. This resolves problems with licenses that are a
// very long line of text, motivated by
@@ -135,12 +144,12 @@ func tokenize(in []byte) *document {
}
tok := token{
- Text: line[start:pos],
+ Text: text[start:pos],
Line: i + 1,
}
if firstInLine {
// Store the prefix material, it is useful to discern some corner cases
- tok.Previous = line[0:start]
+ tok.Previous = text[0:start]
}
doc.Tokens = append(doc.Tokens, &tok)
firstInLine = false
@@ -276,8 +285,10 @@ var interchangeableWords = []struct {
{regexp.MustCompile("per cent"), "percent"},
}
-// normalizeEquivalentWords normalizes equivalent words that are interchangeable.
-func normalizeEquivalentWords(s string) string {
+// normalizeWords remaps equivalent words that are interchangeable and lowercases
+// the word to allow for exact matching.
+func normalizeWords(s string) string {
+ s = strings.ToLower(s)
for _, iw := range interchangeableWords {
s = iw.interchangeable.ReplaceAllString(s, iw.substitute)
}
@@ -324,16 +335,9 @@ var listMarker = func() map[string]bool {
// ignorableTexts is a list of lines at the start of the string we can remove
// to get a cleaner match.
var ignorableTexts = []*regexp.Regexp{
- regexp.MustCompile(`(?i)^(?:the )?mit license(?: \(mit\))?$`),
- regexp.MustCompile(`(?i)^(?:new )?bsd license$`),
- regexp.MustCompile(`(?i)^copyright and permission notice$`),
- regexp.MustCompile(`^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
- regexp.MustCompile(`^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
- regexp.MustCompile(`^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
- regexp.MustCompile(`^\d{4}-[a-z]{3}-\d{2}$`),
- regexp.MustCompile(`(?i)^(all|some) rights reserved\.?$`),
- regexp.MustCompile(`(?i)^@license$`),
- regexp.MustCompile(`^\s*$`),
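+ // These patterns now run before lowercasing, so they match case-insensitively.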
+ regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
+ regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
+ regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}
// removeIgnorableTexts removes common text, which is not important for