API implementation for the Normalize method.

This method is used to help applications render diffs of input files against reference license docs. Normalize may need a few more tests based on what we learn from building diffs against it. As currently implemented, the contract is pretty simple, resulting in simple tests, but I anticipate that may change.

PiperOrigin-RevId: 407875955
author: Bill Neubauer <wcn@google.com> 2021-11-05 11:59:49 -0700
committer: Bill Neubauer <wcn@google.com> 2022-03-16 15:34:44 -0700
commit: 96b685b82f5466e36faa53de2f3cd35196376e60 (patch)
tree: 6865558a386de0e71f6a84a2bfe4861ef90002c4 /v2/tokenizer.go
parent: a856fae32bc46162436e1bfbc3cbcb0f2b6a9ef3 (diff)
download: licenseclassifier-96b685b82f5466e36faa53de2f3cd35196376e60.tar.gz
1 files changed, 31 insertions, 27 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index d20c410..eaa0479 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -59,47 +59,56 @@ func cleanupToken(in string) string {
 	// Remove internal hyphenization or URL constructs to better normalize
 	// strings for matching.
 	for _, c := range in {
-		if c >= 'a' && c <= 'z' {
+		if unicode.IsLetter(c) {
 			out.WriteRune(c)
 		}
 	}
 	return out.String()
 }
 
-// tokenize produces a document from the input content.
-func tokenize(in []byte) *document {
+func normalizeDoc(in []byte, normWords bool) string {
 	// Apply the global transforms described in SPDX
 
-	norm := strings.ToLower(string(in))
+	norm := string(in)
 	norm = html.UnescapeString(norm)
 	norm = normalizePunctuation(norm)
-	norm = normalizeEquivalentWords(norm)
 	norm = removeIgnorableTexts(norm)
 
+	if normWords {
+		norm = normalizeWords(norm)
+	}
+	return norm
+}
+
+func tokenize(in []byte) *document {
+	// tokenize produces a document from the input content.
+	text := normalizeDoc(in, true)
+	return extractDoc(text)
+}
+
+func extractDoc(text string) *document {
 	var doc document
 	// Iterate on a line-by-line basis.
-
-	line := norm
 	i := 0
 	pos := 0
 	for {
-		// Scan the line for the first likely textual content. The scan ignores punctuation
+		// Scan the text for the first likely textual content. The scan ignores punctuation
 		// artifacts that include visual boxes for layout as well as comment characters in
 		// source files.
 		firstInLine := true
 		var wid int
 		var r rune
 
-		if pos == len(line) {
+		if pos == len(text) {
 			break
 		}
 
 		next := func() {
-			r, wid = utf8.DecodeRuneInString(line[pos:])
+			r, wid = utf8.DecodeRuneInString(text[pos:])
 			pos += wid
 		}
 
-		for pos < len(line) {
+		for pos < len(text) {
 			start := pos
 			next()
 
@@ -115,7 +124,7 @@ func tokenize(in []byte) *document {
 			}
 
 			// We're at a word/number character.
-			for pos < len(line) {
+			for pos < len(text) {
 				next()
 				if unicode.IsSpace(r) {
 					pos -= wid // Will skip this in outer loop
@@ -124,7 +133,7 @@ func tokenize(in []byte) *document {
 			}
 
 			if pos > start {
-				if start >= 2 && line[start-2] == '.' && line[start-1] == ' ' {
+				if start >= 2 && text[start-2] == '.' && text[start-1] == ' ' {
 					// Insert a "soft EOL" that helps detect header-looking entries that
 					// follow this text. This resolves problems with licenses that are a
 					// very long line of text, motivated by
@@ -135,12 +144,12 @@ func tokenize(in []byte) *document {
 				}
 
 				tok := token{
-					Text: line[start:pos],
+					Text: text[start:pos],
 					Line: i + 1,
 				}
 				if firstInLine {
 					// Store the prefix material, it is useful to discern some corner cases
-					tok.Previous = line[0:start]
+					tok.Previous = text[0:start]
 				}
 				doc.Tokens = append(doc.Tokens, &tok)
 				firstInLine = false
@@ -276,8 +285,10 @@ var interchangeableWords = []struct {
 	{regexp.MustCompile("per cent"), "percent"},
 }
 
-// normalizeEquivalentWords normalizes equivalent words that are interchangeable.
-func normalizeEquivalentWords(s string) string {
+// normalizeWords remaps equivalent words that are interchangeable and lowercases
+// the word to allow for exact matching.
+func normalizeWords(s string) string {
+	s = strings.ToLower(s)
 	for _, iw := range interchangeableWords {
 		s = iw.interchangeable.ReplaceAllString(s, iw.substitute)
 	}
@@ -324,16 +335,9 @@ var listMarker = func() map[string]bool {
 // ignorableTexts is a list of lines at the start of the string we can remove
 // to get a cleaner match.
 var ignorableTexts = []*regexp.Regexp{
-	regexp.MustCompile(`(?i)^(?:the )?mit license(?: \(mit\))?$`),
-	regexp.MustCompile(`(?i)^(?:new )?bsd license$`),
-	regexp.MustCompile(`(?i)^copyright and permission notice$`),
-	regexp.MustCompile(`^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
-	regexp.MustCompile(`^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
-	regexp.MustCompile(`^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
-	regexp.MustCompile(`^\d{4}-[a-z]{3}-\d{2}$`),
-	regexp.MustCompile(`(?i)^(all|some) rights reserved\.?$`),
-	regexp.MustCompile(`(?i)^@license$`),
-	regexp.MustCompile(`^\s*$`),
+	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
+	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
+	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
 }
 
 // removeIgnorableTexts removes common text, which is not important for
author	Bill Neubauer <wcn@google.com>	2021-11-05 11:59:49 -0700
committer	Bill Neubauer <wcn@google.com>	2022-03-16 15:34:44 -0700
commit	96b685b82f5466e36faa53de2f3cd35196376e60 (patch)
tree	6865558a386de0e71f6a84a2bfe4861ef90002c4 /v2/tokenizer.go
parent	a856fae32bc46162436e1bfbc3cbcb0f2b6a9ef3 (diff)
download	licenseclassifier-96b685b82f5466e36faa53de2f3cd35196376e60.tar.gz