about summary refs log tree commit diff
path: root/v2/tokenizer.go
diff options
context:
space:
mode:
author Bill Neubauer <wcn@google.com> 2021-11-05 11:59:49 -0700
committer Bill Neubauer <wcn@google.com> 2022-03-16 15:34:44 -0700
commit 96b685b82f5466e36faa53de2f3cd35196376e60 (patch)
tree 6865558a386de0e71f6a84a2bfe4861ef90002c4 /v2/tokenizer.go
parent a856fae32bc46162436e1bfbc3cbcb0f2b6a9ef3 (diff)
download licenseclassifier-96b685b82f5466e36faa53de2f3cd35196376e60.tar.gz
API implementation for the Normalize method.
This method is used to help applications render diffs of input files against reference license docs. Normalize may need a few more tests based on what we learn from building diffs against it. As currently implemented, the contract is pretty simple, resulting in simple tests, but I anticipate that may change. PiperOrigin-RevId: 407875955
Diffstat (limited to 'v2/tokenizer.go')
-rw-r--r-- v2/tokenizer.go 58
1 files changed, 31 insertions, 27 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index d20c410..eaa0479 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -59,47 +59,56 @@ func cleanupToken(in string) string {
// Remove internal hyphenization or URL constructs to better normalize
// strings for matching.
for _, c := range in {
- if c >= 'a' && c <= 'z' {
+ if unicode.IsLetter(c) {
out.WriteRune(c)
}
}
return out.String()
}
-// tokenize produces a document from the input content.
-func tokenize(in []byte) *document {
+func normalizeDoc(in []byte, normWords bool) string {
// Apply the global transforms described in SPDX
- norm := strings.ToLower(string(in))
+ norm := string(in)
norm = html.UnescapeString(norm)
norm = normalizePunctuation(norm)
- norm = normalizeEquivalentWords(norm)
norm = removeIgnorableTexts(norm)
+ if normWords {
+ norm = normalizeWords(norm)
+ }
+ return norm
+}
+
+func tokenize(in []byte) *document {
+ // tokenize produces a document from the input content.
+ text := normalizeDoc(in, true)
+ return extractDoc(text)
+}
+
+func extractDoc(text string) *document {
var doc document
// Iterate on a line-by-line basis.
-
- line := norm
i := 0
pos := 0
for {
- // Scan the line for the first likely textual content. The scan ignores punctuation
+ // Scan the text for the first likely textual content. The scan ignores punctuation
// artifacts that include visual boxes for layout as well as comment characters in
// source files.
firstInLine := true
var wid int
var r rune
- if pos == len(line) {
+ if pos == len(text) {
break
}
next := func() {
- r, wid = utf8.DecodeRuneInString(line[pos:])
+ r, wid = utf8.DecodeRuneInString(text[pos:])
pos += wid
}
- for pos < len(line) {
+ for pos < len(text) {
start := pos
next()
@@ -115,7 +124,7 @@ func tokenize(in []byte) *document {
}
// We're at a word/number character.
- for pos < len(line) {
+ for pos < len(text) {
next()
if unicode.IsSpace(r) {
pos -= wid // Will skip this in outer loop
@@ -124,7 +133,7 @@ func tokenize(in []byte) *document {
}
if pos > start {
- if start >= 2 && line[start-2] == '.' && line[start-1] == ' ' {
+ if start >= 2 && text[start-2] == '.' && text[start-1] == ' ' {
// Insert a "soft EOL" that helps detect header-looking entries that
// follow this text. This resolves problems with licenses that are a
// very long line of text, motivated by
@@ -135,12 +144,12 @@ func tokenize(in []byte) *document {
}
tok := token{
- Text: line[start:pos],
+ Text: text[start:pos],
Line: i + 1,
}
if firstInLine {
// Store the prefix material, it is useful to discern some corner cases
- tok.Previous = line[0:start]
+ tok.Previous = text[0:start]
}
doc.Tokens = append(doc.Tokens, &tok)
firstInLine = false
@@ -276,8 +285,10 @@ var interchangeableWords = []struct {
{regexp.MustCompile("per cent"), "percent"},
}
-// normalizeEquivalentWords normalizes equivalent words that are interchangeable.
-func normalizeEquivalentWords(s string) string {
+// normalizeWords remaps equivalent words that are interchangeable and lowercases
+// the word to allow for exact matching.
+func normalizeWords(s string) string {
+ s = strings.ToLower(s)
for _, iw := range interchangeableWords {
s = iw.interchangeable.ReplaceAllString(s, iw.substitute)
}
@@ -324,16 +335,9 @@ var listMarker = func() map[string]bool {
// ignorableTexts is a list of lines at the start of the string we can remove
// to get a cleaner match.
var ignorableTexts = []*regexp.Regexp{
- regexp.MustCompile(`(?i)^(?:the )?mit license(?: \(mit\))?$`),
- regexp.MustCompile(`(?i)^(?:new )?bsd license$`),
- regexp.MustCompile(`(?i)^copyright and permission notice$`),
- regexp.MustCompile(`^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
- regexp.MustCompile(`^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
- regexp.MustCompile(`^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
- regexp.MustCompile(`^\d{4}-[a-z]{3}-\d{2}$`),
- regexp.MustCompile(`(?i)^(all|some) rights reserved\.?$`),
- regexp.MustCompile(`(?i)^@license$`),
- regexp.MustCompile(`^\s*$`),
+ regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
+ regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
+ regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}
// removeIgnorableTexts removes common text, which is not important for