author    Bill Neubauer <wcn@google.com>    2020-05-08 20:02:16 -0700
committer Bill Neubauer <bill.neubauer@gmail.com>    2020-11-13 09:54:34 -0800
commit    87fffe5b9b8da2e66aab0fc9855cd73afefe445b (patch)
tree      c42bb28542330ce36e9f0c8f2851a7366a5dd22b /v2/tokenizer.go
parent    a5a72a939de358998819d1893c7ed706eefc64ea (diff)
download  licenseclassifier-87fffe5b9b8da2e66aab0fc9855cd73afefe445b.tar.gz
Needed to make a change to number tokenization to resolve an issue
that cropped up in third_party_py_gevent_LICENSE, where a cross-reference identifier without spaces was getting tokenized poorly, causing version mismatch issues. Analyzed 350K instances of number tokens in the license corpus and determined that the only useful characters to retain when encountering a leading number are other numbers, periods, and dashes. All other characters can be discarded without affecting license matching. Doing so minimizes the risk of version-matching errors by not introducing spurious deltas.
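
To illustrate the new rule, here is a rough standalone sketch. The keepVersionChars helper below is made up for this example and is not part of the classifier; it only mirrors the digit/period/dash filter that this change applies to leading-digit tokens.

package main

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

// keepVersionChars mirrors the number-token handling introduced here:
// for a token that starts with a digit, keep only digits, '.' and '-'.
func keepVersionChars(in string) string {
	r, _ := utf8.DecodeRuneInString(in)
	if !unicode.IsDigit(r) {
		return in // letter-led tokens take the existing normalization path
	}
	var out strings.Builder
	for _, c := range in {
		if unicode.IsDigit(c) || c == '.' || c == '-' {
			out.WriteRune(c)
		}
	}
	return out.String()
}

func main() {
	// e.g. a spliced cross-reference like "2.1(a)" or a trailing comma in "1994,"
	for _, tok := range []string{"2.1(a)", "1994,", "1.0-rc1"} {
		fmt.Printf("%q -> %q\n", tok, keepVersionChars(tok))
	}
}

Under the old rule only commas were stripped, so a token like "2.1(a)" kept its parenthesized suffix and could produce a spurious delta against the reference license text; with this change it normalizes to "2.1".
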
Diffstat (limited to 'v2/tokenizer.go')
-rw-r--r--  v2/tokenizer.go  22
1 file changed, 12 insertions, 10 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 2c52f1a..c296f0f 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -33,19 +33,24 @@ var eol = "\n"
 func cleanupToken(in string) string {
 	r, _ := utf8.DecodeRuneInString(in)
+	var out strings.Builder
 	if !unicode.IsLetter(r) {
-		// TODO: consider this more thoroughly. Sample data?
-		if unicode.IsNumber(r) {
-			// For matching version numbers or large numbers, we don't
-			// want splitting commas or trailing commas to affect the results.
-			return strings.ReplaceAll(in, ",", "")
+		if unicode.IsDigit(r) {
+			// Based on analysis of the license corpus, the characters
+			// that are significant are numbers, periods, and dashes. Anything
+			// else can be safely discarded, and helps avoid matching failures
+			// due to inconsistent whitespacing and formatting.
+			for _, c := range in {
+				if unicode.IsDigit(c) || c == '.' || c == '-' {
+					out.WriteRune(c)
+				}
+			}
+			return out.String()
 		}
-		return in
 	}

 	// Remove internal hyphenization or URL constructs to better normalize
 	// strings for matching.
-	var out strings.Builder
 	for _, c := range in {
 		if c >= 'a' && c <= 'z' {
 			out.WriteRune(c)
 		}
@@ -250,9 +255,6 @@ func normalizeEquivalentWords(s string) string {
 func header(tok *token) bool {
 	in := tok.Text
-	if in == "" {
-		return false
-	}
 	p, e := in[:len(in)-1], in[len(in)-1]
 	switch e {
 	case '.', ':', ')':