diff options
author | Bill Neubauer <wcn@google.com> | 2020-05-08 20:02:16 -0700 |
---|---|---|
committer | Bill Neubauer <bill.neubauer@gmail.com> | 2020-11-13 09:54:34 -0800 |
commit | 87fffe5b9b8da2e66aab0fc9855cd73afefe445b (patch) | |
tree | c42bb28542330ce36e9f0c8f2851a7366a5dd22b /v2/tokenizer.go | |
parent | a5a72a939de358998819d1893c7ed706eefc64ea (diff) | |
download | licenseclassifier-87fffe5b9b8da2e66aab0fc9855cd73afefe445b.tar.gz |
Needed to make a change to number tokenization to resolve an issue
that cropped up in third_party_py_gevent_LICENSE where an xreference identifier
without spaces was getting tokenized poorly, causing version mismatch issues.
Analyzed 350K number token instances in the license corpus and
determined the only useful characters to retain when encountering a leading
number are other numbers, periods, and dashes. All other characters can be
discarded without affecting license matching. Doing so minimizes the risk of
version-matching failures by not introducing spurious deltas.
Diffstat (limited to 'v2/tokenizer.go')
-rw-r--r-- | v2/tokenizer.go | 22 |
1 files changed, 12 insertions, 10 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go index 2c52f1a..c296f0f 100644 --- a/v2/tokenizer.go +++ b/v2/tokenizer.go @@ -33,19 +33,24 @@ var eol = "\n" func cleanupToken(in string) string { r, _ := utf8.DecodeRuneInString(in) + var out strings.Builder if !unicode.IsLetter(r) { - // TODO: consider this more thoroughly. Sample data? - if unicode.IsNumber(r) { - // For matching version numbers or large numbers, we don't - // want splitting commas or trailing commas to affect the results. - return strings.ReplaceAll(in, ",", "") + if unicode.IsDigit(r) { + // Based on analysis of the license corpus, the characters + // that are significant are numbers, periods, and dashes. Anything + // else can be safely discarded, and helps avoid matching failures + // due to inconsistent whitespacing and formatting. + for _, c := range in { + if unicode.IsDigit(c) || c == '.' || c == '-' { + out.WriteRune(c) + } + } + return out.String() } - return in } // Remove internal hyphenization or URL constructs to better normalize // strings for matching. - var out strings.Builder for _, c := range in { if c >= 'a' && c <= 'z' { out.WriteRune(c) @@ -250,9 +255,6 @@ func normalizeEquivalentWords(s string) string { func header(tok *token) bool { in := tok.Text - if in == "" { - return false - } p, e := in[:len(in)-1], in[len(in)-1] switch e { case '.', ':', ')': |