author    Bill Neubauer <wcn@google.com>    2020-05-08 20:02:16 -0700
committer Bill Neubauer <bill.neubauer@gmail.com>    2020-11-13 09:54:34 -0800
commit    87fffe5b9b8da2e66aab0fc9855cd73afefe445b (patch)
tree      c42bb28542330ce36e9f0c8f2851a7366a5dd22b /v2/tokenizer.go
parent    a5a72a939de358998819d1893c7ed706eefc64ea (diff)
download  licenseclassifier-87fffe5b9b8da2e66aab0fc9855cd73afefe445b.tar.gz
Needed to make a change to number tokenization to resolve an issue
that cropped up in third_party_py_gevent_LICENSE, where a cross-reference identifier without spaces was getting tokenized poorly, causing version mismatch issues. Analyzed 350K instances of number tokens in the license corpus and determined that the only useful characters to retain when encountering a leading number are other numbers, periods, and dashes. All other characters can be discarded without affecting license matching. Doing so minimizes the risk of version-matching errors by not introducing spurious deltas.
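
To illustrate the new rule, here is a rough standalone sketch. The keepVersionChars helper below is made up for this example and is not part of the classifier; it only mirrors the digit/period/dash filter that this change applies to leading-digit tokens.

package main

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

// keepVersionChars mirrors the number-token handling introduced here:
// for a token that starts with a digit, keep only digits, '.' and '-'.
func keepVersionChars(in string) string {
	r, _ := utf8.DecodeRuneInString(in)
	if !unicode.IsDigit(r) {
		return in // letter-led tokens take the existing normalization path
	}
	var out strings.Builder
	for _, c := range in {
		if unicode.IsDigit(c) || c == '.' || c == '-' {
			out.WriteRune(c)
		}
	}
	return out.String()
}

func main() {
	// e.g. a spliced cross-reference like "2.1(a)" or a trailing comma in "1994,"
	for _, tok := range []string{"2.1(a)", "1994,", "1.0-rc1"} {
		fmt.Printf("%q -> %q\n", tok, keepVersionChars(tok))
	}
}

Under the old rule only commas were stripped, so a token like "2.1(a)" kept its parenthesized suffix and could produce a spurious delta against the reference license text; with this change it normalizes to "2.1".
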
Diffstat (limited to 'v2/tokenizer.go')
-rw-r--r--  v2/tokenizer.go  22
1 file changed, 12 insertions, 10 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 2c52f1a..c296f0f 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -33,19 +33,24 @@ var eol = "\n"
 func cleanupToken(in string) string {
 	r, _ := utf8.DecodeRuneInString(in)
+	var out strings.Builder
 	if !unicode.IsLetter(r) {
-		// TODO: consider this more thoroughly. Sample data?
-		if unicode.IsNumber(r) {
-			// For matching version numbers or large numbers, we don't
-			// want splitting commas or trailing commas to affect the results.
-			return strings.ReplaceAll(in, ",", "")
+		if unicode.IsDigit(r) {
+			// Based on analysis of the license corpus, the characters
+			// that are significant are numbers, periods, and dashes. Anything
+			// else can be safely discarded, and helps avoid matching failures
+			// due to inconsistent whitespacing and formatting.
+			for _, c := range in {
+				if unicode.IsDigit(c) || c == '.' || c == '-' {
+					out.WriteRune(c)
+				}
+			}
+			return out.String()
 		}
-		return in
 	}

 	// Remove internal hyphenization or URL constructs to better normalize
 	// strings for matching.
-	var out strings.Builder
 	for _, c := range in {
 		if c >= 'a' && c <= 'z' {
 			out.WriteRune(c)
 		}
@@ -250,9 +255,6 @@ func normalizeEquivalentWords(s string) string {
 func header(tok *token) bool {
 	in := tok.Text
-	if in == "" {
-		return false
-	}
 	p, e := in[:len(in)-1], in[len(in)-1]
 	switch e {
 	case '.', ':', ')':