diff options
author | Bill Neubauer <wcn@google.com> | 2022-01-12 10:15:37 -0800 |
---|---|---|
committer | Bill Neubauer <wcn@google.com> | 2022-03-16 15:37:22 -0700 |
commit | b2b19b3c53333625136af32ef9ebee41f73dc1cd (patch) | |
tree | e4dcc1edf9f2b01cc1ca911a6d3e5a793bb928b4 /v2/tokenizer.go | |
parent | 4a94a4b2dc7ec0119164ed69649bc96bc2b1025d (diff) | |
download | licenseclassifier-b2b19b3c53333625136af32ef9ebee41f73dc1cd.tar.gz |
Fixes handling of newline characters so that Normalize preserves the newline
characters of the original input. Includes fixes for OOB array conditions
detected by fuzzing and the crash logs from the service.
The code doing the tokenization of the newlines had some minor bugs that
resulted in spurious newlines being introduced into the token stream. This
wasn't a problem before since they were only used inside the tokenizer to
detect header constructs and de-hyphenate words, and were always removed from
the token stream passed to calling functions. This meant that the token stream
Normalize was reassembling had no newlines so it just produced a big line of
text. Fixing Normalize to preserve the original newlines required fixing these
glitches.
Added more test cases to cover the different newline-related scenarios and new
scenario files based on fuzzer findings.
***
PiperOrigin-RevId: 421331357
Diffstat (limited to 'v2/tokenizer.go')
-rw-r--r-- | v2/tokenizer.go | 45 |
1 file changed, 30 insertions, 15 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go index eaa0479..885eab3 100644 --- a/v2/tokenizer.go +++ b/v2/tokenizer.go @@ -83,10 +83,10 @@ func normalizeDoc(in []byte, normWords bool) string { func tokenize(in []byte) *document { // tokenize produces a document from the input content. text := normalizeDoc(in, true) - return extractDoc(text) + return extractDoc(text, true) } -func extractDoc(text string) *document { +func extractDoc(text string, removeEol bool) *document { var doc document // Iterate on a line-by-line basis. i := 0 @@ -138,9 +138,13 @@ func extractDoc(text string) *document { // follow this text. This resolves problems with licenses that are a // very long line of text, motivated by // https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109 - doc.Tokens = append(doc.Tokens, &token{ - Text: eol, - Line: i + 1}) + // + // Don't do this if the previous token was already an EOL + if len(doc.Tokens) > 0 && doc.Tokens[len(doc.Tokens)-1].Text != eol { + doc.Tokens = append(doc.Tokens, &token{ + Text: eol, + Line: i + 1}) + } } tok := token{ @@ -155,17 +159,13 @@ func extractDoc(text string) *document { firstInLine = false } } - tok := token{ - Text: eol, - Line: i + 1, - } - doc.Tokens = append(doc.Tokens, &tok) } - doc.Tokens = cleanupTokens(doc.Tokens) + + doc.Tokens = cleanupTokens(doc.Tokens, removeEol) return &doc } -func cleanupTokens(in []*token) []*token { +func cleanupTokens(in []*token, removeEol bool) []*token { // This routine performs sanitization of tokens. If it is a header-looking // token (but not a version number) starting a line, it is removed. // Hyphenated words are reassembled. @@ -179,13 +179,22 @@ func cleanupTokens(in []*token) []*token { } if tok.Text == eol { firstInLine = true + if removeEol { + continue + } + // If we are reconstructing a hyphenated word, don't append the EOL + // now, do it when the word is reconstructed. 
+ if partialWord == "" { + out = append(out, &token{Text: eol, Line: tok.Line, Index: tokIdx}) + tokIdx++ + } continue } firstInLine = false t := cleanupToken(tok.Text) // If this is the last token in a line, and it looks like a hyphenated // word, store it for reassembly. - if strings.HasSuffix(tok.Text, "-") && in[i+1].Text == eol { + if strings.HasSuffix(tok.Text, "-") && i+1 < len(in) && in[i+1].Text == eol { partialWord = t } else if partialWord != "" { // Repair hyphenated words @@ -195,6 +204,12 @@ func cleanupTokens(in []*token) []*token { tp.Previous = "" out = append(out, tp) tokIdx++ + if !removeEol { + // Append the EOL now that the whole word is recovered + out = append(out, &token{Text: eol, Line: tp.Line, Index: tokIdx}) + tokIdx++ + } + partialWord = "" } else { tok.Text = t @@ -344,7 +359,7 @@ var ignorableTexts = []*regexp.Regexp{ // classification func removeIgnorableTexts(s string) string { var out []string - lines := strings.Split(strings.TrimRight(s, "\n"), "\n") + lines := strings.Split(s, "\n") for _, l := range lines { line := strings.TrimSpace(l) var match bool @@ -360,5 +375,5 @@ func removeIgnorableTexts(s string) string { out = append(out, "") } } - return strings.Join(out, "\n") + "\n" + return strings.Join(out, "\n") } |