aboutsummaryrefslogtreecommitdiff
path: root/v2/tokenizer.go
diff options
context:
space:
mode:
author Bill Neubauer <wcn@google.com> 2022-01-12 10:15:37 -0800
committer Bill Neubauer <wcn@google.com> 2022-03-16 15:37:22 -0700
commit b2b19b3c53333625136af32ef9ebee41f73dc1cd (patch)
tree e4dcc1edf9f2b01cc1ca911a6d3e5a793bb928b4 /v2/tokenizer.go
parent 4a94a4b2dc7ec0119164ed69649bc96bc2b1025d (diff)
download licenseclassifier-b2b19b3c53333625136af32ef9ebee41f73dc1cd.tar.gz
Fixes handling of newline characters so that Normalize preserves the newline
characters of the original input. Includes fixes for out-of-bounds (OOB) array
accesses detected by fuzzing and by the crash logs from the service.

The code doing the tokenization of the newlines had some minor bugs that
resulted in spurious newlines being introduced into the token stream. This
wasn't a problem before, since the newline tokens were only used inside the
tokenizer to detect header constructs and de-hyphenate words, and were always
removed from the token stream passed to calling functions. This meant that the
token stream Normalize was reassembling had no newlines, so it just produced
one big line of text. Fixing Normalize to preserve the original newlines
required fixing these glitches.

Added more test cases to cover the different newline-related scenarios, and
new scenario files based on fuzzer findings.

***

PiperOrigin-RevId: 421331357
Diffstat (limited to 'v2/tokenizer.go')
-rw-r--r-- v2/tokenizer.go 45
1 files changed, 30 insertions, 15 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index eaa0479..885eab3 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -83,10 +83,10 @@ func normalizeDoc(in []byte, normWords bool) string {
func tokenize(in []byte) *document {
// tokenize produces a document from the input content.
text := normalizeDoc(in, true)
- return extractDoc(text)
+ return extractDoc(text, true)
}
-func extractDoc(text string) *document {
+func extractDoc(text string, removeEol bool) *document {
var doc document
// Iterate on a line-by-line basis.
i := 0
@@ -138,9 +138,13 @@ func extractDoc(text string) *document {
// follow this text. This resolves problems with licenses that are a
// very long line of text, motivated by
// https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109
- doc.Tokens = append(doc.Tokens, &token{
- Text: eol,
- Line: i + 1})
+ //
+ // Don't do this if the previous token was already an EOL
+ if len(doc.Tokens) > 0 && doc.Tokens[len(doc.Tokens)-1].Text != eol {
+ doc.Tokens = append(doc.Tokens, &token{
+ Text: eol,
+ Line: i + 1})
+ }
}
tok := token{
@@ -155,17 +159,13 @@ func extractDoc(text string) *document {
firstInLine = false
}
}
- tok := token{
- Text: eol,
- Line: i + 1,
- }
- doc.Tokens = append(doc.Tokens, &tok)
}
- doc.Tokens = cleanupTokens(doc.Tokens)
+
+ doc.Tokens = cleanupTokens(doc.Tokens, removeEol)
return &doc
}
-func cleanupTokens(in []*token) []*token {
+func cleanupTokens(in []*token, removeEol bool) []*token {
// This routine performs sanitization of tokens. If it is a header-looking
// token (but not a version number) starting a line, it is removed.
// Hyphenated words are reassembled.
@@ -179,13 +179,22 @@ func cleanupTokens(in []*token) []*token {
}
if tok.Text == eol {
firstInLine = true
+ if removeEol {
+ continue
+ }
+ // If we are reconstructing a hyphenated word, don't append the EOL
+ // now, do it when the word is reconstructed.
+ if partialWord == "" {
+ out = append(out, &token{Text: eol, Line: tok.Line, Index: tokIdx})
+ tokIdx++
+ }
continue
}
firstInLine = false
t := cleanupToken(tok.Text)
// If this is the last token in a line, and it looks like a hyphenated
// word, store it for reassembly.
- if strings.HasSuffix(tok.Text, "-") && in[i+1].Text == eol {
+ if strings.HasSuffix(tok.Text, "-") && i+1 < len(in) && in[i+1].Text == eol {
partialWord = t
} else if partialWord != "" {
// Repair hyphenated words
@@ -195,6 +204,12 @@ func cleanupTokens(in []*token) []*token {
tp.Previous = ""
out = append(out, tp)
tokIdx++
+ if !removeEol {
+ // Append the EOL now that the whole word is recovered
+ out = append(out, &token{Text: eol, Line: tp.Line, Index: tokIdx})
+ tokIdx++
+ }
+
partialWord = ""
} else {
tok.Text = t
@@ -344,7 +359,7 @@ var ignorableTexts = []*regexp.Regexp{
// classification
func removeIgnorableTexts(s string) string {
var out []string
- lines := strings.Split(strings.TrimRight(s, "\n"), "\n")
+ lines := strings.Split(s, "\n")
for _, l := range lines {
line := strings.TrimSpace(l)
var match bool
@@ -360,5 +375,5 @@ func removeIgnorableTexts(s string) string {
out = append(out, "")
}
}
- return strings.Join(out, "\n") + "\n"
+ return strings.Join(out, "\n")
}