Fixes handling of newline characters so that Normalize preserves the newline characters of the original input.

Includes fixes for out-of-bounds (OOB) array accesses detected by fuzzing and by the crash logs from the service. The code that tokenizes newlines had some minor bugs that resulted in spurious newlines being introduced into the token stream. This wasn't a problem before, since the newline tokens were only used inside the tokenizer to detect header constructs and de-hyphenate words, and were always removed from the token stream passed to calling functions. This meant that the token stream Normalize was reassembling had no newlines, so it just produced one big line of text. Fixing Normalize to preserve the original newlines required fixing these glitches. Added more test cases to cover the different newline-related scenarios, and new scenario files based on fuzzer findings. *** PiperOrigin-RevId: 421331357
author: Bill Neubauer <wcn@google.com> 2022-01-12 10:15:37 -0800
committer: Bill Neubauer <wcn@google.com> 2022-03-16 15:37:22 -0700
commit: b2b19b3c53333625136af32ef9ebee41f73dc1cd (patch)
tree: e4dcc1edf9f2b01cc1ca911a6d3e5a793bb928b4 /v2/tokenizer.go
parent: 4a94a4b2dc7ec0119164ed69649bc96bc2b1025d (diff)
download: licenseclassifier-b2b19b3c53333625136af32ef9ebee41f73dc1cd.tar.gz
1 files changed, 30 insertions, 15 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index eaa0479..885eab3 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -83,10 +83,10 @@ func normalizeDoc(in []byte, normWords bool) string {
 func tokenize(in []byte) *document {
 	// tokenize produces a document from the input content.
 	text := normalizeDoc(in, true)
-	return extractDoc(text)
+	return extractDoc(text, true)
 }
 
-func extractDoc(text string) *document {
+func extractDoc(text string, removeEol bool) *document {
 	var doc document
 	// Iterate on a line-by-line basis.
 	i := 0
@@ -138,9 +138,13 @@ func extractDoc(text string) *document {
 					// follow this text. This resolves problems with licenses that are a
 					// very long line of text, motivated by
 					// https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109
-					doc.Tokens = append(doc.Tokens, &token{
-						Text: eol,
-						Line: i + 1})
+					//
+					// Don't do this if the previous token was already an EOL
+					if len(doc.Tokens) > 0 && doc.Tokens[len(doc.Tokens)-1].Text != eol {
+						doc.Tokens = append(doc.Tokens, &token{
+							Text: eol,
+							Line: i + 1})
+					}
 				}
 
 				tok := token{
@@ -155,17 +159,13 @@ func extractDoc(text string) *document {
 				firstInLine = false
 			}
 		}
-		tok := token{
-			Text: eol,
-			Line: i + 1,
-		}
-		doc.Tokens = append(doc.Tokens, &tok)
 	}
-	doc.Tokens = cleanupTokens(doc.Tokens)
+
+	doc.Tokens = cleanupTokens(doc.Tokens, removeEol)
 	return &doc
 }
 
-func cleanupTokens(in []*token) []*token {
+func cleanupTokens(in []*token, removeEol bool) []*token {
 	// This routine performs sanitization of tokens. If it is a header-looking
 	// token (but not a version number) starting a line, it is removed.
 	// Hyphenated words are reassembled.
@@ -179,13 +179,22 @@ func cleanupTokens(in []*token) []*token {
 		}
 		if tok.Text == eol {
 			firstInLine = true
+			if removeEol {
+				continue
+			}
+			// If we are reconstructing a hyphenated word, don't append the EOL
+			// now, do it when the word is reconstructed.
+			if partialWord == "" {
+				out = append(out, &token{Text: eol, Line: tok.Line, Index: tokIdx})
+				tokIdx++
+			}
 			continue
 		}
 		firstInLine = false
 		t := cleanupToken(tok.Text)
 		// If this is the last token in a line, and it looks like a hyphenated
 		// word, store it for reassembly.
-		if strings.HasSuffix(tok.Text, "-") && in[i+1].Text == eol {
+		if strings.HasSuffix(tok.Text, "-") && i+1 < len(in) && in[i+1].Text == eol {
 			partialWord = t
 		} else if partialWord != "" {
 			// Repair hyphenated words
@@ -195,6 +204,12 @@ func cleanupTokens(in []*token) []*token {
 			tp.Previous = ""
 			out = append(out, tp)
 			tokIdx++
+			if !removeEol {
+				// Append the EOL now that the whole word is recovered
+				out = append(out, &token{Text: eol, Line: tp.Line, Index: tokIdx})
+				tokIdx++
+			}
+
 			partialWord = ""
 		} else {
 			tok.Text = t
@@ -344,7 +359,7 @@ var ignorableTexts = []*regexp.Regexp{
 // classification
 func removeIgnorableTexts(s string) string {
 	var out []string
-	lines := strings.Split(strings.TrimRight(s, "\n"), "\n")
+	lines := strings.Split(s, "\n")
 	for _, l := range lines {
 		line := strings.TrimSpace(l)
 		var match bool
@@ -360,5 +375,5 @@ func removeIgnorableTexts(s string) string {
 			out = append(out, "")
 		}
 	}
-	return strings.Join(out, "\n") + "\n"
+	return strings.Join(out, "\n")
 }
author	Bill Neubauer <wcn@google.com>	2022-01-12 10:15:37 -0800
committer	Bill Neubauer <wcn@google.com>	2022-03-16 15:37:22 -0700
commit	b2b19b3c53333625136af32ef9ebee41f73dc1cd (patch)
tree	e4dcc1edf9f2b01cc1ca911a6d3e5a793bb928b4 /v2/tokenizer.go
parent	4a94a4b2dc7ec0119164ed69649bc96bc2b1025d (diff)
download	licenseclassifier-b2b19b3c53333625136af32ef9ebee41f73dc1cd.tar.gz