about summary refs log tree commit diff
diff options
context:
space:
mode:
author Google Open Source <noreply+opensource@google.com> 2021-12-13 15:51:00 -0800
committer Bill Neubauer <wcn@google.com> 2022-03-16 15:36:54 -0700
commit 926575c44bbc848f47f0e8b7b6a7d5f325a6ed3a (patch)
tree cc8f0c6559142a443e4c95c53bbe2d4d2453b07e
parent 9fc6ed6d865bf3329927b98d0da5591381969260 (diff)
download licenseclassifier-926575c44bbc848f47f0e8b7b6a7d5f325a6ed3a.tar.gz
Automated g4 rollback of changelist 415285962.
*** Reason for rollback *** Crashing services with index out of bound (https://coroner.corp.google.com/#/crashes?user=compliance-presubmit3p) *** Original change description *** Fixes handling of newline characters so that Normalize preserves the newline characters of the original input. The code doing the tokenization of the newlines had some minor bugs that resulted in spurious newlines being introduced into the token stream. This wasn't a problem before since they were only used inside the tokenizer to detect header constructs and de-hyphenate words, and were always removed from the token stream passed to calling functions. This meant that the token stream Normalize... *** PiperOrigin-RevId: 416154051
-rw-r--r-- v2/classifier.go      | 22
-rw-r--r-- v2/classifier_test.go | 28
-rw-r--r-- v2/tokenizer.go       | 43
3 files changed, 18 insertions(+), 75 deletions(-)
diff --git a/v2/classifier.go b/v2/classifier.go
index 05937de..0668254 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -223,7 +223,7 @@ func NewClassifier(threshold float64) *Classifier {
// return the same results as Match(in).
func (c *Classifier) Normalize(in []byte) []byte {
text := normalizeDoc(in, false)
- doc := extractDoc(text, false)
+ doc := extractDoc(text)
var buf bytes.Buffer
@@ -235,25 +235,11 @@ func (c *Classifier) Normalize(in []byte) []byte {
return buf.Bytes()
}
- prevLine := 1
buf.WriteString(doc.Tokens[0].Text)
- for _, t := range doc.Tokens[1:] {
- // Only write out an EOL token that incremented the line
- if t.Line == prevLine+1 {
- buf.WriteString("\n")
- }
-
- // Only write tokens that aren't EOL
- if t.Text != eol {
- // Only put a space between tokens if the previous token was on the same
- // line. This prevents spaces after an EOL
- if t.Line == prevLine {
- buf.WriteString(" ")
- }
- buf.WriteString(t.Text)
- }
- prevLine = t.Line
+ for _, t := range doc.Tokens[1:] {
+ buf.WriteString(" ")
+ buf.WriteString(t.Text)
}
return buf.Bytes()
}
diff --git a/v2/classifier_test.go b/v2/classifier_test.go
index f9d2c74..cb613e5 100644
--- a/v2/classifier_test.go
+++ b/v2/classifier_test.go
@@ -327,34 +327,6 @@ func TestNormalize(t *testing.T) {
input: " License ",
want: "License",
},
- {
- // This tests that the line breaks in the input text are properly
- // preserved, which is important for visual diffing.
- input: `Preserving
-line
-
-breaks is important`,
- want: `Preserving
-line
-
-breaks is important`,
- },
- {
- // This tests that soft EOL functionality doesn't affect normalized output
- input: `This is a sentence looking construct. This is another sentence. What happens?`,
- want: `This is a sentence looking construct This is another sentence What happens`,
- },
- {
- input: `header
-........................ This is oddly formatted`,
- want: `header
-This is oddly formatted`,
- },
- {
- input: `baseball basket-
-ball football`,
- want: "baseball basketball\nfootball",
- },
}
for _, tt := range tests {
t.Run(tt.input, func(t *testing.T) {
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 001eaba..eaa0479 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -83,10 +83,10 @@ func normalizeDoc(in []byte, normWords bool) string {
func tokenize(in []byte) *document {
// tokenize produces a document from the input content.
text := normalizeDoc(in, true)
- return extractDoc(text, true)
+ return extractDoc(text)
}
-func extractDoc(text string, removeEol bool) *document {
+func extractDoc(text string) *document {
var doc document
// Iterate on a line-by-line basis.
i := 0
@@ -138,13 +138,9 @@ func extractDoc(text string, removeEol bool) *document {
// follow this text. This resolves problems with licenses that are a
// very long line of text, motivated by
// https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109
- //
- // Don't do this if the previous token was already an EOL
- if doc.Tokens[len(doc.Tokens)-1].Text != eol {
- doc.Tokens = append(doc.Tokens, &token{
- Text: eol,
- Line: i + 1})
- }
+ doc.Tokens = append(doc.Tokens, &token{
+ Text: eol,
+ Line: i + 1})
}
tok := token{
@@ -159,13 +155,17 @@ func extractDoc(text string, removeEol bool) *document {
firstInLine = false
}
}
+ tok := token{
+ Text: eol,
+ Line: i + 1,
+ }
+ doc.Tokens = append(doc.Tokens, &tok)
}
-
- doc.Tokens = cleanupTokens(doc.Tokens, removeEol)
+ doc.Tokens = cleanupTokens(doc.Tokens)
return &doc
}
-func cleanupTokens(in []*token, removeEol bool) []*token {
+func cleanupTokens(in []*token) []*token {
// This routine performs sanitization of tokens. If it is a header-looking
// token (but not a version number) starting a line, it is removed.
// Hyphenated words are reassembled.
@@ -179,15 +179,6 @@ func cleanupTokens(in []*token, removeEol bool) []*token {
}
if tok.Text == eol {
firstInLine = true
- if removeEol {
- continue
- }
- // If we are reconstructing a hyphenated word, don't append the EOL
- // now, do it when the word is reconstructed.
- if partialWord == "" {
- out = append(out, &token{Text: eol, Line: tok.Line, Index: tokIdx})
- tokIdx++
- }
continue
}
firstInLine = false
@@ -204,12 +195,6 @@ func cleanupTokens(in []*token, removeEol bool) []*token {
tp.Previous = ""
out = append(out, tp)
tokIdx++
- if !removeEol {
- // Append the EOL now that the whole word is recovered
- out = append(out, &token{Text: eol, Line: tp.Line, Index: tokIdx})
- tokIdx++
- }
-
partialWord = ""
} else {
tok.Text = t
@@ -359,7 +344,7 @@ var ignorableTexts = []*regexp.Regexp{
// classification
func removeIgnorableTexts(s string) string {
var out []string
- lines := strings.Split(s, "\n")
+ lines := strings.Split(strings.TrimRight(s, "\n"), "\n")
for _, l := range lines {
line := strings.TrimSpace(l)
var match bool
@@ -375,5 +360,5 @@ func removeIgnorableTexts(s string) string {
out = append(out, "")
}
}
- return strings.Join(out, "\n")
+ return strings.Join(out, "\n") + "\n"
}