about | summary | refs | log | tree | commit | diff
path: root/v2/tokenizer.go
diff options
context:
space:
mode:
Diffstat (limited to 'v2/tokenizer.go')
-rw-r--r-- v2/tokenizer.go 45
1 files changed, 30 insertions, 15 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index eaa0479..885eab3 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -83,10 +83,10 @@ func normalizeDoc(in []byte, normWords bool) string {
func tokenize(in []byte) *document {
// tokenize produces a document from the input content.
text := normalizeDoc(in, true)
- return extractDoc(text)
+ return extractDoc(text, true)
}
-func extractDoc(text string) *document {
+func extractDoc(text string, removeEol bool) *document {
var doc document
// Iterate on a line-by-line basis.
i := 0
@@ -138,9 +138,13 @@ func extractDoc(text string) *document {
// follow this text. This resolves problems with licenses that are a
// very long line of text, motivated by
// https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109
- doc.Tokens = append(doc.Tokens, &token{
- Text: eol,
- Line: i + 1})
+ //
+ // Don't do this if there is no previous token or it was already an EOL.
+ if len(doc.Tokens) > 0 && doc.Tokens[len(doc.Tokens)-1].Text != eol {
+ doc.Tokens = append(doc.Tokens, &token{
+ Text: eol,
+ Line: i + 1})
+ }
}
tok := token{
@@ -155,17 +159,13 @@ func extractDoc(text string) *document {
firstInLine = false
}
}
- tok := token{
- Text: eol,
- Line: i + 1,
- }
- doc.Tokens = append(doc.Tokens, &tok)
}
- doc.Tokens = cleanupTokens(doc.Tokens)
+
+ doc.Tokens = cleanupTokens(doc.Tokens, removeEol)
return &doc
}
-func cleanupTokens(in []*token) []*token {
+func cleanupTokens(in []*token, removeEol bool) []*token {
// This routine performs sanitization of tokens. If it is a header-looking
// token (but not a version number) starting a line, it is removed.
// Hyphenated words are reassembled.
@@ -179,13 +179,22 @@ func cleanupTokens(in []*token) []*token {
}
if tok.Text == eol {
firstInLine = true
+ if removeEol {
+ continue
+ }
+ // If we are reconstructing a hyphenated word, don't append the EOL
+ // now, do it when the word is reconstructed.
+ if partialWord == "" {
+ out = append(out, &token{Text: eol, Line: tok.Line, Index: tokIdx})
+ tokIdx++
+ }
continue
}
firstInLine = false
t := cleanupToken(tok.Text)
// If this is the last token in a line, and it looks like a hyphenated
// word, store it for reassembly.
- if strings.HasSuffix(tok.Text, "-") && in[i+1].Text == eol {
+ if strings.HasSuffix(tok.Text, "-") && i+1 < len(in) && in[i+1].Text == eol {
partialWord = t
} else if partialWord != "" {
// Repair hyphenated words
@@ -195,6 +204,12 @@ func cleanupTokens(in []*token) []*token {
tp.Previous = ""
out = append(out, tp)
tokIdx++
+ if !removeEol {
+ // Append the EOL now that the whole word is recovered
+ out = append(out, &token{Text: eol, Line: tp.Line, Index: tokIdx})
+ tokIdx++
+ }
+
partialWord = ""
} else {
tok.Text = t
@@ -344,7 +359,7 @@ var ignorableTexts = []*regexp.Regexp{
// classification
func removeIgnorableTexts(s string) string {
var out []string
- lines := strings.Split(strings.TrimRight(s, "\n"), "\n")
+ lines := strings.Split(s, "\n")
for _, l := range lines {
line := strings.TrimSpace(l)
var match bool
@@ -360,5 +375,5 @@ func removeIgnorableTexts(s string) string {
out = append(out, "")
}
}
- return strings.Join(out, "\n") + "\n"
+ return strings.Join(out, "\n")
}