diff options
Diffstat (limited to 'v2/tokenizer.go')
-rw-r--r-- | v2/tokenizer.go | 45 |
1 files changed, 30 insertions, 15 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go index eaa0479..885eab3 100644 --- a/v2/tokenizer.go +++ b/v2/tokenizer.go @@ -83,10 +83,10 @@ func normalizeDoc(in []byte, normWords bool) string { func tokenize(in []byte) *document { // tokenize produces a document from the input content. text := normalizeDoc(in, true) - return extractDoc(text) + return extractDoc(text, true) } -func extractDoc(text string) *document { +func extractDoc(text string, removeEol bool) *document { var doc document // Iterate on a line-by-line basis. i := 0 @@ -138,9 +138,13 @@ func extractDoc(text string) *document { // follow this text. This resolves problems with licenses that are a // very long line of text, motivated by // https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109 - doc.Tokens = append(doc.Tokens, &token{ - Text: eol, - Line: i + 1}) + // + // Don't do this if the previous token was already an EOL + if len(doc.Tokens) > 0 && doc.Tokens[len(doc.Tokens)-1].Text != eol { + doc.Tokens = append(doc.Tokens, &token{ + Text: eol, + Line: i + 1}) + } } tok := token{ @@ -155,17 +159,13 @@ func extractDoc(text string) *document { firstInLine = false } } - tok := token{ - Text: eol, - Line: i + 1, - } - doc.Tokens = append(doc.Tokens, &tok) } - doc.Tokens = cleanupTokens(doc.Tokens) + + doc.Tokens = cleanupTokens(doc.Tokens, removeEol) return &doc } -func cleanupTokens(in []*token) []*token { +func cleanupTokens(in []*token, removeEol bool) []*token { // This routine performs sanitization of tokens. If it is a header-looking // token (but not a version number) starting a line, it is removed. // Hyphenated words are reassembled. @@ -179,13 +179,22 @@ func cleanupTokens(in []*token) []*token { } if tok.Text == eol { firstInLine = true + if removeEol { + continue + } + // If we are reconstructing a hyphenated word, don't append the EOL + // now, do it when the word is reconstructed. + if partialWord == "" { + out = append(out, &token{Text: eol, Line: tok.Line, Index: tokIdx}) + tokIdx++ + } continue } firstInLine = false t := cleanupToken(tok.Text) // If this is the last token in a line, and it looks like a hyphenated // word, store it for reassembly. - if strings.HasSuffix(tok.Text, "-") && in[i+1].Text == eol { + if strings.HasSuffix(tok.Text, "-") && i+1 < len(in) && in[i+1].Text == eol { partialWord = t } else if partialWord != "" { // Repair hyphenated words @@ -195,6 +204,12 @@ func cleanupTokens(in []*token) []*token { tp.Previous = "" out = append(out, tp) tokIdx++ + if !removeEol { + // Append the EOL now that the whole word is recovered + out = append(out, &token{Text: eol, Line: tp.Line, Index: tokIdx}) + tokIdx++ + } + partialWord = "" } else { tok.Text = t @@ -344,7 +359,7 @@ var ignorableTexts = []*regexp.Regexp{ // classification func removeIgnorableTexts(s string) string { var out []string - lines := strings.Split(strings.TrimRight(s, "\n"), "\n") + lines := strings.Split(s, "\n") for _, l := range lines { line := strings.TrimSpace(l) var match bool @@ -360,5 +375,5 @@ func removeIgnorableTexts(s string) string { out = append(out, "") } } - return strings.Join(out, "\n") + "\n" + return strings.Join(out, "\n") } |