Diffstat (limited to 'v2/tokenizer.go')
-rw-r--r--  v2/tokenizer.go  634
1 file changed, 340 insertions, 294 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 875cc7e..0d3917e 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -15,366 +15,412 @@
package classifier
import (
+ "bytes"
"html"
+ "io"
"regexp"
"strings"
"unicode"
"unicode/utf8"
)
-// isSignificant looks for runes that are likely to be the part of English language content
-// of interest in licenses. Notably, it skips over punctuation, looking only for letters
-// or numbers that consistitute the tokens of most interest.
-func isSignificant(r rune) bool {
- return unicode.IsLetter(r) || unicode.IsDigit(r)
-}
-
var eol = "\n"
-func cleanupToken(in string) string {
- r, _ := utf8.DecodeRuneInString(in)
- var out strings.Builder
- if !unicode.IsLetter(r) {
- if unicode.IsDigit(r) {
- // Based on analysis of the license corpus, the characters
- // that are significant are numbers, periods, and dashes. Anything
- // else can be safely discarded, and helps avoid matching failures
- // due to inconsistent whitespacing and formatting.
- for _, c := range in {
- if unicode.IsDigit(c) || c == '.' || c == '-' {
- out.WriteRune(c)
- }
+func header(in string) bool {
+ if len(in) == 0 {
+ return false
+ }
+ p, e := in[:len(in)-1], in[len(in)-1]
+ switch e {
+ case '.', ':', ')':
+ if listMarker[p] {
+ if e != ')' {
+ return true
}
-
- // Numbers should not end in a . since that doesn't indicate a version
- // number, but usually an end of a line.
- res := out.String()
- for strings.HasSuffix(res, ".") {
- res = res[0 : len(res)-1]
+ }
+ // Check for patterns like 1.2.3
+ for _, r := range p {
+ if unicode.IsDigit(r) || r == '.' {
+ continue
}
- return res
+ return false
}
+ return true
}
+ return false
+}
- // Remove internal hyphenization or URL constructs to better normalize
- // strings for matching.
- for _, c := range in {
- if unicode.IsLetter(c) {
- out.WriteRune(c)
- }
+var listMarker = func() map[string]bool {
+ const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
+ l := map[string]bool{}
+ for _, marker := range strings.Split(allListMarkers, " ") {
+ l[marker] = true
}
- return out.String()
+ return l
+}()
+
+// ignorableTexts is a list of lines at the start of the string we can remove
+// to get a cleaner match.
+var ignorableTexts = []*regexp.Regexp{
+ regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
+ regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
+ regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}
-func normalizeDoc(in []byte, normWords bool) (string, Matches) {
- // Apply the global transforms described in SPDX
+func tokenize(in []byte, dict *dictionary, updateDict bool) *indexedDocument {
+ // A bytes.Reader never returns a read error other than io.EOF, so
+ // tokenizeStream cannot fail here and it is safe to ignore the returned
+ // error.
+ id, _ := tokenizeStream(bytes.NewReader(in), true, dict, updateDict)
+ return id
+}
- norm := string(in)
- norm = html.UnescapeString(norm)
- norm = normalizePunctuation(norm)
- norm, matches := removeIgnorableTexts(norm)
+// tokenizeStream reads bytes from src and produces an indexedDocument of its
+// content. tokenizeStream never returns an error of its own; it can only
+// return an error from the provided Reader. If the provided Reader never
+// returns an error, it is safe to assume that tokenizeStream will not return an
+// error.
+func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
+ const bufSize = 1024
+ // The longest UTF-8 encoded rune is 4 bytes, so we keep enough leftover bytes
+ // in the buffer to ensure we never run out of bytes trying to finish
+ // constructing a rune. These leftover 4 bytes will be copied to the start of
+ // the buffer before additional bytes are read.
+ tgt := bufSize - 4
- if normWords {
- norm = normalizeWords(norm)
- }
- return norm, matches
-}
+ rbuf := make([]byte, bufSize)
+ obuf := make([]byte, 0)
+ linebuf := make([]tokenID, 0)
+ idx := 0
+ line := 1 // 1-based line count
+ deferredEOL := false
+ deferredWord := false
+ // The tokenizer uses a local dictionary while analyzing the input doc to
+ // conserve memory and avoid polluting the global dictionary.
+ ld := newDictionary()
-func tokenize(in []byte) *document {
- // tokenize produces a document from the input content.
- text, matches := normalizeDoc(in, true)
- return extractDoc(text, true, matches)
-}
+ var doc indexedDocument
-func extractDoc(text string, removeEol bool, matches Matches) *document {
- var doc document
- doc.Matches = matches
- // Iterate on a line-by-line basis.
- i := 0
- pos := 0
- for {
- // Scan the text for the first likely textual content. The scan ignores punctuation
- // artifacts that include visual boxes for layout as well as comment characters in
- // source files.
- firstInLine := true
- var wid int
- var r rune
-
- if pos == len(text) {
- break
- }
+ isEOF := func(in error) bool {
+ return in == io.EOF || in == io.ErrUnexpectedEOF
+ }
- next := func() {
- r, wid = utf8.DecodeRuneInString(text[pos:])
- pos += wid
+ // Read out the stream in chunks
+ for {
+ // Fill up the buffer with bytes to extract runes from
+ // The read starts at idx so that bytes carried over from the previous
+ // read are preserved.
+ n, err := io.ReadFull(src, rbuf[idx:])
+ if isEOF(err) {
+ // There are no more bytes to read, so we must now consume all bytes in the
+ // buffer.
+ tgt = idx + n
+ } else if err != nil {
+ return nil, err
}
- for pos < len(text) {
- start := pos
- next()
+ for idx = 0; idx < tgt; {
+ r, n := utf8.DecodeRune(rbuf[idx:])
+ idx += n
if r == '\n' {
- doc.Tokens = append(doc.Tokens, &token{
- Text: eol,
- Line: i + 1})
- i++
- }
+ // Deal with the end of a line
- if !isSignificant(r) {
+ // If we are in a word (len(obuf) > 0) and the last rune is a '-',
+ // strike that rune and keep accumulating. Otherwise, treat it like
+ // a space and flush the word.
+
+ if len(obuf) > 0 {
+ if obuf[len(obuf)-1] == '-' {
+ obuf = obuf[0 : len(obuf)-1]
+ deferredEOL = true
+ continue
+ }
+
+ // Append the word fragment to the line buffer
+ linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+ }
+
+ // If there is something in the line to process, do so now
+ if len(linebuf) > 0 {
+ appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+ linebuf = nil
+ obuf = nil
+ }
+ if !normalize {
+ tokID := dict.getIndex(eol)
+ if tokID == unknownIndex {
+ tokID = dict.add(eol)
+ }
+ doc.Tokens = append(doc.Tokens, indexedToken{
+ ID: tokID,
+ Line: line})
+ }
+ line++
continue
}
- // We're at a word/number character.
- for pos < len(text) {
- next()
- if unicode.IsSpace(r) {
- pos -= wid // Will skip this in outer loop
- break
+ if len(obuf) == 0 {
+ if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
+ // A number or word character starts an interesting word. Now we
+ // slurp up all non-space runes and aggregate them into a single
+ // word.
+
+ // Buffer the initial token, normalizing to lower case if needed
+ if normalize {
+ r = unicode.ToLower(r)
+ }
+ obuf = utf8.AppendRune(obuf, r)
}
+ continue
}
- if pos > start {
- if start >= 2 && text[start-2] == '.' && text[start-1] == ' ' {
- // Insert a "soft EOL" that helps detect header-looking entries that
- // follow this text. This resolves problems with licenses that are a
- // very long line of text, motivated by
- // https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109
- //
- // Don't do this if the previous token was already an EOL
- if len(doc.Tokens) > 0 && doc.Tokens[len(doc.Tokens)-1].Text != eol {
- doc.Tokens = append(doc.Tokens, &token{
- Text: eol,
- Line: i + 1})
- }
+ // At this point, len(obuf) > 0 and we are accumulating more runes
+ // to complete a word.
+ if unicode.IsSpace(r) {
+ // If we have a deferred EOL, we need to pick up a non-space character
+ // to resume the hyphenated word, so we just consume spaces until that
+ // happens
+ if deferredEOL {
+ continue
}
- tok := token{
- Text: text[start:pos],
- Line: i + 1,
+ // This is a space between word characters, so we assemble the word as a
+ // token and flush it out.
+ idx -= n
+
+ linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+ if deferredWord {
+ appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+ linebuf = nil
+ deferredWord = false
+ // Increment the line count now so the remainder token is credited
+ // to the previous line number.
+ line++
}
- if firstInLine {
- // Store the prefix material, it is useful to discern some corner cases
- tok.Previous = text[0:start]
+ obuf = make([]byte, 0)
+ continue
+ }
+
+ if deferredEOL {
+ deferredEOL = false
+ deferredWord = true
+ }
+ // Perform token mappings for punctuation to emulate
+ // normalizePunctuation. The mapping yields a string, and each of its
+ // runes needs to be injected.
+ if rep, found := punctuationMappings[r]; found {
+ for _, t := range rep {
+ obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
}
- doc.Tokens = append(doc.Tokens, &tok)
- firstInLine = false
+ continue
}
+
+ // if it's not punctuation, lowercase and buffer the token
+ obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
+ }
+
+ // Break out if we have consumed all read bytes
+ if isEOF(err) {
+ break
}
+
+ // Copy the unconsumed bytes at the end of the buffer to the start
+ // of the buffer so the next read appends after them.
+ n = copy(rbuf, rbuf[idx:])
+ idx = n
+ }
+
+ // Process the remaining bytes in the buffer
+ if len(obuf) > 0 {
+ linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+ }
+ if len(linebuf) > 0 {
+ appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
}
- doc.Tokens = cleanupTokens(doc.Tokens, removeEol)
- return &doc
+ doc.dict = dict
+ doc.generateFrequencies()
+ doc.runes = diffWordsToRunes(&doc, 0, doc.size())
+ doc.Norm = doc.normalized()
+ return &doc, nil
}
-func cleanupTokens(in []*token, removeEol bool) []*token {
- // This routine performs sanitization of tokens. If it is a header-looking
- // token (but not a version number) starting a line, it is removed.
- // Hyphenated words are reassembled.
- partialWord := ""
- var out []*token
- tokIdx := 0
- firstInLine := true
- for i, tok := range in {
- if firstInLine && header(tok) {
+func appendToDoc(doc *indexedDocument, dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool, linebuf []tokenID) {
+ tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
+ if tokens != nil {
+ doc.Tokens = append(doc.Tokens, tokens...)
+ } else if m != nil {
+ doc.Matches = append(doc.Matches, m)
+ }
+}
+
+func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
+ if len(in) == 0 {
+ return nil, nil
+ }
+ var sb strings.Builder
+ for i, r := range in {
+ out := ld.getWord(r)
+ if out == "" {
continue
}
- if tok.Text == eol {
- firstInLine = true
- if removeEol {
- continue
- }
- // If we are reconstructing a hyphenated word, don't append the EOL
- // now, do it when the word is reconstructed.
- if partialWord == "" {
- out = append(out, &token{Text: eol, Line: tok.Line})
- tokIdx++
- }
- continue
+ sb.WriteString(out)
+ if i < len(in)-1 {
+ sb.WriteByte(' ')
}
- firstInLine = false
- t := cleanupToken(tok.Text)
- // If this is the last token in a line, and it looks like a hyphenated
- // word, store it for reassembly.
- if strings.HasSuffix(tok.Text, "-") && i+1 < len(in) && in[i+1].Text == eol {
- partialWord = t
- } else if partialWord != "" {
- // Repair hyphenated words
- tp := in[i-1]
- tp.Text = partialWord + t
- tp.Previous = ""
- out = append(out, tp)
- tokIdx++
- if !removeEol {
- // Append the EOL now that the whole word is recovered
- out = append(out, &token{Text: eol, Line: tp.Line})
- tokIdx++
- }
+ }
- partialWord = ""
- } else {
- tok.Text = t
- tok.Previous = ""
- out = append(out, tok)
- tokIdx++
+ out := sb.String()
+
+ for _, re := range ignorableTexts {
+ if re.MatchString(out) {
+ return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
}
}
- return out
-}
-// interchangeablePunctutation is punctuation that can be normalized.
-var interchangeablePunctuation = []struct {
- interchangeable string
- substitute string
-}{
- // Hyphen, Dash, En Dash, and Em Dash.
- {`-‒–—‐`, "-"},
- // Single, Double, Curly Single, and Curly Double.
- {"'\"`‘’“”", "'"},
- // Copyright.
- {"©", "(c)"},
- // Currency and Section. (Different copies of the CDDL use each marker.)
- {"§¤", "(s)"},
- // Middle Dot
- {"·", " "},
- {"*", " "},
-}
-
-// normalizePunctuation takes all hyphens and quotes and normalizes them.
-func normalizePunctuation(s string) string {
- for _, iw := range interchangeablePunctuation {
- for _, in := range strings.Split(iw.interchangeable, "") {
- s = strings.ReplaceAll(s, in, iw.substitute)
+ var tokens []indexedToken
+ for i, r := range in {
+ txt := cleanupToken(i, ld.getWord(r), normalize)
+ if txt != "" {
+ var tokID tokenID
+ if updateDict {
+ tokID = dict.add(txt)
+ } else {
+ tokID = dict.getIndex(txt)
+ }
+ tokens = append(tokens, indexedToken{
+ Line: line,
+ ID: tokID,
+ })
}
}
- return s
+
+ return tokens, nil
}
-// interchangeableWords are words we can substitute for a normalized form
-// without changing the meaning of the license. See
-// https://spdx.org/spdx-license-list/matching-guidelines for the list.
-var interchangeableWords = []struct {
- interchangeable *regexp.Regexp
- substitute string
-}{
- {regexp.MustCompile("acknowledgement"), "acknowledgment"},
- {regexp.MustCompile("analogue"), "analog"},
- {regexp.MustCompile("analyse"), "analyze"},
- {regexp.MustCompile("artefact"), "artifact"},
- {regexp.MustCompile("authorisation"), "authorization"},
- {regexp.MustCompile("authorised"), "authorized"},
- {regexp.MustCompile("calibre"), "caliber"},
- {regexp.MustCompile("cancelled"), "canceled"},
- {regexp.MustCompile("capitalisations"), "capitalizations"},
- {regexp.MustCompile("catalogue"), "catalog"},
- {regexp.MustCompile("categorise"), "categorize"},
- {regexp.MustCompile("centre"), "center"},
- {regexp.MustCompile("emphasised"), "emphasized"},
- {regexp.MustCompile("favour"), "favor"},
- {regexp.MustCompile("favourite"), "favorite"},
- {regexp.MustCompile("fulfil\\b"), "fulfill"},
- {regexp.MustCompile("fulfilment"), "fulfillment"},
- {regexp.MustCompile("https"), "http"},
- {regexp.MustCompile("initialise"), "initialize"},
- {regexp.MustCompile("judgment"), "judgement"},
- {regexp.MustCompile("labelling"), "labeling"},
- {regexp.MustCompile("labour"), "labor"},
- {regexp.MustCompile("licence"), "license"},
- {regexp.MustCompile("maximise"), "maximize"},
- {regexp.MustCompile("modelled"), "modeled"},
- {regexp.MustCompile("modelling"), "modeling"},
- {regexp.MustCompile("offence"), "offense"},
- {regexp.MustCompile("optimise"), "optimize"},
- {regexp.MustCompile("organisation"), "organization"},
- {regexp.MustCompile("organise"), "organize"},
- {regexp.MustCompile("practise"), "practice"},
- {regexp.MustCompile("programme"), "program"},
- {regexp.MustCompile("realise"), "realize"},
- {regexp.MustCompile("recognise"), "recognize"},
- {regexp.MustCompile("signalling"), "signaling"},
- {regexp.MustCompile("sub[ -]license"), "sublicense"},
- {regexp.MustCompile("utilisation"), "utilization"},
- {regexp.MustCompile("whilst"), "while"},
- {regexp.MustCompile("wilful"), "wilfull"},
- {regexp.MustCompile("non[ -]commercial"), "noncommercial"},
- {regexp.MustCompile("per cent"), "percent"},
+func normalizeToken(in string) string {
+ // This performs some preprocessing on the token. It differs from
+ // cleanupToken in that the fixups here are not exact matches on the
+ // whole token. Normalizing URLs from https to http is an example of a
+ // fix applied here.
+ return strings.ReplaceAll(in, "https", "http")
}
-// normalizeWords remaps equivalent words that are interchangeable and lowercases
-// the word to allow for exact matching.
-func normalizeWords(s string) string {
- s = strings.ToLower(s)
- for _, iw := range interchangeableWords {
- s = iw.interchangeable.ReplaceAllString(s, iw.substitute)
- }
- return s
+func flushBuf(pos int, obuf []byte, normalizeWord bool, ld *dictionary) tokenID {
+ // clean up the contents of the rune buffer
+ token := string(obuf)
+ // Escape sequences can occur anywhere in the string, not just the
+ // beginning, so always attempt to unescape the word's content.
+ token = html.UnescapeString(token)
+
+ clean := normalizeToken(token)
+
+ return ld.add(clean)
}
-func header(tok *token) bool {
- in := tok.Text
- p, e := in[:len(in)-1], in[len(in)-1]
- switch e {
- case '.', ':', ')':
- if listMarker[p] {
- if e != ')' {
- return true
+func cleanupToken(pos int, in string, normalizeWord bool) string {
+ r, _ := utf8.DecodeRuneInString(in)
+ var out strings.Builder
+ if pos == 0 && header(in) {
+ return ""
+ }
+
+ if !unicode.IsLetter(r) {
+ if unicode.IsDigit(r) {
+ // Based on analysis of the license corpus, the characters that are
+ // significant are numbers, periods, and dashes. Anything else can be
+ // safely discarded, and helps avoid matching failures due to inconsistent
+ // whitespacing and formatting.
+ for _, c := range in {
+ if unicode.IsDigit(c) || c == '.' || c == '-' {
+ out.WriteRune(c)
+ }
}
- // Sometimes an internal reference like "(ii)" from NPL-1.02.txt
- // endds up at the beginning of a line. In that case, it's
- // not actually a header.
- if e == ')' && !strings.HasSuffix(tok.Previous, "(") {
- return true
+
+ // Numbers should not end in a . since that doesn't indicate a version
+ // number, but usually an end of a line.
+ res := out.String()
+ for strings.HasSuffix(res, ".") {
+ res = res[0 : len(res)-1]
}
+ return res
}
- // Check for patterns like 1.2.3
- for _, r := range p {
- if unicode.IsDigit(r) || r == '.' {
- continue
- }
- return false
+ }
+
+ // Remove internal hyphenation or URL constructs to better normalize strings
+ // for matching.
+
+ for _, c := range in {
+ if unicode.IsLetter(c) {
+ out.WriteRune(c)
}
- return true
}
- return false
-}
-var listMarker = func() map[string]bool {
- const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
- l := map[string]bool{}
- for _, marker := range strings.Split(allListMarkers, " ") {
- l[marker] = true
+ tok := out.String()
+ if !normalizeWord {
+ return tok
}
- return l
-}()
-// ignorableTexts is a list of lines at the start of the string we can remove
-// to get a cleaner match.
-var ignorableTexts = []*regexp.Regexp{
- regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
- regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
- regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
+ if iw, ok := interchangeableWords[tok]; ok && normalizeWord {
+ return iw
+ }
+ return tok
}
-// removeIgnorableTexts removes common text, which is not important for
-// classification
-func removeIgnorableTexts(s string) (string, Matches) {
- var out []string
- var matches Matches
- lines := strings.Split(s, "\n")
- for i, l := range lines {
- line := strings.TrimSpace(l)
- var match bool
- for _, re := range ignorableTexts {
- if re.MatchString(line) {
- match = true
- }
- }
- if !match {
- out = append(out, l)
- } else {
- // We want to preserve line presence for the positional information
- out = append(out, "")
- matches = append(matches, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: i + 1, EndLine: i + 1})
- }
- }
- return strings.Join(out, "\n"), matches
+var interchangeableWords = map[string]string{
+ "analyse": "analyze",
+ "artefact": "artifact",
+ "authorisation": "authorization",
+ "authorised": "authorized",
+ "calibre": "caliber",
+ "cancelled": "canceled",
+ "capitalisations": "capitalizations",
+ "catalogue": "catalog",
+ "categorise": "categorize",
+ "centre": "center",
+ "emphasised": "emphasized",
+ "favour": "favor",
+ "favourite": "favorite",
+ "fulfil": "fulfill",
+ "fulfilment": "fulfillment",
+ "https": "http",
+ "initialise": "initialize",
+ "judgment": "judgement",
+ "labelling": "labeling",
+ "labour": "labor",
+ "licence": "license",
+ "maximise": "maximize",
+ "modelled": "modeled",
+ "modelling": "modeling",
+ "offence": "offense",
+ "optimise": "optimize",
+ "organisation": "organization",
+ "organise": "organize",
+ "practise": "practice",
+ "programme": "program",
+ "realise": "realize",
+ "recognise": "recognize",
+ "signalling": "signaling",
+ "utilisation": "utilization",
+ "whilst": "while",
+ "wilful": "wilfull",
+ // TODO: These three need tokenizer magic
+ "non commercial": "noncommercial",
+ "per cent": "percent",
+ "sub license": "sublicense",
+}
+
+var punctuationMappings = map[rune]string{
+ '-': "-",
+ '‒': "-",
+ '–': "-",
+ '—': "-",
+ '‐': "-",
+ '©': "(c)",
+ '§': "(s)",
+ '¤': "(s)",
+ '·': " ",
+ '*': " ",
}
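
A minimal, standalone sketch of the chunked-read scheme tokenizeStream uses: runes are decoded only up to bufSize-4 so a rune split across two reads is never decoded partially, and the undecoded tail is copied to the front of the buffer before the next read. This is not the classifier's API; decodeRunes and its buffer size are made up for illustration.

package main

import (
	"fmt"
	"io"
	"strings"
	"unicode/utf8"
)

// decodeRunes reads src in fixed-size chunks and decodes complete runes,
// carrying at most utf8.UTFMax unfinished bytes from one read to the next.
func decodeRunes(src io.Reader) ([]rune, error) {
	const bufSize = 1024
	buf := make([]byte, bufSize)
	tgt := bufSize - utf8.UTFMax // never start decoding in the last 4 bytes
	var out []rune
	idx := 0 // number of carried-over bytes at the front of buf
	for {
		n, err := io.ReadFull(src, buf[idx:])
		atEOF := err == io.EOF || err == io.ErrUnexpectedEOF
		if atEOF {
			tgt = idx + n // no more input: consume everything still buffered
		} else if err != nil {
			return nil, err
		}
		for idx = 0; idx < tgt; {
			r, w := utf8.DecodeRune(buf[idx:])
			out = append(out, r)
			idx += w
		}
		if atEOF {
			return out, nil
		}
		idx = copy(buf, buf[idx:]) // carry the unfinished rune forward
	}
}

func main() {
	runes, err := decodeRunes(strings.NewReader("héllo wörld"))
	if err != nil {
		panic(err)
	}
	fmt.Println(string(runes)) // héllo wörld
}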
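
The deferredEOL and deferredWord flags above stitch hyphenated words back together across line breaks. A simplified, hypothetical sketch of the same idea (rejoinHyphenated is a made-up name; it works on whole lines and ignores line numbers and the dictionaries):

package main

import (
	"fmt"
	"strings"
)

// rejoinHyphenated drops a trailing "-" at the end of a line and joins the
// fragment with the first word of the following line, so "calcu-\nlated"
// comes out as the single word "calculated".
func rejoinHyphenated(text string) []string {
	var out []string
	carry := ""
	for _, line := range strings.Split(text, "\n") {
		words := strings.Fields(line)
		for i, w := range words {
			if carry != "" {
				w = carry + w
				carry = ""
			}
			if i == len(words)-1 && strings.HasSuffix(w, "-") {
				carry = strings.TrimSuffix(w, "-")
				continue
			}
			out = append(out, w)
		}
	}
	if carry != "" {
		out = append(out, carry)
	}
	return out
}

func main() {
	fmt.Println(rejoinHyphenated("the fee is calcu-\nlated per use"))
	// [the fee is calculated per use]
}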
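
The updateDict flag and the local dictionary ld control which words reach the shared dictionary: tokens are interned locally first, and dict.add is only called when updateDict is true (presumably when indexing the license corpus), so scanned target documents do not grow the global dictionary. A hypothetical, minimal stand-in for that interning scheme:

package main

import "fmt"

// dict interns words as small integer IDs so documents can be compared as
// []int instead of []string. It is a made-up miniature of the classifier's
// dictionary type.
type dict struct {
	ids   map[string]int
	words []string
}

func newDict() *dict { return &dict{ids: map[string]int{}} }

func (d *dict) add(w string) int {
	if id, ok := d.ids[w]; ok {
		return id
	}
	id := len(d.words)
	d.ids[w] = id
	d.words = append(d.words, w)
	return id
}

// getIndex returns -1 for unknown words, analogous to unknownIndex.
func (d *dict) getIndex(w string) int {
	if id, ok := d.ids[w]; ok {
		return id
	}
	return -1
}

func main() {
	local, global := newDict(), newDict()
	for _, w := range []string{"license", "grant", "license"} {
		local.add(w) // analyze the document without touching global
	}
	fmt.Println(local.getIndex("license"), global.getIndex("license")) // 0 -1
}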