Diffstat (limited to 'v2/tokenizer.go')
-rw-r--r-- | v2/tokenizer.go | 634
1 file changed, 340 insertions, 294 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 875cc7e..0d3917e 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -15,366 +15,412 @@ package classifier
 
 import (
+	"bytes"
 	"html"
+	"io"
 	"regexp"
 	"strings"
 	"unicode"
 	"unicode/utf8"
 )
 
-// isSignificant looks for runes that are likely to be the part of English language content
-// of interest in licenses. Notably, it skips over punctuation, looking only for letters
-// or numbers that consistitute the tokens of most interest.
-func isSignificant(r rune) bool {
-	return unicode.IsLetter(r) || unicode.IsDigit(r)
-}
-
 var eol = "\n"
 
-func cleanupToken(in string) string {
-	r, _ := utf8.DecodeRuneInString(in)
-	var out strings.Builder
-	if !unicode.IsLetter(r) {
-		if unicode.IsDigit(r) {
-			// Based on analysis of the license corpus, the characters
-			// that are significant are numbers, periods, and dashes. Anything
-			// else can be safely discarded, and helps avoid matching failures
-			// due to inconsistent whitespacing and formatting.
-			for _, c := range in {
-				if unicode.IsDigit(c) || c == '.' || c == '-' {
-					out.WriteRune(c)
-				}
+func header(in string) bool {
+	if len(in) == 0 {
+		return false
+	}
+	p, e := in[:len(in)-1], in[len(in)-1]
+	switch e {
+	case '.', ':', ')':
+		if listMarker[p] {
+			if e != ')' {
+				return true
 			}
-
-			// Numbers should not end in a . since that doesn't indicate a version
-			// number, but usually an end of a line.
-			res := out.String()
-			for strings.HasSuffix(res, ".") {
-				res = res[0 : len(res)-1]
+		}
+		// Check for patterns like 1.2.3
+		for _, r := range p {
+			if unicode.IsDigit(r) || r == '.' {
+				continue
 			}
-			return res
+			return false
 		}
+		return true
 	}
+	return false
+}
 
-	// Remove internal hyphenization or URL constructs to better normalize
-	// strings for matching.
-	for _, c := range in {
-		if unicode.IsLetter(c) {
-			out.WriteRune(c)
-		}
+var listMarker = func() map[string]bool {
+	const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
+	l := map[string]bool{}
+	for _, marker := range strings.Split(allListMarkers, " ") {
+		l[marker] = true
 	}
-	return out.String()
+	return l
+}()
+
+// ignorableTexts is a list of lines at the start of the string we can remove
+// to get a cleaner match.
+var ignorableTexts = []*regexp.Regexp{
+	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
+	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
+	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
 }
 
-func normalizeDoc(in []byte, normWords bool) (string, Matches) {
-	// Apply the global transforms described in SPDX
+func tokenize(in []byte, dict *dictionary, updateDict bool) *indexedDocument {
+	// Since bytes.NewReader().Read() will never return an error, tokenizeStream
+	// will never return an error, so it's okay to ignore the return value in
+	// this case.
+	id, _ := tokenizeStream(bytes.NewReader(in), true, dict, updateDict)
+	return id
+}
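The new header predicate and listMarker table above drive the removal of list markers such as "1.", "a:", or "iv." from the start of a line. Here is a minimal, self-contained sketch of that heuristic; isHeader is a hypothetical standalone rename of the unexported header function above, with the same logic:

```go
package main

import (
	"fmt"
	"strings"
	"unicode"
)

var listMarker = func() map[string]bool {
	const all = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
	l := map[string]bool{}
	for _, m := range strings.Split(all, " ") {
		l[m] = true
	}
	return l
}()

// isHeader mirrors the header predicate in this change: a token is a list
// header when it is a known list marker followed by '.' or ':', or when it
// is a dotted number such as "1.2.3" followed by '.', ':', or ')'.
func isHeader(in string) bool {
	if len(in) == 0 {
		return false
	}
	p, e := in[:len(in)-1], in[len(in)-1]
	switch e {
	case '.', ':', ')':
		if listMarker[p] && e != ')' {
			return true
		}
		// Check for patterns like 1.2.3
		for _, r := range p {
			if unicode.IsDigit(r) || r == '.' {
				continue
			}
			return false
		}
		return true
	}
	return false
}

func main() {
	for _, tok := range []string{"1.", "iv.", "a:", "1.2.3:", "ii)", "copyright"} {
		fmt.Printf("%-10q header=%v\n", tok, isHeader(tok))
	}
}
```

Note that "ii)" is rejected: a marker ending in ')' falls through to the digit check, which matches the old code's concern about internal references like "(ii)" landing at the start of a line.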
 
-	norm := string(in)
-	norm = html.UnescapeString(norm)
-	norm = normalizePunctuation(norm)
-	norm, matches := removeIgnorableTexts(norm)
+// tokenizeStream reads bytes from src and produces an indexedDocument of its
+// content. tokenizeStream will never return an error of its own; it can only
+// return an error from the provided Reader. If the provided Reader never
+// returns an error, it is safe to assume that tokenizeStream will not return
+// an error.
+func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
+	const bufSize = 1024
+	// The longest UTF-8 encoded rune is 4 bytes, so we keep enough leftover bytes
+	// in the buffer to ensure we never run out of bytes trying to finish
+	// constructing a rune. These leftover 4 bytes will be copied to the start of
+	// the buffer before additional bytes are read.
+	tgt := bufSize - 4
 
-	if normWords {
-		norm = normalizeWords(norm)
-	}
-	return norm, matches
-}
+	rbuf := make([]byte, bufSize)
+	obuf := make([]byte, 0)
+	linebuf := make([]tokenID, 0)
+	idx := 0
+	line := 1 // 1-based count
+	deferredEOL := false
+	deferredWord := false
+	// The tokenizer uses a local dictionary to conserve memory while analyzing
+	// the input doc, to avoid polluting the global dictionary.
+	ld := newDictionary()
 
-func tokenize(in []byte) *document {
-	// tokenize produces a document from the input content.
-	text, matches := normalizeDoc(in, true)
-	return extractDoc(text, true, matches)
-}
+	var doc indexedDocument
 
-func extractDoc(text string, removeEol bool, matches Matches) *document {
-	var doc document
-	doc.Matches = matches
-	// Iterate on a line-by-line basis.
-	i := 0
-	pos := 0
-	for {
-		// Scan the text for the first likely textual content. The scan ignores punctuation
-		// artifacts that include visual boxes for layout as well as comment characters in
-		// source files.
-		firstInLine := true
-		var wid int
-		var r rune
-
-		if pos == len(text) {
-			break
-		}
+	isEOF := func(in error) bool {
+		return in == io.EOF || in == io.ErrUnexpectedEOF
+	}
 
-		next := func() {
-			r, wid = utf8.DecodeRuneInString(text[pos:])
-			pos += wid
+	// Read out the stream in chunks
+	for {
+		// Fill up the buffer with bytes to extract runes from.
+		// idx is the offset that holds any bytes left over from previous reads.
+		n, err := io.ReadFull(src, rbuf[idx:])
+		if isEOF(err) {
+			// There are no more bytes to read, so we must now consume all bytes
+			// in the buffer.
+			tgt = idx + n
+		} else if err != nil {
+			return nil, err
 		}
 
-		for pos < len(text) {
-			start := pos
-			next()
+		for idx = 0; idx < tgt; {
+			r, n := utf8.DecodeRune(rbuf[idx:])
+			idx += n
 
 			if r == '\n' {
-				doc.Tokens = append(doc.Tokens, &token{
-					Text: eol,
-					Line: i + 1})
-				i++
-			}
+				// Deal with the end of a line.
 
-			if !isSignificant(r) {
+				// If we are in a word (len(obuf) > 0) and the last rune is a '-',
+				// strike that rune and keep accumulating. Otherwise we treat it
+				// like a space and flush the word.
+
+				if len(obuf) > 0 {
+					if obuf[len(obuf)-1] == '-' {
+						obuf = obuf[0 : len(obuf)-1]
+						deferredEOL = true
+						continue
+					}
+
+					// Append the word fragment to the line buffer
+					linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+				}
+
+				// If there is something in the line to process, do so now
+				if len(linebuf) > 0 {
+					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+					linebuf = nil
+					obuf = nil
+				}
+				if !normalize {
+					tokID := dict.getIndex(eol)
+					if tokID == unknownIndex {
+						tokID = dict.add(eol)
+					}
+					doc.Tokens = append(doc.Tokens, indexedToken{
+						ID:   tokID,
+						Line: line})
+				}
+				line++
 				continue
 			}
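The branch above sets deferredEOL when a word ends in '-' at a line break, so a word wrapped across lines is stitched back together. A simplified illustration of that idea on its own, outside the streaming loop (joinHyphenated is a hypothetical helper, not part of this package):

```go
package main

import (
	"fmt"
	"strings"
)

// joinHyphenated strikes a trailing '-' at the end of a line and appends the
// next word fragment to it, mirroring the deferredEOL behavior above.
func joinHyphenated(lines []string) []string {
	var tokens []string
	carry := ""
	for _, line := range lines {
		words := strings.Fields(line)
		for i, w := range words {
			if carry != "" {
				w = carry + w // resume the hyphenated word
				carry = ""
			}
			if i == len(words)-1 && strings.HasSuffix(w, "-") {
				carry = strings.TrimSuffix(w, "-") // strike the hyphen, defer the word
				continue
			}
			tokens = append(tokens, w)
		}
	}
	if carry != "" {
		tokens = append(tokens, carry)
	}
	return tokens
}

func main() {
	fmt.Println(joinHyphenated([]string{"permission is here-", "by granted"}))
	// Output: [permission is hereby granted]
}
```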
-			// We're at a word/number character.
-			for pos < len(text) {
-				next()
-				if unicode.IsSpace(r) {
-					pos -= wid // Will skip this in outer loop
-					break
+			if len(obuf) == 0 {
+				if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
+					// A number or word character starts an interesting word.
+					// Now we slurp up all non-space runes and aggregate them as
+					// a single word.
+
+					// Buffer the initial token, normalizing to lower case if needed
+					if normalize {
+						r = unicode.ToLower(r)
+					}
+					obuf = utf8.AppendRune(obuf, r)
 				}
+				continue
 			}
 
-			if pos > start {
-				if start >= 2 && text[start-2] == '.' && text[start-1] == ' ' {
-					// Insert a "soft EOL" that helps detect header-looking entries that
-					// follow this text. This resolves problems with licenses that are a
-					// very long line of text, motivated by
-					// https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109
-					//
-					// Don't do this if the previous token was already an EOL
-					if len(doc.Tokens) > 0 && doc.Tokens[len(doc.Tokens)-1].Text != eol {
-						doc.Tokens = append(doc.Tokens, &token{
-							Text: eol,
-							Line: i + 1})
-					}
+			// At this point, len(obuf) > 0 and we are accumulating more runes
+			// to complete a word.
+			if unicode.IsSpace(r) {
+				// If we have a deferred EOL, we need to pick up a non-space
+				// character to resume the hyphenated word, so we just consume
+				// spaces until that happens.
+				if deferredEOL {
+					continue
 				}
 
-				tok := token{
-					Text: text[start:pos],
-					Line: i + 1,
+				// This is a space between word characters, so we assemble the
+				// word as a token and flush it out.
+				idx -= n
+
+				linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+				if deferredWord {
+					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+					linebuf = nil
+					deferredWord = false
+					// Increment the line count now so the remainder token is
+					// credited to the previous line number.
+					line++
 				}
-				if firstInLine {
-					// Store the prefix material, it is useful to discern some corner cases
-					tok.Previous = text[0:start]
+				obuf = make([]byte, 0)
+				continue
+			}
+
+			if deferredEOL {
+				deferredEOL = false
+				deferredWord = true
+			}
+			// Perform token mappings for punctuation to emulate
+			// normalizePunctuation. This returns a string, and each rune needs
+			// to be injected.
+			if rep, found := punctuationMappings[r]; found {
+				for _, t := range rep {
+					obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
 				}
-				doc.Tokens = append(doc.Tokens, &tok)
-				firstInLine = false
+				continue
 			}
+
+			// If it's not punctuation, lowercase and buffer the rune.
+			obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
+		}
+
+		// Break out if we have consumed all read bytes
+		if isEOF(err) {
+			break
 		}
+
+		// Copy the unconsumed bytes at the end of the buffer to the start
+		// of the buffer so the next read appends after them.
+		n = copy(rbuf, rbuf[idx:])
+		idx = n
+	}
+
+	// Process the remaining bytes in the buffer
+	if len(obuf) > 0 {
+		linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+	}
+	if len(linebuf) > 0 {
+		appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
 	}
 
-	doc.Tokens = cleanupTokens(doc.Tokens, removeEol)
-	return &doc
+	doc.dict = dict
+	doc.generateFrequencies()
+	doc.runes = diffWordsToRunes(&doc, 0, doc.size())
+	doc.Norm = doc.normalized()
+	return &doc, nil
 }
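The core buffering pattern in tokenizeStream — read fixed-size chunks with io.ReadFull, decode only up to four bytes short of the buffer end so a multi-byte rune split across reads can be completed, and copy the leftover tail to the front before the next read — can be exercised on its own. A self-contained sketch of that pattern follows; decodeStream is a hypothetical name, and the buffer is shrunk to 16 bytes (the real code uses 1024) so the refill path actually runs on short inputs:

```go
package main

import (
	"fmt"
	"io"
	"strings"
	"unicode/utf8"
)

// decodeStream decodes runes from src without ever splitting a multi-byte
// UTF-8 sequence across reads: it decodes up to len(rbuf)-4 bytes, then
// copies the unconsumed tail to the front of the buffer before refilling.
func decodeStream(src io.Reader) ([]rune, error) {
	const bufSize = 16 // deliberately tiny; the change above uses 1024
	rbuf := make([]byte, bufSize)
	tgt := bufSize - 4
	idx := 0
	var runes []rune
	for {
		n, err := io.ReadFull(src, rbuf[idx:])
		eof := err == io.EOF || err == io.ErrUnexpectedEOF
		if eof {
			tgt = idx + n // no more input: consume everything left in the buffer
		} else if err != nil {
			return nil, err
		}
		for idx = 0; idx < tgt; {
			r, w := utf8.DecodeRune(rbuf[idx:])
			idx += w
			runes = append(runes, r)
		}
		if eof {
			return runes, nil
		}
		idx = copy(rbuf, rbuf[idx:]) // keep leftover bytes for the next read
	}
}

func main() {
	runes, _ := decodeStream(strings.NewReader("naïve — déjà vu ©"))
	fmt.Println(string(runes))
}
```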
-func cleanupTokens(in []*token, removeEol bool) []*token {
-	// This routine performs sanitization of tokens. If it is a header-looking
-	// token (but not a version number) starting a line, it is removed.
-	// Hyphenated words are reassembled.
-	partialWord := ""
-	var out []*token
-	tokIdx := 0
-	firstInLine := true
-	for i, tok := range in {
-		if firstInLine && header(tok) {
+func appendToDoc(doc *indexedDocument, dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool, linebuf []tokenID) {
+	tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
+	if tokens != nil {
+		doc.Tokens = append(doc.Tokens, tokens...)
+	} else if m != nil {
+		doc.Matches = append(doc.Matches, m)
+	}
+}
+
+func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
+	if len(in) == 0 {
+		return nil, nil
+	}
+	var sb strings.Builder
+	for i, r := range in {
+		out := ld.getWord(r)
+		if out == "" {
 			continue
 		}
-		if tok.Text == eol {
-			firstInLine = true
-			if removeEol {
-				continue
-			}
-			// If we are reconstructing a hyphenated word, don't append the EOL
-			// now, do it when the word is reconstructed.
-			if partialWord == "" {
-				out = append(out, &token{Text: eol, Line: tok.Line})
-				tokIdx++
-			}
-			continue
+		sb.WriteString(out)
+		if i < len(in)-1 {
+			sb.WriteByte(' ')
 		}
-		firstInLine = false
-		t := cleanupToken(tok.Text)
-		// If this is the last token in a line, and it looks like a hyphenated
-		// word, store it for reassembly.
-		if strings.HasSuffix(tok.Text, "-") && i+1 < len(in) && in[i+1].Text == eol {
-			partialWord = t
-		} else if partialWord != "" {
-			// Repair hyphenated words
-			tp := in[i-1]
-			tp.Text = partialWord + t
-			tp.Previous = ""
-			out = append(out, tp)
-			tokIdx++
-			if !removeEol {
-				// Append the EOL now that the whole word is recovered
-				out = append(out, &token{Text: eol, Line: tp.Line})
-				tokIdx++
-			}
+	}
 
-			partialWord = ""
-		} else {
-			tok.Text = t
-			tok.Previous = ""
-			out = append(out, tok)
-			tokIdx++
+	out := sb.String()
+
+	for _, re := range ignorableTexts {
+		if re.MatchString(out) {
+			return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
 		}
 	}
-	return out
-}
 
-// interchangeablePunctutation is punctuation that can be normalized.
-var interchangeablePunctuation = []struct {
-	interchangeable string
-	substitute      string
-}{
-	// Hyphen, Dash, En Dash, and Em Dash.
-	{`-‒–—‐`, "-"},
-	// Single, Double, Curly Single, and Curly Double.
-	{"'\"`‘’“”", "'"},
-	// Copyright.
-	{"©", "(c)"},
-	// Currency and Section. (Different copies of the CDDL use each marker.)
-	{"§¤", "(s)"},
-	// Middle Dot
-	{"·", " "},
-	{"*", " "},
-}
-
-// normalizePunctuation takes all hyphens and quotes and normalizes them.
-func normalizePunctuation(s string) string {
-	for _, iw := range interchangeablePunctuation {
-		for _, in := range strings.Split(iw.interchangeable, "") {
-			s = strings.ReplaceAll(s, in, iw.substitute)
+	var tokens []indexedToken
+	for i, r := range in {
+		txt := cleanupToken(i, ld.getWord(r), normalize)
+		if txt != "" {
+			var tokID tokenID
+			if updateDict {
+				tokID = dict.add(txt)
+			} else {
+				tokID = dict.getIndex(txt)
+			}
+			tokens = append(tokens, indexedToken{
+				Line: line,
+				ID:   tokID,
+			})
 		}
 	}
-	return s
+
+	return tokens, nil
 }
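stringifyLineBuf assembles the whole (already lowercased and normalized) line and, if it matches one of the ignorableTexts patterns, emits a Copyright match instead of tokens. The patterns can be tried in isolation; this sketch copies the three regexps from above into a standalone program with a hypothetical ignorable helper:

```go
package main

import (
	"fmt"
	"regexp"
)

// The same patterns the tokenizer uses to treat a line as a copyright
// notice rather than license text.
var ignorableTexts = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}

func ignorable(line string) bool {
	for _, re := range ignorableTexts {
		if re.MatchString(line) {
			return true
		}
	}
	return false
}

func main() {
	for _, l := range []string{
		"copyright 2021 the authors",
		"copyright (c) [yyyy] [name of copyright owner]",
		"2021-03-15",
		"licensed under the apache license",
	} {
		fmt.Printf("%-48q ignorable=%v\n", l, ignorable(l))
	}
}
```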
-// interchangeableWords are words we can substitute for a normalized form
-// without changing the meaning of the license. See
-// https://spdx.org/spdx-license-list/matching-guidelines for the list.
-var interchangeableWords = []struct {
-	interchangeable *regexp.Regexp
-	substitute      string
-}{
-	{regexp.MustCompile("acknowledgement"), "acknowledgment"},
-	{regexp.MustCompile("analogue"), "analog"},
-	{regexp.MustCompile("analyse"), "analyze"},
-	{regexp.MustCompile("artefact"), "artifact"},
-	{regexp.MustCompile("authorisation"), "authorization"},
-	{regexp.MustCompile("authorised"), "authorized"},
-	{regexp.MustCompile("calibre"), "caliber"},
-	{regexp.MustCompile("cancelled"), "canceled"},
-	{regexp.MustCompile("capitalisations"), "capitalizations"},
-	{regexp.MustCompile("catalogue"), "catalog"},
-	{regexp.MustCompile("categorise"), "categorize"},
-	{regexp.MustCompile("centre"), "center"},
-	{regexp.MustCompile("emphasised"), "emphasized"},
-	{regexp.MustCompile("favour"), "favor"},
-	{regexp.MustCompile("favourite"), "favorite"},
-	{regexp.MustCompile("fulfil\\b"), "fulfill"},
-	{regexp.MustCompile("fulfilment"), "fulfillment"},
-	{regexp.MustCompile("https"), "http"},
-	{regexp.MustCompile("initialise"), "initialize"},
-	{regexp.MustCompile("judgment"), "judgement"},
-	{regexp.MustCompile("labelling"), "labeling"},
-	{regexp.MustCompile("labour"), "labor"},
-	{regexp.MustCompile("licence"), "license"},
-	{regexp.MustCompile("maximise"), "maximize"},
-	{regexp.MustCompile("modelled"), "modeled"},
-	{regexp.MustCompile("modelling"), "modeling"},
-	{regexp.MustCompile("offence"), "offense"},
-	{regexp.MustCompile("optimise"), "optimize"},
-	{regexp.MustCompile("organisation"), "organization"},
-	{regexp.MustCompile("organise"), "organize"},
-	{regexp.MustCompile("practise"), "practice"},
-	{regexp.MustCompile("programme"), "program"},
-	{regexp.MustCompile("realise"), "realize"},
-	{regexp.MustCompile("recognise"), "recognize"},
-	{regexp.MustCompile("signalling"), "signaling"},
-	{regexp.MustCompile("sub[ -]license"), "sublicense"},
-	{regexp.MustCompile("utilisation"), "utilization"},
-	{regexp.MustCompile("whilst"), "while"},
-	{regexp.MustCompile("wilful"), "wilfull"},
-	{regexp.MustCompile("non[ -]commercial"), "noncommercial"},
-	{regexp.MustCompile("per cent"), "percent"},
+func normalizeToken(in string) string {
+	// This performs some preprocessing on the token. It is different from
+	// cleanupToken in that the fixups here are not an exact match on the
+	// token. Normalizing URLs from https to http is an example of a fix
+	// applied here.
+	return strings.ReplaceAll(in, "https", "http")
 }
 
-// normalizeWords remaps equivalent words that are interchangeable and lowercases
-// the word to allow for exact matching.
-func normalizeWords(s string) string {
-	s = strings.ToLower(s)
-	for _, iw := range interchangeableWords {
-		s = iw.interchangeable.ReplaceAllString(s, iw.substitute)
-	}
-	return s
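Together, flushBuf (below) and normalizeToken give each buffered word two inexact fixups: HTML entities are unescaped wherever they occur in the word, and https URLs are collapsed to http so both spellings yield the same token. A small sketch of that pipeline, with a hypothetical normalizeWordBuf standing in for the flushBuf/normalizeToken pair:

```go
package main

import (
	"fmt"
	"html"
	"strings"
)

// normalizeWordBuf mimics the flushBuf/normalizeToken pipeline: unescape HTML
// entities anywhere in the word, then apply inexact fixups such as mapping
// https to http.
func normalizeWordBuf(word string) string {
	word = html.UnescapeString(word)
	return strings.ReplaceAll(word, "https", "http")
}

func main() {
	fmt.Println(normalizeWordBuf("https://example.com")) // http://example.com
	fmt.Println(normalizeWordBuf("ben&amp;jerry"))        // ben&jerry
	// &nbsp; decodes to U+00A0 (a non-breaking space), not an ASCII space:
	fmt.Printf("%q\n", normalizeWordBuf("copyright&nbsp;2021"))
}
```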
+func flushBuf(pos int, obuf []byte, normalizeWord bool, ld *dictionary) tokenID {
+	// Clean up the contents of the rune buffer.
+	token := string(obuf)
+	// Escape sequences can occur anywhere in the string, not just the
+	// beginning, so always attempt to unescape the word's content.
+	token = html.UnescapeString(token)
+
+	clean := normalizeToken(token)
+
+	return ld.add(clean)
 }
 
-func header(tok *token) bool {
-	in := tok.Text
-	p, e := in[:len(in)-1], in[len(in)-1]
-	switch e {
-	case '.', ':', ')':
-		if listMarker[p] {
-			if e != ')' {
-				return true
+func cleanupToken(pos int, in string, normalizeWord bool) string {
+	r, _ := utf8.DecodeRuneInString(in)
+	var out strings.Builder
+	if pos == 0 && header(in) {
+		return ""
+	}
+
+	if !unicode.IsLetter(r) {
+		if unicode.IsDigit(r) {
+			// Based on analysis of the license corpus, the characters that are
+			// significant are numbers, periods, and dashes. Anything else can be
+			// safely discarded, and helps avoid matching failures due to
+			// inconsistent whitespacing and formatting.
+			for _, c := range in {
+				if unicode.IsDigit(c) || c == '.' || c == '-' {
+					out.WriteRune(c)
+				}
 			}
-			// Sometimes an internal reference like "(ii)" from NPL-1.02.txt
-			// endds up at the beginning of a line. In that case, it's
-			// not actually a header.
-			if e == ')' && !strings.HasSuffix(tok.Previous, "(") {
-				return true
+
+			// Numbers should not end in a . since that doesn't indicate a version
+			// number, but usually an end of a line.
+			res := out.String()
+			for strings.HasSuffix(res, ".") {
+				res = res[0 : len(res)-1]
 			}
+			return res
 		}
-		// Check for patterns like 1.2.3
-		for _, r := range p {
-			if unicode.IsDigit(r) || r == '.' {
-				continue
-			}
-			return false
+	}
+
+	// Remove internal hyphenization or URL constructs to better normalize
+	// strings for matching.
+
+	for _, c := range in {
+		if unicode.IsLetter(c) {
+			out.WriteRune(c)
 		}
-		return true
 	}
-	return false
-}
 
-var listMarker = func() map[string]bool {
-	const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
-	l := map[string]bool{}
-	for _, marker := range strings.Split(allListMarkers, " ") {
-		l[marker] = true
+	tok := out.String()
+	if !normalizeWord {
+		return tok
 	}
-	return l
-}()
 
-// ignorableTexts is a list of lines at the start of the string we can remove
-// to get a cleaner match.
-var ignorableTexts = []*regexp.Regexp{
-	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
-	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
-	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
+	if iw, ok := interchangeableWords[tok]; ok && normalizeWord {
+		return iw
+	}
+	return tok
 }
 
-// removeIgnorableTexts removes common text, which is not important for
-// classification
-func removeIgnorableTexts(s string) (string, Matches) {
-	var out []string
-	var matches Matches
-	lines := strings.Split(s, "\n")
-	for i, l := range lines {
-		line := strings.TrimSpace(l)
-		var match bool
-		for _, re := range ignorableTexts {
-			if re.MatchString(line) {
-				match = true
-			}
-		}
-		if !match {
-			out = append(out, l)
-		} else {
-			// We want to preserve line presence for the positional information
-			out = append(out, "")
-			matches = append(matches, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: i + 1, EndLine: i + 1})
-		}
-	}
-	return strings.Join(out, "\n"), matches
+var interchangeableWords = map[string]string{
+	"analyse":         "analyze",
+	"artefact":        "artifact",
+	"authorisation":   "authorization",
+	"authorised":      "authorized",
+	"calibre":         "caliber",
+	"cancelled":       "canceled",
+	"capitalisations": "capitalizations",
+	"catalogue":       "catalog",
+	"categorise":      "categorize",
+	"centre":          "center",
+	"emphasised":      "emphasized",
+	"favour":          "favor",
+	"favourite":       "favorite",
+	"fulfil":          "fulfill",
+	"fulfilment":      "fulfillment",
+	"https":           "http",
+	"initialise":      "initialize",
+	"judgment":        "judgement",
+	"labelling":       "labeling",
+	"labour":          "labor",
+	"licence":         "license",
+	"maximise":        "maximize",
+	"modelled":        "modeled",
+	"modelling":       "modeling",
+	"offence":         "offense",
+	"optimise":        "optimize",
+	"organisation":    "organization",
+	"organise":        "organize",
+	"practise":        "practice",
+	"programme":       "program",
+	"realise":         "realize",
+	"recognise":       "recognize",
+	"signalling":      "signaling",
+	"utilisation":     "utilization",
+	"whilst":          "while",
+	"wilful":          "wilfull",
+	// TODO: These three need tokenizer magic
+	"non commercial": "noncommercial",
+	"per cent":       "percent",
+	"sub license":    "sublicense",
+}
+
+var punctuationMappings = map[rune]string{
+	'-': "-",
+	'‒': "-",
+	'–': "-",
+	'—': "-",
+	'‐': "-",
+	'©': "(c)",
+	'§': "(s)",
+	'¤': "(s)",
+	'·': " ",
+	'*': " ",
 }
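To see how cleanupToken interacts with the two tables above, here is a condensed, self-contained sketch of the per-token cleanup path. cleanToken is a hypothetical standalone variant: it omits the pos/header check and the normalizeWord flag, and the tables are trimmed to a few entries; the real code also threads tokens through the local and global dictionaries.

```go
package main

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

var interchangeableWords = map[string]string{
	"licence": "license", "organisation": "organization", "optimise": "optimize",
}

var punctuationMappings = map[rune]string{
	'‒': "-", '–': "-", '—': "-", '©': "(c)", '·': " ", '*': " ",
}

// cleanToken mirrors cleanupToken: tokens led by a digit keep only digits,
// periods, and dashes (with trailing periods stripped); other tokens keep
// letters only, then get folded through the interchangeableWords table.
func cleanToken(in string) string {
	r, _ := utf8.DecodeRuneInString(in)
	var out strings.Builder
	if !unicode.IsLetter(r) && unicode.IsDigit(r) {
		for _, c := range in {
			if unicode.IsDigit(c) || c == '.' || c == '-' {
				out.WriteRune(c)
			}
		}
		res := out.String()
		for strings.HasSuffix(res, ".") {
			res = res[:len(res)-1]
		}
		return res
	}
	for _, c := range in {
		if unicode.IsLetter(c) {
			out.WriteRune(c)
		}
	}
	tok := out.String()
	if iw, ok := interchangeableWords[tok]; ok {
		return iw
	}
	return tok
}

func main() {
	fmt.Println(cleanToken("2.0."))                // 2.0 (version kept, trailing dot dropped)
	fmt.Println(cleanToken("licence,"))            // license (punctuation stripped, spelling folded)
	fmt.Println(punctuationMappings['©'])          // (c)
}
```

The punctuationMappings table replaces the old string-rewriting normalizePunctuation pass: the mapping is now applied rune by rune while the word is being buffered, which is what the punctuation branch inside tokenizeStream does.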