Diffstat (limited to 'v2/tokenizer.go')
-rw-r--r-- | v2/tokenizer.go | 417
1 file changed, 417 insertions, 0 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
new file mode 100644
index 0000000..607b0d4
--- /dev/null
+++ b/v2/tokenizer.go
@@ -0,0 +1,417 @@
+// Copyright 2020 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package classifier
+
+import (
+    "html"
+    "io"
+    "regexp"
+    "strings"
+    "unicode"
+    "unicode/utf8"
+)
+
+var eol = "\n"
+
+func header(in string) bool {
+    if len(in) == 0 {
+        return false
+    }
+    p, e := in[:len(in)-1], in[len(in)-1]
+    switch e {
+    case '.', ':', ')':
+        if listMarker[p] {
+            if e != ')' {
+                return true
+            }
+        }
+        // Check for patterns like 1.2.3
+        for _, r := range p {
+            if unicode.IsDigit(r) || r == '.' {
+                continue
+            }
+            return false
+        }
+        return true
+    }
+    return false
+}
+
+var listMarker = func() map[string]bool {
+    const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
+    l := map[string]bool{}
+    for _, marker := range strings.Split(allListMarkers, " ") {
+        l[marker] = true
+    }
+    return l
+}()
+
+// ignorableTexts is a list of lines at the start of the string that we can
+// remove to get a cleaner match.
+var ignorableTexts = []*regexp.Regexp{
+    regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
+    regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
+    regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
+}
+
+// tokenizeStream reads bytes from src and produces an indexedDocument of its
+// content. tokenizeStream never returns an error of its own; it can only
+// return an error from the provided Reader. If the provided Reader never
+// returns an error, it is safe to assume that tokenizeStream will not return
+// an error.
+func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
+    const bufSize = 1024
+    // The longest UTF-8 encoded rune is 4 bytes, so we keep enough leftover
+    // bytes in the buffer to ensure we never run out of bytes trying to
+    // finish constructing a rune. These leftover 4 bytes will be copied to
+    // the start of the buffer before additional bytes are read.
+    tgt := bufSize - 4
+
+    rbuf := make([]byte, bufSize)
+    obuf := make([]byte, 0)
+    linebuf := make([]tokenID, 0)
+    idx := 0
+    line := 1 // 1-based count
+    deferredEOL := false
+    deferredWord := false
+    // The tokenizer uses a local dictionary to conserve memory while
+    // analyzing the input doc, avoiding pollution of the global dictionary.
+    ld := newDictionary()
+
+    var doc indexedDocument
+
+    isEOF := func(in error) bool {
+        return in == io.EOF || in == io.ErrUnexpectedEOF
+    }
+
+    // Read out the stream in chunks.
+    for {
+        // Fill up the buffer with bytes to extract runes from. idx is the
+        // offset past any bytes left over from the previous read.
+        n, err := io.ReadFull(src, rbuf[idx:])
+        if isEOF(err) {
+            // There are no more bytes to read, so we must now consume all
+            // bytes remaining in the buffer.
+            tgt = idx + n
+        } else if err != nil {
+            return nil, err
+        }
+
+        for idx = 0; idx < tgt; {
+            r, n := utf8.DecodeRune(rbuf[idx:])
+            idx += n
+
+            if r == '\n' {
+                // Deal with the end of a line.
+
+                // If we are in a word (len(obuf) > 0) and the last rune is a
+                // '-', strike that rune and keep accumulating. Otherwise we
+                // treat the newline like a space and flush the word.
+                if len(obuf) > 0 {
+                    if obuf[len(obuf)-1] == '-' {
+                        obuf = obuf[0 : len(obuf)-1]
+                        deferredEOL = true
+                        continue
+                    }
+
+                    // Append the word fragment to the line buffer.
+                    linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+                }
+
+                // If there is something in the line to process, do so now.
+                if len(linebuf) > 0 {
+                    appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+                    linebuf = nil
+                    obuf = nil
+                }
+                if !normalize {
+                    tokID := dict.getIndex(eol)
+                    if tokID == unknownIndex {
+                        tokID = dict.add(eol)
+                    }
+                    doc.Tokens = append(doc.Tokens, indexedToken{
+                        ID:   tokID,
+                        Line: line})
+                }
+                line++
+                continue
+            }
+
+            if len(obuf) == 0 {
+                if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
+                    // A number or word character starts an interesting word.
+                    // Now we slurp up all non-space runes and aggregate them
+                    // as a single word.
+
+                    // Buffer the initial rune, normalizing to lower case if
+                    // needed.
+                    if normalize {
+                        r = unicode.ToLower(r)
+                    }
+                    obuf = utf8.AppendRune(obuf, r)
+                }
+                continue
+            }
+
+            // At this point, len(obuf) > 0 and we are accumulating more runes
+            // to complete a word.
+            if unicode.IsSpace(r) {
+                // If we have a deferred EOL, we need to pick up a non-space
+                // character to resume the hyphenated word, so we just consume
+                // spaces until that happens.
+                if deferredEOL {
+                    continue
+                }
+
+                // This is a space between word characters, so we assemble the
+                // word as a token and flush it out.
+                idx -= n
+
+                linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+                if deferredWord {
+                    appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+                    linebuf = nil
+                    deferredWord = false
+                    // Increment the line count now so the remainder token is
+                    // credited to the previous line number.
+                    line++
+                }
+                obuf = make([]byte, 0)
+                continue
+            }
+
+            if deferredEOL {
+                deferredEOL = false
+                deferredWord = true
+            }
+            // Perform token mappings for punctuation to emulate
+            // normalizePunctuation. The mapping produces a string, and each
+            // of its runes needs to be injected.
+            if rep, found := punctuationMappings[r]; found {
+                for _, t := range rep {
+                    obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
+                }
+                continue
+            }
+
+            // If it's not punctuation, lowercase and buffer the rune.
+            obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
+        }
+
+        // Break out if we have consumed all read bytes.
+        if isEOF(err) {
+            break
+        }
+
+        // Copy the unconsumed bytes at the end of the buffer to the start
+        // of the buffer so the next read appends after them.
+        n = copy(rbuf, rbuf[idx:])
+        idx = n
+    }
+
+    // Process the remaining bytes in the buffer.
+    if len(obuf) > 0 {
+        linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+    }
+    if len(linebuf) > 0 {
+        appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+    }
+
+    doc.dict = dict
+    doc.generateFrequencies()
+    doc.runes = diffWordsToRunes(&doc, 0, doc.size())
+    doc.Norm = doc.normalized()
+    return &doc, nil
+}
+
+func appendToDoc(doc *indexedDocument, dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool, linebuf []tokenID) {
+    tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
+    if tokens != nil {
+        doc.Tokens = append(doc.Tokens, tokens...)
+    } else if m != nil {
+        doc.Matches = append(doc.Matches, m)
+    }
+}
+
+func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
+    if len(in) == 0 {
+        return nil, nil
+    }
+    var sb strings.Builder
+    for i, r := range in {
+        out := ld.getWord(r)
+        if out == "" {
+            continue
+        }
+        sb.WriteString(out)
+        if i < len(in)-1 {
+            sb.WriteByte(' ')
+        }
+    }
+
+    out := sb.String()
+
+    for _, re := range ignorableTexts {
+        if re.MatchString(out) {
+            return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
+        }
+    }
+
+    var tokens []indexedToken
+    for i, r := range in {
+        txt := cleanupToken(i, ld.getWord(r), normalize)
+        if txt != "" {
+            var tokID tokenID
+            if updateDict {
+                tokID = dict.add(txt)
+            } else {
+                tokID = dict.getIndex(txt)
+            }
+            tokens = append(tokens, indexedToken{
+                Line: line,
+                ID:   tokID,
+            })
+        }
+    }
+
+    return tokens, nil
+}
+
+func normalizeToken(in string) string {
+    // This performs some preprocessing on the token. It is different from
+    // cleanupToken in that the fixups here are not an exact match on the
+    // token. Normalizing URLs from https to http is an example of a fix
+    // applied here.
+    return strings.ReplaceAll(in, "https", "http")
+}
+
+func flushBuf(pos int, obuf []byte, normalizeWord bool, ld *dictionary) tokenID {
+    // Clean up the contents of the rune buffer.
+    token := string(obuf)
+    // Escape sequences can occur anywhere in the string, not just at the
+    // beginning, so always attempt to unescape the word's content.
+    token = html.UnescapeString(token)
+
+    clean := normalizeToken(token)
+
+    return ld.add(clean)
+}
+
+func cleanupToken(pos int, in string, normalizeWord bool) string {
+    r, _ := utf8.DecodeRuneInString(in)
+    var out strings.Builder
+    if pos == 0 && header(in) {
+        return ""
+    }
+
+    if !unicode.IsLetter(r) {
+        if unicode.IsDigit(r) {
+            // Based on analysis of the license corpus, the characters that
+            // are significant are numbers, periods, and dashes. Anything else
+            // can be safely discarded, which helps avoid matching failures
+            // due to inconsistent whitespace and formatting.
+            for _, c := range in {
+                if unicode.IsDigit(c) || c == '.' || c == '-' {
+                    out.WriteRune(c)
+                }
+            }
+
+            // Numbers should not end in a '.' since that usually indicates
+            // the end of a line rather than a version number.
+            res := out.String()
+            for strings.HasSuffix(res, ".") {
+                res = res[0 : len(res)-1]
+            }
+            return res
+        }
+    }
+
+    // Remove internal hyphenation or URL constructs to better normalize
+    // strings for matching.
+    for _, c := range in {
+        if unicode.IsLetter(c) {
+            out.WriteRune(c)
+        }
+    }
+
+    tok := out.String()
+    if !normalizeWord {
+        return tok
+    }
+
+    if iw, ok := interchangeableWords[tok]; ok && normalizeWord {
+        return iw
+    }
+    return tok
+}
+
+var interchangeableWords = map[string]string{
+    "analyse":         "analyze",
+    "artefact":        "artifact",
+    "authorisation":   "authorization",
+    "authorised":      "authorized",
+    "calibre":         "caliber",
+    "cancelled":       "canceled",
+    "capitalisations": "capitalizations",
+    "catalogue":       "catalog",
+    "categorise":      "categorize",
+    "centre":          "center",
+    "emphasised":      "emphasized",
+    "favour":          "favor",
+    "favourite":       "favorite",
+    "fulfil":          "fulfill",
+    "fulfilment":      "fulfillment",
+    "https":           "http",
+    "initialise":      "initialize",
+    "judgment":        "judgement",
+    "labelling":       "labeling",
+    "labour":          "labor",
+    "licence":         "license",
+    "maximise":        "maximize",
+    "modelled":        "modeled",
+    "modelling":       "modeling",
+    "offence":         "offense",
+    "optimise":        "optimize",
+    "organisation":    "organization",
+    "organise":        "organize",
+    "practise":        "practice",
+    "programme":       "program",
+    "realise":         "realize",
+    "recognise":       "recognize",
+    "signalling":      "signaling",
+    "utilisation":     "utilization",
+    "whilst":          "while",
+    "wilful":          "wilfull",
+    // TODO: These three need tokenizer magic
+    "non commercial": "noncommercial",
+    "per cent":       "percent",
+    "sub license":    "sublicense",
+}
+
+var punctuationMappings = map[rune]string{
+    '-': "-",
+    '‒': "-",
+    '–': "-",
+    '—': "-",
+    '‐': "-",
+    '©': "(c)",
+    '§': "(s)",
+    '¤': "(s)",
+    '·': " ",
+    '*': " ",
+}
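A note on the buffered reading scheme: tokenizeStream reads fixed-size chunks but only decodes runes up to bufSize-4, carrying the last few bytes over to the next read so a multi-byte UTF-8 rune split across a chunk boundary still decodes correctly. The standalone sketch below (readRunes is my own name, not part of this change) shows the same leftover-byte technique in isolation.

package main

import (
    "fmt"
    "io"
    "strings"
    "unicode/utf8"
)

// readRunes decodes runes from r in fixed-size chunks, carrying up to
// utf8.UTFMax-1 leftover bytes between reads so multi-byte runes split
// across a chunk boundary decode correctly. A truncated rune at the very
// end of the stream decodes as utf8.RuneError, matching DecodeRune's
// behavior on invalid input.
func readRunes(r io.Reader) ([]rune, error) {
    const bufSize = 16
    buf := make([]byte, bufSize)
    var runes []rune
    idx := 0                           // leftover bytes at the start of buf
    tgt := bufSize - (utf8.UTFMax - 1) // decode only up to here mid-stream
    for {
        n, err := io.ReadFull(r, buf[idx:])
        end := bufSize
        eof := err == io.EOF || err == io.ErrUnexpectedEOF
        if eof {
            // Consume everything that remains in the buffer.
            tgt = idx + n
            end = tgt
        } else if err != nil {
            return nil, err
        }
        for idx = 0; idx < tgt; {
            // Runes starting before tgt may complete using the reserve
            // bytes between tgt and end.
            ru, sz := utf8.DecodeRune(buf[idx:end])
            runes = append(runes, ru)
            idx += sz
        }
        if eof {
            return runes, nil
        }
        // Shift unconsumed bytes to the front for the next read.
        idx = copy(buf, buf[idx:])
    }
}

func main() {
    rs, _ := readRunes(strings.NewReader("héllo wörld — ¤ ©"))
    fmt.Println(string(rs))
}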
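The deferredEOL/deferredWord flags implement hyphenated-word rejoining: a word that ends in '-' at a line break is stitched together with the fragment that starts the next line, and the resulting token is credited to the earlier line. The tokenizer does this incrementally as it streams; the following self-contained sketch (rejoinHyphenated is a hypothetical helper of mine) shows the equivalent whole-string transformation.

package main

import (
    "fmt"
    "regexp"
)

// hyphenBreak matches a hyphen at the end of a line plus any leading
// whitespace on the continuation line, mirroring how the tokenizer strikes
// the '-' and consumes spaces until the word resumes.
var hyphenBreak = regexp.MustCompile(`-\n\s*`)

func rejoinHyphenated(s string) string {
    return hyphenBreak.ReplaceAllString(s, "")
}

func main() {
    in := "This license is non-\ntransferable and sub-\n   licensable."
    fmt.Println(rejoinHyphenated(in))
    // Output: This license is nontransferable and sublicensable.
}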
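cleanupToken's digit branch is worth illustrating: for a token that starts with a digit, only digits, periods, and dashes survive, and trailing periods are stripped because they mark the end of a sentence rather than a version number. A minimal sketch of that branch on its own (cleanNumber is my name for it):

package main

import (
    "fmt"
    "strings"
    "unicode"
)

// cleanNumber mirrors cleanupToken's handling of digit-led tokens: keep
// only digits, '.', and '-', then strip trailing periods.
func cleanNumber(in string) string {
    var out strings.Builder
    for _, c := range in {
        if unicode.IsDigit(c) || c == '.' || c == '-' {
            out.WriteRune(c)
        }
    }
    res := out.String()
    for strings.HasSuffix(res, ".") {
        res = strings.TrimSuffix(res, ".")
    }
    return res
}

func main() {
    fmt.Println(cleanNumber("2.0,"))   // 2.0
    fmt.Println(cleanNumber("1.2.3.")) // 1.2.3
}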
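Finally, a hypothetical example of driving the tokenizer end to end. Since tokenizeStream, newDictionary, and indexedDocument are all unexported, a caller has to live inside package classifier; this is a sketch of a test that assumes only the API shown in this diff.

package classifier

import (
    "strings"
    "testing"
)

// TestTokenizeStreamSketch is illustrative only: it tokenizes a small
// normalized input and checks that a non-empty token stream comes back.
func TestTokenizeStreamSketch(t *testing.T) {
    dict := newDictionary()
    doc, err := tokenizeStream(
        strings.NewReader("Licensed under the Apache License,\nVersion 2.0\n"),
        true /* normalize */, dict, true /* updateDict */)
    if err != nil {
        t.Fatalf("tokenizeStream: %v", err)
    }
    if len(doc.Tokens) == 0 {
        t.Fatal("expected a non-empty token stream")
    }
}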