// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"html"
	"io"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// eol is the token recorded for end-of-line markers when the document is not
// being normalized.
var eol = "\n"

// header reports whether the token looks like a list marker or section
// header, such as "ii.", "a:", or "1.2.3", so that cleanupToken can drop it
// when it appears at the start of a line.
func header(in string) bool {
	if len(in) == 0 {
		return false
	}
	p, e := in[:len(in)-1], in[len(in)-1]
	switch e {
	case '.', ':', ')':
		if listMarker[p] {
			if e != ')' {
				return true
			}
		}
		// Check for patterns like 1.2.3
		for _, r := range p {
			if unicode.IsDigit(r) || r == '.' {
				continue
			}
			return false
		}
		return true
	}
	return false
}

// listMarker holds the single-letter and roman-numeral tokens that identify
// list items.
var listMarker = func() map[string]bool {
	const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
	l := map[string]bool{}
	for _, marker := range strings.Split(allListMarkers, " ") {
		l[marker] = true
	}
	return l
}()

// ignorableTexts is a list of lines at the start of the string we can remove
// to get a cleaner match.
var ignorableTexts = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}

// tokenizeStream reads bytes from src and produces an indexedDocument of its
// content. tokenizeStream will never return an error of its own; it can only
// return an error from the provided Reader. If the provided Reader never
// returns an error, it is safe to assume that tokenizeStream will not return
// an error.
func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
	const bufSize = 1024
	// The longest UTF-8 encoded rune is 4 bytes, so we keep enough leftover
	// bytes in the buffer to ensure we never run out of bytes trying to finish
	// constructing a rune. These leftover 4 bytes will be copied to the start
	// of the buffer before additional bytes are read.
	tgt := bufSize - 4

	rbuf := make([]byte, bufSize)
	obuf := make([]byte, 0)
	linebuf := make([]tokenID, 0)
	idx := 0
	line := 1 // 1-based line count
	deferredEOL := false
	deferredWord := false

	// The tokenizer uses a local dictionary to conserve memory while analyzing
	// the input doc and to avoid polluting the global dictionary.
	ld := newDictionary()

	var doc indexedDocument

	isEOF := func(in error) bool {
		return in == io.EOF || in == io.ErrUnexpectedEOF
	}

	// Read out the stream in chunks.
	for {
		// Fill up the buffer with bytes to extract runes from.
		// idx is the offset holding any bytes left over from previous reads.
		n, err := io.ReadFull(src, rbuf[idx:])
		if isEOF(err) {
			// There are no more bytes to read, so we must now consume all
			// bytes in the buffer.
			tgt = idx + n
		} else if err != nil {
			return nil, err
		}

		for idx = 0; idx < tgt; {
			r, n := utf8.DecodeRune(rbuf[idx:])
			idx += n

			if r == '\n' {
				// Deal with an end-of-line rune.
				// If we are in a word (len(obuf) > 0) and the last rune is a
				// '-', strike that rune and keep accumulating.
				// Otherwise, treat it like a space and flush the word.
				if len(obuf) > 0 {
					if obuf[len(obuf)-1] == '-' {
						obuf = obuf[0 : len(obuf)-1]
						deferredEOL = true
						continue
					}
					// Append the word fragment to the line buffer.
					linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
				}

				// If there is something in the line to process, do so now.
				if len(linebuf) > 0 {
					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
					linebuf = nil
					obuf = nil
				}

				if !normalize {
					tokID := dict.getIndex(eol)
					if tokID == unknownIndex {
						tokID = dict.add(eol)
					}
					doc.Tokens = append(doc.Tokens, indexedToken{
						ID:   tokID,
						Line: line})
				}
				line++
				continue
			}

			if len(obuf) == 0 {
				if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
					// A number or word character starts an interesting word.
					// Now we slurp up all non-space runes and aggregate them
					// as a single word.
					// Buffer the initial rune, normalizing to lower case if
					// needed.
					if normalize {
						r = unicode.ToLower(r)
					}
					obuf = utf8.AppendRune(obuf, r)
				}
				continue
			}

			// At this point, len(obuf) > 0 and we are accumulating more runes
			// to complete a word.
			if unicode.IsSpace(r) {
				// If we have a deferred EOL, we need to pick up a non-space
				// character to resume the hyphenated word, so we just consume
				// spaces until that happens.
				if deferredEOL {
					continue
				}

				// This is a space between word characters, so we assemble the
				// word as a token and flush it out.
				idx -= n
				linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))

				if deferredWord {
					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
					linebuf = nil
					deferredWord = false
					// Increment the line count now so the remainder token is
					// credited to the previous line number.
					line++
				}
				obuf = make([]byte, 0)
				continue
			}

			if deferredEOL {
				deferredEOL = false
				deferredWord = true
			}

			// Perform token mappings for punctuation to emulate
			// normalizePunctuation. This returns a string, and each of its
			// runes needs to be injected.
			if rep, found := punctuationMappings[r]; found {
				for _, t := range rep {
					obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
				}
				continue
			}

			// If it's not punctuation, lowercase and buffer the rune.
			obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
		}

		// Break out if we have consumed all read bytes.
		if isEOF(err) {
			break
		}

		// Copy the unconsumed bytes at the end of the buffer to the start of
		// the buffer so the next read appends after them.
		n = copy(rbuf, rbuf[idx:])
		idx = n
	}

	// Process the remaining bytes in the buffer.
	if len(obuf) > 0 {
		linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
	}
	if len(linebuf) > 0 {
		appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
	}

	doc.dict = dict
	doc.generateFrequencies()
	doc.runes = diffWordsToRunes(&doc, 0, doc.size())
	doc.Norm = doc.normalized()

	return &doc, nil
}

// appendToDoc flushes the accumulated line buffer into the document,
// recording either the line's tokens or a copyright Match.
func appendToDoc(doc *indexedDocument, dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool, linebuf []tokenID) {
	tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
	if tokens != nil {
		doc.Tokens = append(doc.Tokens, tokens...)
	} else if m != nil {
		doc.Matches = append(doc.Matches, m)
	}
}

// stringifyLineBuf converts a line's local-dictionary tokens into indexed
// tokens backed by the global dictionary. If the line matches one of the
// ignorableTexts patterns, it returns a copyright Match instead.
func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
	if len(in) == 0 {
		return nil, nil
	}
	var sb strings.Builder
	for i, r := range in {
		out := ld.getWord(r)
		if out == "" {
			continue
		}
		sb.WriteString(out)
		if i < len(in)-1 {
			sb.WriteByte(' ')
		}
	}

	out := sb.String()
	for _, re := range ignorableTexts {
		if re.MatchString(out) {
			return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
		}
	}

	var tokens []indexedToken
	for i, r := range in {
		txt := cleanupToken(i, ld.getWord(r), normalize)
		if txt != "" {
			var tokID tokenID
			if updateDict {
				tokID = dict.add(txt)
			} else {
				tokID = dict.getIndex(txt)
			}
			tokens = append(tokens, indexedToken{
				Line: line,
				ID:   tokID,
			})
		}
	}
	return tokens, nil
}

// normalizeToken performs some preprocessing on the token. This is different
// from cleanupToken in that the fixups here are not exact matches on the
// token. Normalizing URLs from https to http is an example of a fix applied
// here.
func normalizeToken(in string) string {
	return strings.ReplaceAll(in, "https", "http")
}

// flushBuf cleans up the contents of the rune buffer and registers the result
// as a token in the local dictionary, returning its token ID.
func flushBuf(pos int, obuf []byte, normalizeWord bool, ld *dictionary) tokenID {
	token := string(obuf)
	// Escape sequences can occur anywhere in the string, not just the
	// beginning, so always attempt to unescape the word's content.
	token = html.UnescapeString(token)
	clean := normalizeToken(token)
	return ld.add(clean)
}

// cleanupToken normalizes a token for matching: it drops header and list
// markers at the start of a line, strips insignificant characters, and
// optionally maps interchangeable spellings to a canonical form.
func cleanupToken(pos int, in string, normalizeWord bool) string {
	r, _ := utf8.DecodeRuneInString(in)
	var out strings.Builder
	if pos == 0 && header(in) {
		return ""
	}

	if !unicode.IsLetter(r) {
		if unicode.IsDigit(r) {
			// Based on analysis of the license corpus, the characters that are
			// significant are numbers, periods, and dashes. Anything else can
			// be safely discarded, which helps avoid matching failures due to
			// inconsistent whitespacing and formatting.
			for _, c := range in {
				if unicode.IsDigit(c) || c == '.' || c == '-' {
					out.WriteRune(c)
				}
			}

			// Numbers should not end in a '.' since that usually indicates the
			// end of a line rather than a version number.
			res := out.String()
			for strings.HasSuffix(res, ".") {
				res = res[0 : len(res)-1]
			}
			return res
		}
	}

	// Remove internal hyphenization or URL constructs to better normalize
	// strings for matching.
	for _, c := range in {
		if unicode.IsLetter(c) {
			out.WriteRune(c)
		}
	}

	tok := out.String()
	if !normalizeWord {
		return tok
	}

	if iw, ok := interchangeableWords[tok]; ok {
		return iw
	}
	return tok
}

// interchangeableWords maps alternate spellings to a canonical form so that
// spelling variants are treated as the same token during matching.
var interchangeableWords = map[string]string{
	"analyse":         "analyze",
	"artefact":        "artifact",
	"authorisation":   "authorization",
	"authorised":      "authorized",
	"calibre":         "caliber",
	"cancelled":       "canceled",
	"capitalisations": "capitalizations",
	"catalogue":       "catalog",
	"categorise":      "categorize",
	"centre":          "center",
	"emphasised":      "emphasized",
	"favour":          "favor",
	"favourite":       "favorite",
	"fulfil":          "fulfill",
	"fulfilment":      "fulfillment",
	"https":           "http",
	"initialise":      "initialize",
	"judgment":        "judgement",
	"labelling":       "labeling",
	"labour":          "labor",
	"licence":         "license",
	"maximise":        "maximize",
	"modelled":        "modeled",
	"modelling":       "modeling",
	"offence":         "offense",
	"optimise":        "optimize",
	"organisation":    "organization",
	"organise":        "organize",
	"practise":        "practice",
	"programme":       "program",
	"realise":         "realize",
	"recognise":       "recognize",
	"signalling":      "signaling",
	"utilisation":     "utilization",
	"whilst":          "while",
	"wilful":          "wilfull",

	// TODO: These three need tokenizer magic
	"non commercial": "noncommercial",
	"per cent":       "percent",
	"sub license":    "sublicense",
}

// punctuationMappings maps typographic punctuation to the ASCII forms used
// during matching.
var punctuationMappings = map[rune]string{
	'-': "-",
	'‒': "-",
	'–': "-",
	'—': "-",
	'‐': "-",
	'©': "(c)",
	'§': "(s)",
	'¤': "(s)",
	'·': " ",
	'*': " ",
}
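
// The sketch below is illustrative only and is not part of the classifier
// API: it shows one way tokenizeStream might be driven over an in-memory
// string. The name tokenizeStringExample is hypothetical; newDictionary and
// indexedDocument come from elsewhere in this package.
func tokenizeStringExample(content string, normalize bool) (*indexedDocument, error) {
	// Use a fresh dictionary; updateDict=true lets newly seen tokens be added
	// to it as the stream is tokenized.
	dict := newDictionary()
	return tokenizeStream(strings.NewReader(content), normalize, dict, true)
}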