Diffstat (limited to 'v2/tokenizer.go')
-rw-r--r-- | v2/tokenizer.go | 417
1 file changed, 417 insertions, 0 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
new file mode 100644
index 0000000..607b0d4
--- /dev/null
+++ b/v2/tokenizer.go
@@ -0,0 +1,417 @@
+// Copyright 2020 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package classifier
+
+import (
+    "html"
+    "io"
+    "regexp"
+    "strings"
+    "unicode"
+    "unicode/utf8"
+)
+
+var eol = "\n"
+
+func header(in string) bool {
+    if len(in) == 0 {
+        return false
+    }
+    p, e := in[:len(in)-1], in[len(in)-1]
+    switch e {
+    case '.', ':', ')':
+        if listMarker[p] {
+            if e != ')' {
+                return true
+            }
+        }
+        // Check for patterns like 1.2.3
+        for _, r := range p {
+            if unicode.IsDigit(r) || r == '.' {
+                continue
+            }
+            return false
+        }
+        return true
+    }
+    return false
+}
+
+var listMarker = func() map[string]bool {
+    const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
+    l := map[string]bool{}
+    for _, marker := range strings.Split(allListMarkers, " ") {
+        l[marker] = true
+    }
+    return l
+}()
+
+// ignorableTexts is a list of lines at the start of the string that we can
+// remove to get a cleaner match.
+var ignorableTexts = []*regexp.Regexp{
+    regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
+    regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
+    regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
+}
+
+// tokenizeStream reads bytes from src and produces an indexedDocument of its
+// content. tokenizeStream never returns an error of its own; it can only
+// return an error from the provided Reader. If the provided Reader never
+// returns an error, it is safe to assume that tokenizeStream will not return
+// an error.
+func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
+    const bufSize = 1024
+    // The longest UTF-8 encoded rune is 4 bytes, so we keep enough leftover
+    // bytes in the buffer to ensure we never run out of bytes trying to
+    // finish constructing a rune. These leftover 4 bytes will be copied to
+    // the start of the buffer before additional bytes are read.
+    tgt := bufSize - 4
+
+    rbuf := make([]byte, bufSize)
+    obuf := make([]byte, 0)
+    linebuf := make([]tokenID, 0)
+    idx := 0
+    line := 1 // 1-based count
+    deferredEOL := false
+    deferredWord := false
+    // The tokenizer uses a local dictionary to conserve memory while
+    // analyzing the input doc, avoiding pollution of the global dictionary.
+    ld := newDictionary()
+
+    var doc indexedDocument
+
+    isEOF := func(in error) bool {
+        return in == io.EOF || in == io.ErrUnexpectedEOF
+    }
+
+    // Read out the stream in chunks.
+    for {
+        // Fill up the buffer with bytes to extract runes from. idx is the
+        // offset past any bytes left over from the previous read.
+        n, err := io.ReadFull(src, rbuf[idx:])
+        if isEOF(err) {
+            // There are no more bytes to read, so we must now consume all
+            // bytes remaining in the buffer.
+            tgt = idx + n
+        } else if err != nil {
+            return nil, err
+        }
+
+        for idx = 0; idx < tgt; {
+            r, n := utf8.DecodeRune(rbuf[idx:])
+            idx += n
+
+            if r == '\n' {
+                // Deal with the end of a line.
+
+                // If we are in a word (len(obuf) > 0) and the last rune is a
+                // '-', strike that rune and keep accumulating. Otherwise we
+                // treat the newline like a space and flush the word.
+                if len(obuf) > 0 {
+                    if obuf[len(obuf)-1] == '-' {
+                        obuf = obuf[0 : len(obuf)-1]
+                        deferredEOL = true
+                        continue
+                    }
+
+                    // Append the word fragment to the line buffer.
+                    linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+                }
+
+                // If there is something in the line to process, do so now.
+                if len(linebuf) > 0 {
+                    appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+                    linebuf = nil
+                    obuf = nil
+                }
+                if !normalize {
+                    tokID := dict.getIndex(eol)
+                    if tokID == unknownIndex {
+                        tokID = dict.add(eol)
+                    }
+                    doc.Tokens = append(doc.Tokens, indexedToken{
+                        ID:   tokID,
+                        Line: line})
+                }
+                line++
+                continue
+            }
+
+            if len(obuf) == 0 {
+                if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
+                    // A number or word character starts an interesting word.
+                    // Now we slurp up all non-space runes and aggregate them
+                    // as a single word.
+
+                    // Buffer the initial rune, normalizing to lower case if
+                    // needed.
+                    if normalize {
+                        r = unicode.ToLower(r)
+                    }
+                    obuf = utf8.AppendRune(obuf, r)
+                }
+                continue
+            }
+
+            // At this point, len(obuf) > 0 and we are accumulating more runes
+            // to complete a word.
+            if unicode.IsSpace(r) {
+                // If we have a deferred EOL, we need to pick up a non-space
+                // character to resume the hyphenated word, so we just consume
+                // spaces until that happens.
+                if deferredEOL {
+                    continue
+                }
+
+                // This is a space between word characters, so we assemble the
+                // word as a token and flush it out.
+                idx -= n
+
+                linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+                if deferredWord {
+                    appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+                    linebuf = nil
+                    deferredWord = false
+                    // Increment the line count now so the remainder token is
+                    // credited to the previous line number.
+                    line++
+                }
+                obuf = make([]byte, 0)
+                continue
+            }
+
+            if deferredEOL {
+                deferredEOL = false
+                deferredWord = true
+            }
+            // Perform token mappings for punctuation to emulate
+            // normalizePunctuation. The mapping produces a string, and each
+            // of its runes needs to be injected.
+            if rep, found := punctuationMappings[r]; found {
+                for _, t := range rep {
+                    obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
+                }
+                continue
+            }
+
+            // If it's not punctuation, lowercase and buffer the rune.
+            obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
+        }
+
+        // Break out if we have consumed all read bytes.
+        if isEOF(err) {
+            break
+        }
+
+        // Copy the unconsumed bytes at the end of the buffer to the start
+        // of the buffer so the next read appends after them.
+        n = copy(rbuf, rbuf[idx:])
+        idx = n
+    }
+
+    // Process the remaining bytes in the buffer.
+    if len(obuf) > 0 {
+        linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+    }
+    if len(linebuf) > 0 {
+        appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+    }
+
+    doc.dict = dict
+    doc.generateFrequencies()
+    doc.runes = diffWordsToRunes(&doc, 0, doc.size())
+    doc.Norm = doc.normalized()
+    return &doc, nil
+}
+
+func appendToDoc(doc *indexedDocument, dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool, linebuf []tokenID) {
+    tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
+    if tokens != nil {
+        doc.Tokens = append(doc.Tokens, tokens...)
+    } else if m != nil {
+        doc.Matches = append(doc.Matches, m)
+    }
+}
+
+func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
+    if len(in) == 0 {
+        return nil, nil
+    }
+    var sb strings.Builder
+    for i, r := range in {
+        out := ld.getWord(r)
+        if out == "" {
+            continue
+        }
+        sb.WriteString(out)
+        if i < len(in)-1 {
+            sb.WriteByte(' ')
+        }
+    }
+
+    out := sb.String()
+
+    for _, re := range ignorableTexts {
+        if re.MatchString(out) {
+            return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
+        }
+    }
+
+    var tokens []indexedToken
+    for i, r := range in {
+        txt := cleanupToken(i, ld.getWord(r), normalize)
+        if txt != "" {
+            var tokID tokenID
+            if updateDict {
+                tokID = dict.add(txt)
+            } else {
+                tokID = dict.getIndex(txt)
+            }
+            tokens = append(tokens, indexedToken{
+                Line: line,
+                ID:   tokID,
+            })
+        }
+    }
+
+    return tokens, nil
+}
+
+func normalizeToken(in string) string {
+    // This performs some preprocessing on the token. It is different from
+    // cleanupToken in that the fixups here are not an exact match on the
+    // token. Normalizing URLs from https to http is an example of a fix
+    // applied here.
+    return strings.ReplaceAll(in, "https", "http")
+}
+
+func flushBuf(pos int, obuf []byte, normalizeWord bool, ld *dictionary) tokenID {
+    // Clean up the contents of the rune buffer.
+    token := string(obuf)
+    // Escape sequences can occur anywhere in the string, not just at the
+    // beginning, so always attempt to unescape the word's content.
+    token = html.UnescapeString(token)
+
+    clean := normalizeToken(token)
+
+    return ld.add(clean)
+}
+
+func cleanupToken(pos int, in string, normalizeWord bool) string {
+    r, _ := utf8.DecodeRuneInString(in)
+    var out strings.Builder
+    if pos == 0 && header(in) {
+        return ""
+    }
+
+    if !unicode.IsLetter(r) {
+        if unicode.IsDigit(r) {
+            // Based on analysis of the license corpus, the characters that
+            // are significant are numbers, periods, and dashes. Anything else
+            // can be safely discarded, which helps avoid matching failures
+            // due to inconsistent whitespace and formatting.
+            for _, c := range in {
+                if unicode.IsDigit(c) || c == '.' || c == '-' {
+                    out.WriteRune(c)
+                }
+            }
+
+            // Numbers should not end in a '.' since that usually indicates
+            // the end of a line rather than a version number.
+            res := out.String()
+            for strings.HasSuffix(res, ".") {
+                res = res[0 : len(res)-1]
+            }
+            return res
+        }
+    }
+
+    // Remove internal hyphenation or URL constructs to better normalize
+    // strings for matching.
+    for _, c := range in {
+        if unicode.IsLetter(c) {
+            out.WriteRune(c)
+        }
+    }
+
+    tok := out.String()
+    if !normalizeWord {
+        return tok
+    }
+
+    if iw, ok := interchangeableWords[tok]; ok && normalizeWord {
+        return iw
+    }
+    return tok
+}
+
+var interchangeableWords = map[string]string{
+    "analyse":         "analyze",
+    "artefact":        "artifact",
+    "authorisation":   "authorization",
+    "authorised":      "authorized",
+    "calibre":         "caliber",
+    "cancelled":       "canceled",
+    "capitalisations": "capitalizations",
+    "catalogue":       "catalog",
+    "categorise":      "categorize",
+    "centre":          "center",
+    "emphasised":      "emphasized",
+    "favour":          "favor",
+    "favourite":       "favorite",
+    "fulfil":          "fulfill",
+    "fulfilment":      "fulfillment",
+    "https":           "http",
+    "initialise":      "initialize",
+    "judgment":        "judgement",
+    "labelling":       "labeling",
+    "labour":          "labor",
+    "licence":         "license",
+    "maximise":        "maximize",
+    "modelled":        "modeled",
+    "modelling":       "modeling",
+    "offence":         "offense",
+    "optimise":        "optimize",
+    "organisation":    "organization",
+    "organise":        "organize",
+    "practise":        "practice",
+    "programme":       "program",
+    "realise":         "realize",
+    "recognise":       "recognize",
+    "signalling":      "signaling",
+    "utilisation":     "utilization",
+    "whilst":          "while",
+    "wilful":          "wilfull",
+    // TODO: These three need tokenizer magic
+    "non commercial": "noncommercial",
+    "per cent":       "percent",
+    "sub license":    "sublicense",
+}
+
+var punctuationMappings = map[rune]string{
+    '-': "-",
+    '‒': "-",
+    '–': "-",
+    '—': "-",
+    '‐': "-",
+    '©': "(c)",
+    '§': "(s)",
+    '¤': "(s)",
+    '·': " ",
+    '*': " ",
+}
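A note on the buffered reading scheme: tokenizeStream reads fixed-size chunks but only decodes runes up to bufSize-4, carrying the last few bytes over to the next read so a multi-byte UTF-8 rune split across a chunk boundary still decodes correctly. The standalone sketch below (readRunes is my own name, not part of this change) shows the same leftover-byte technique in isolation.

package main

import (
    "fmt"
    "io"
    "strings"
    "unicode/utf8"
)

// readRunes decodes runes from r in fixed-size chunks, carrying up to
// utf8.UTFMax-1 leftover bytes between reads so multi-byte runes split
// across a chunk boundary decode correctly. A truncated rune at the very
// end of the stream decodes as utf8.RuneError, matching DecodeRune's
// behavior on invalid input.
func readRunes(r io.Reader) ([]rune, error) {
    const bufSize = 16
    buf := make([]byte, bufSize)
    var runes []rune
    idx := 0                           // leftover bytes at the start of buf
    tgt := bufSize - (utf8.UTFMax - 1) // decode only up to here mid-stream
    for {
        n, err := io.ReadFull(r, buf[idx:])
        end := bufSize
        eof := err == io.EOF || err == io.ErrUnexpectedEOF
        if eof {
            // Consume everything that remains in the buffer.
            tgt = idx + n
            end = tgt
        } else if err != nil {
            return nil, err
        }
        for idx = 0; idx < tgt; {
            // Runes starting before tgt may complete using the reserve
            // bytes between tgt and end.
            ru, sz := utf8.DecodeRune(buf[idx:end])
            runes = append(runes, ru)
            idx += sz
        }
        if eof {
            return runes, nil
        }
        // Shift unconsumed bytes to the front for the next read.
        idx = copy(buf, buf[idx:])
    }
}

func main() {
    rs, _ := readRunes(strings.NewReader("héllo wörld — ¤ ©"))
    fmt.Println(string(rs))
}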
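The deferredEOL/deferredWord flags implement hyphenated-word rejoining: a word that ends in '-' at a line break is stitched together with the fragment that starts the next line, and the resulting token is credited to the earlier line. The tokenizer does this incrementally as it streams; the following self-contained sketch (rejoinHyphenated is a hypothetical helper of mine) shows the equivalent whole-string transformation.

package main

import (
    "fmt"
    "regexp"
)

// hyphenBreak matches a hyphen at the end of a line plus any leading
// whitespace on the continuation line, mirroring how the tokenizer strikes
// the '-' and consumes spaces until the word resumes.
var hyphenBreak = regexp.MustCompile(`-\n\s*`)

func rejoinHyphenated(s string) string {
    return hyphenBreak.ReplaceAllString(s, "")
}

func main() {
    in := "This license is non-\ntransferable and sub-\n   licensable."
    fmt.Println(rejoinHyphenated(in))
    // Output: This license is nontransferable and sublicensable.
}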
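cleanupToken's digit branch is worth illustrating: for a token that starts with a digit, only digits, periods, and dashes survive, and trailing periods are stripped because they mark the end of a sentence rather than a version number. A minimal sketch of that branch on its own (cleanNumber is my name for it):

package main

import (
    "fmt"
    "strings"
    "unicode"
)

// cleanNumber mirrors cleanupToken's handling of digit-led tokens: keep
// only digits, '.', and '-', then strip trailing periods.
func cleanNumber(in string) string {
    var out strings.Builder
    for _, c := range in {
        if unicode.IsDigit(c) || c == '.' || c == '-' {
            out.WriteRune(c)
        }
    }
    res := out.String()
    for strings.HasSuffix(res, ".") {
        res = strings.TrimSuffix(res, ".")
    }
    return res
}

func main() {
    fmt.Println(cleanNumber("2.0,"))   // 2.0
    fmt.Println(cleanNumber("1.2.3.")) // 1.2.3
}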
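Finally, a hypothetical example of driving the tokenizer end to end. Since tokenizeStream, newDictionary, and indexedDocument are all unexported, a caller has to live inside package classifier; this is a sketch of a test that assumes only the API shown in this diff.

package classifier

import (
    "strings"
    "testing"
)

// TestTokenizeStreamSketch is illustrative only: it tokenizes a small
// normalized input and checks that a non-empty token stream comes back.
func TestTokenizeStreamSketch(t *testing.T) {
    dict := newDictionary()
    doc, err := tokenizeStream(
        strings.NewReader("Licensed under the Apache License,\nVersion 2.0\n"),
        true /* normalize */, dict, true /* updateDict */)
    if err != nil {
        t.Fatalf("tokenizeStream: %v", err)
    }
    if len(doc.Tokens) == 0 {
        t.Fatal("expected a non-empty token stream")
    }
}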