author    Bill Neubauer <wcn@google.com>            2022-08-16 23:29:17 -0700
committer Bill Neubauer <bill.neubauer@gmail.com>   2022-09-16 10:06:11 -0700
commit    bbfad6347cd1f2f7e28fb20144f64d60c700181b (patch)
tree      079db4d13a9d1e83db6b56a694f5b5209ba4527d
parent    27441af7cea0e66e29fc9ead35db306e8313dc14 (diff)
download  licenseclassifier-bbfad6347cd1f2f7e28fb20144f64d60c700181b.tar.gz
Rewrite the tokenization process to work on streams rather than requiring that the
entire text under analysis be present in memory. Some of the changes here improved the accuracy of classification, requiring updates to the expected test results.

PiperOrigin-RevId: 468114923
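At a high level, the new tokenizeStream reads its input through a fixed 1024-byte buffer, reserving the last 4 bytes so a UTF-8 rune split across two reads is never decoded from a partial sequence; any unconsumed tail bytes are copied to the front of the buffer before the next read. The following is a minimal standalone sketch of that buffering pattern only; the scanRunes helper and the sample input are illustrative and not part of this change:

package main

import (
	"bytes"
	"fmt"
	"io"
	"unicode/utf8"
)

// Standalone sketch (not part of this change): decode runes from src using a
// fixed-size buffer, carrying up to four leftover bytes between reads so that
// multi-byte runes are never split across a read boundary.
func scanRunes(src io.Reader, visit func(rune)) error {
	const bufSize = 1024
	tgt := bufSize - 4 // reserve room for a rune that straddles two reads
	buf := make([]byte, bufSize)
	idx := 0
	for {
		n, err := io.ReadFull(src, buf[idx:])
		eof := err == io.EOF || err == io.ErrUnexpectedEOF
		if eof {
			tgt = idx + n // no more input: consume everything still buffered
		} else if err != nil {
			return err
		}
		for idx = 0; idx < tgt; {
			r, w := utf8.DecodeRune(buf[idx:])
			idx += w
			visit(r)
		}
		if eof {
			return nil
		}
		idx = copy(buf, buf[idx:]) // move the leftover tail to the front
	}
}

func main() {
	count := 0
	_ = scanRunes(bytes.NewReader([]byte("stream-tokenized licence text™")), func(rune) { count++ })
	fmt.Println(count) // number of runes decoded
}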
-rw-r--r--  v2/assets/License/Apache-1.1/log4j.txt    48
-rw-r--r--  v2/classifier.go                           18
-rw-r--r--  v2/document.go                             46
-rw-r--r--  v2/scoring.go                               2
-rw-r--r--  v2/searchset_test.go                        2
-rw-r--r--  v2/tokenizer.go                           634
-rw-r--r--  v2/tokenizer_test.go                      102
7 files changed, 488 insertions, 364 deletions
diff --git a/v2/assets/License/Apache-1.1/log4j.txt b/v2/assets/License/Apache-1.1/log4j.txt
new file mode 100644
index 0000000..f3506ce
--- /dev/null
+++ b/v2/assets/License/Apache-1.1/log4j.txt
@@ -0,0 +1,48 @@
+/*
+ * ============================================================================
+ * The Apache Software License, Version 1.1
+ * ============================================================================
+ *
+ * Copyright (C) 1999 The Apache Software Foundation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. The end-user documentation included with the redistribution, if any, must
+ * include the following acknowledgment: "This product includes software
+ * developed by the Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself, if
+ * and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "log4j" and "Apache Software Foundation" must not be used to
+ * endorse or promote products derived from this software without prior
+ * written permission. For written permission, please contact
+ * apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache", nor may
+ * "Apache" appear in their name, without prior written permission of the
+ * Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLU-
+ * DING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This software consists of voluntary contributions made by many individuals
+ * on behalf of the Apache Software Foundation. For more information on the
+ * Apache Software Foundation, please see <http://www.apache.org/>.
+ *
+ */
diff --git a/v2/classifier.go b/v2/classifier.go
index b2f4d76..f163030 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -230,8 +230,10 @@ func NewClassifier(threshold float64) *Classifier {
// It is an invariant of the classifier that calling Match(Normalize(in)) will
// return the same results as Match(in).
func (c *Classifier) Normalize(in []byte) []byte {
- text, _ := normalizeDoc(in, false)
- doc := extractDoc(text, false, nil)
+ doc, err := tokenizeStream(bytes.NewReader(in), false, c.dict, true)
+ if err != nil {
+ panic("should not be reachable, since bytes.NewReader().Read() should never fail")
+ }
var buf bytes.Buffer
@@ -239,26 +241,28 @@ func (c *Classifier) Normalize(in []byte) []byte {
case 0:
return nil
case 1:
- buf.WriteString(doc.Tokens[0].Text)
+ buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
return buf.Bytes()
}
prevLine := 1
- buf.WriteString(doc.Tokens[0].Text)
+ buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
for _, t := range doc.Tokens[1:] {
// Only write out an EOL token that incremented the line
if t.Line == prevLine+1 {
- buf.WriteString("\n")
+ buf.WriteString(eol)
}
// Only write tokens that aren't EOL
- if t.Text != eol {
+ txt := c.dict.getWord(t.ID)
+
+ if txt != eol {
// Only put a space between tokens if the previous token was on the same
// line. This prevents spaces after an EOL
if t.Line == prevLine {
buf.WriteString(" ")
}
- buf.WriteString(t.Text)
+ buf.WriteString(txt)
}
prevLine = t.Line
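The exported surface of Normalize is unchanged by this rewrite: it still takes and returns a byte slice, and the invariant that Match(Normalize(in)) and Match(in) agree still holds; only the internals now go through the stream tokenizer and the classifier dictionary. A minimal usage sketch, assuming the v2 module path; the threshold and the input text are illustrative:

package main

import (
	"fmt"

	classifier "github.com/google/licenseclassifier/v2"
)

func main() {
	// Illustrative usage; not part of this change.
	c := classifier.NewClassifier(0.8)
	// The raw text is tokenized as a stream and re-emitted from the
	// classifier dictionary with whitespace and punctuation normalized.
	out := c.Normalize([]byte("The  AWESOME   Project\nLICENSE"))
	fmt.Printf("%s\n", out)
}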
diff --git a/v2/document.go b/v2/document.go
index 73ccaab..6f3c1b5 100644
--- a/v2/document.go
+++ b/v2/document.go
@@ -30,25 +30,19 @@ type token struct {
Previous string // for the first token in a line, any previous text.
}
-// document is the representation of the input text for downstream filtering and matching.
-type document struct {
- Tokens []*token // ordered tokens of the document
- Matches Matches // these are matches identified while processing the original, untokenized text via regexp matching
-}
-
type indexedToken struct {
Line int // line position of this token in the source
ID tokenID // identifier of the text in the dictionary
}
type indexedDocument struct {
+ Norm string // The normalized token sequence
Tokens []indexedToken // ordered tokens of the document
Matches Matches // these are matches identified while processing the original, untokenized text via regexp matching
f *frequencyTable // frequencies computed for this document
dict *dictionary // The corpus dictionary for this document
s *searchSet // The searchset for this document
runes []rune
- norm string // The normalized token sequence
}
func (d *indexedDocument) generateSearchSet(q int) {
@@ -101,58 +95,26 @@ func max(a, b int) int {
// AddContent incorporates the provided textual content into the classifier for
// matching. This will not modify the supplied content.
func (c *Classifier) AddContent(category, name, variant string, content []byte) {
- doc := tokenize(content)
+ doc := tokenize(content, c.dict, true)
c.addDocument(category, name, variant, doc)
}
// addDocument takes a textual document and incorporates it into the classifier for matching.
-func (c *Classifier) addDocument(category, name, variant string, doc *document) {
+func (c *Classifier) addDocument(category, name, variant string, id *indexedDocument) {
// For documents that are part of the corpus, we add them to the dictionary and
// compute their associated search data eagerly so they are ready for matching against
// candidates.
indexName := c.generateDocName(category, name, variant)
- id := c.generateIndexedDocument(doc, true)
- id.generateFrequencies()
id.generateSearchSet(c.q)
id.s.origin = indexName
c.docs[indexName] = id
}
-// generateIndexedDocument creates an indexedDocument from the supplied document. if addWords
-// is true, the classifier dictionary is updated with new tokens encountered in the document.
-func (c *Classifier) generateIndexedDocument(d *document, addWords bool) *indexedDocument {
- id := &indexedDocument{
- Tokens: make([]indexedToken, 0, len(d.Tokens)),
- dict: c.dict,
- Matches: d.Matches,
- }
-
- for _, t := range d.Tokens {
- var tokID tokenID
- if addWords {
- tokID = id.dict.add(t.Text)
- } else {
- tokID = id.dict.getIndex(t.Text)
- }
-
- id.Tokens = append(id.Tokens, indexedToken{
- Line: t.Line,
- ID: tokID,
- })
-
- }
- id.generateFrequencies()
- id.runes = diffWordsToRunes(id, 0, id.size())
- id.norm = id.normalized()
- return id
-}
-
// createTargetIndexedDocument creates an indexed document without adding the
// words to the classifier dictionary. This should be used for matching targets, not
// populating the corpus.
func (c *Classifier) createTargetIndexedDocument(in []byte) *indexedDocument {
- doc := tokenize(in)
- return c.generateIndexedDocument(doc, false)
+ return tokenize(in, c.dict, false)
}
func (c *Classifier) generateDocName(category, name, variant string) string {
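The updateDict flag carried through tokenize and tokenizeStream is what now distinguishes the two paths: corpus documents added via AddContent grow the classifier dictionary, while target documents built by createTargetIndexedDocument only look words up, with unseen words falling back to the unknown index. A rough package-internal sketch of that asymmetry; sketchDictPaths is a hypothetical helper and the strings are illustrative:

// Inside package classifier; not part of this change.
func sketchDictPaths() {
	dict := newDictionary()

	// Corpus path (updateDict = true): every token is added to the dictionary.
	corpus := tokenize([]byte("permission is hereby granted"), dict, true)

	// Target path (updateDict = false): the dictionary is read-only, so a word
	// it has never seen ("refused") resolves to the unknown index instead of
	// creating a new entry.
	target := tokenize([]byte("permission is hereby refused"), dict, false)

	fmt.Println(len(corpus.Tokens), len(target.Tokens))
}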
diff --git a/v2/scoring.go b/v2/scoring.go
index 34dffb5..616ea78 100644
--- a/v2/scoring.go
+++ b/v2/scoring.go
@@ -41,7 +41,7 @@ func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownSt
knownLength := known.size()
diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength)
- start, end := diffRange(known.norm, diffs)
+ start, end := diffRange(known.Norm, diffs)
distance := scoreDiffs(id, diffs[start:end])
if c.tc.traceScoring(known.s.origin) {
diff --git a/v2/searchset_test.go b/v2/searchset_test.go
index ccaa3c3..accbc15 100644
--- a/v2/searchset_test.go
+++ b/v2/searchset_test.go
@@ -63,7 +63,7 @@ func TestSearchSet_New(t *testing.T) {
text: "",
q: 4,
want: &searchSet{
- Tokens: []indexedToken{},
+ Tokens: nil,
Hashes: make(hash),
Checksums: nil,
ChecksumRanges: nil,
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 875cc7e..0d3917e 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -15,366 +15,412 @@
package classifier
import (
+ "bytes"
"html"
+ "io"
"regexp"
"strings"
"unicode"
"unicode/utf8"
)
-// isSignificant looks for runes that are likely to be the part of English language content
-// of interest in licenses. Notably, it skips over punctuation, looking only for letters
-// or numbers that consistitute the tokens of most interest.
-func isSignificant(r rune) bool {
- return unicode.IsLetter(r) || unicode.IsDigit(r)
-}
-
var eol = "\n"
-func cleanupToken(in string) string {
- r, _ := utf8.DecodeRuneInString(in)
- var out strings.Builder
- if !unicode.IsLetter(r) {
- if unicode.IsDigit(r) {
- // Based on analysis of the license corpus, the characters
- // that are significant are numbers, periods, and dashes. Anything
- // else can be safely discarded, and helps avoid matching failures
- // due to inconsistent whitespacing and formatting.
- for _, c := range in {
- if unicode.IsDigit(c) || c == '.' || c == '-' {
- out.WriteRune(c)
- }
+func header(in string) bool {
+ if len(in) == 0 {
+ return false
+ }
+ p, e := in[:len(in)-1], in[len(in)-1]
+ switch e {
+ case '.', ':', ')':
+ if listMarker[p] {
+ if e != ')' {
+ return true
}
-
- // Numbers should not end in a . since that doesn't indicate a version
- // number, but usually an end of a line.
- res := out.String()
- for strings.HasSuffix(res, ".") {
- res = res[0 : len(res)-1]
+ }
+ // Check for patterns like 1.2.3
+ for _, r := range p {
+ if unicode.IsDigit(r) || r == '.' {
+ continue
}
- return res
+ return false
}
+ return true
}
+ return false
+}
- // Remove internal hyphenization or URL constructs to better normalize
- // strings for matching.
- for _, c := range in {
- if unicode.IsLetter(c) {
- out.WriteRune(c)
- }
+var listMarker = func() map[string]bool {
+ const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
+ l := map[string]bool{}
+ for _, marker := range strings.Split(allListMarkers, " ") {
+ l[marker] = true
}
- return out.String()
+ return l
+}()
+
+// ignorableTexts is a list of lines at the start of the string we can remove
+// to get a cleaner match.
+var ignorableTexts = []*regexp.Regexp{
+ regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
+ regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
+ regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}
-func normalizeDoc(in []byte, normWords bool) (string, Matches) {
- // Apply the global transforms described in SPDX
+func tokenize(in []byte, dict *dictionary, updateDict bool) *indexedDocument {
+ // Since bytes.NewReader().Read() will never return an error, tokenizeStream
+ // will never return an error so it's okay to ignore the return value in this
+ // case.
+ id, _ := tokenizeStream(bytes.NewReader(in), true, dict, updateDict)
+ return id
+}
- norm := string(in)
- norm = html.UnescapeString(norm)
- norm = normalizePunctuation(norm)
- norm, matches := removeIgnorableTexts(norm)
+// tokenizeStream reads bytes from src and produces an indexedDocument of its
+// content. tokenizeStream will never return an error of its own; it can only
+// return an error from the provided Reader. If the provided Reader never
+// returns an error, it is safe to assume that tokenizeStream will not return an
+// error.
+func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
+ const bufSize = 1024
+ // The longest UTF-8 encoded rune is 4 bytes, so we keep enough leftover bytes
+ // in the buffer to ensure we never run out of bytes trying to finish
+ // constructing a rune. These leftover 4 bytes will be copied to the start of
+ // the buffer before additional bytes are read.
+ tgt := bufSize - 4
- if normWords {
- norm = normalizeWords(norm)
- }
- return norm, matches
-}
+ rbuf := make([]byte, bufSize)
+ obuf := make([]byte, 0)
+ linebuf := make([]tokenID, 0)
+ idx := 0
+ line := 1 // 1-based count
+ deferredEOL := false
+ deferredWord := false
+ // The tokenizer uses a local dictionary while analyzing the input doc to
+ // conserve memory and avoid polluting the global dictionary.
+ ld := newDictionary()
-func tokenize(in []byte) *document {
- // tokenize produces a document from the input content.
- text, matches := normalizeDoc(in, true)
- return extractDoc(text, true, matches)
-}
+ var doc indexedDocument
-func extractDoc(text string, removeEol bool, matches Matches) *document {
- var doc document
- doc.Matches = matches
- // Iterate on a line-by-line basis.
- i := 0
- pos := 0
- for {
- // Scan the text for the first likely textual content. The scan ignores punctuation
- // artifacts that include visual boxes for layout as well as comment characters in
- // source files.
- firstInLine := true
- var wid int
- var r rune
-
- if pos == len(text) {
- break
- }
+ isEOF := func(in error) bool {
+ return in == io.EOF || in == io.ErrUnexpectedEOF
+ }
- next := func() {
- r, wid = utf8.DecodeRuneInString(text[pos:])
- pos += wid
+ // Read out the stream in chunks
+ for {
+ // Fill up the buffer with bytes to extract runes from
+ // idx is offset to hold any bytes left over from previous reads
+ n, err := io.ReadFull(src, rbuf[idx:])
+ if isEOF(err) {
+ // There are no more bytes to read, so we must now consume all bytes in the
+ // buffer.
+ tgt = idx + n
+ } else if err != nil {
+ return nil, err
}
- for pos < len(text) {
- start := pos
- next()
+ for idx = 0; idx < tgt; {
+ r, n := utf8.DecodeRune(rbuf[idx:])
+ idx += n
if r == '\n' {
- doc.Tokens = append(doc.Tokens, &token{
- Text: eol,
- Line: i + 1})
- i++
- }
+ // Deal with carriage return
- if !isSignificant(r) {
+ // If we are in a word (len(obuf) > 0) and the last rune is a -,
+ // strike that rune and keep accumulating.
+ // Otherwise we treat it like a space and
+ // flush the word
+
+ if len(obuf) > 0 {
+ if obuf[len(obuf)-1] == '-' {
+ obuf = obuf[0 : len(obuf)-1]
+ deferredEOL = true
+ continue
+ }
+
+ // Append the word fragment to the line buffer
+ linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+ }
+
+ // If there is something in the line to process, do so now
+ if len(linebuf) > 0 {
+ appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+ linebuf = nil
+ obuf = nil
+ }
+ if !normalize {
+ tokID := dict.getIndex(eol)
+ if tokID == unknownIndex {
+ tokID = dict.add(eol)
+ }
+ doc.Tokens = append(doc.Tokens, indexedToken{
+ ID: tokID,
+ Line: line})
+ }
+ line++
continue
}
- // We're at a word/number character.
- for pos < len(text) {
- next()
- if unicode.IsSpace(r) {
- pos -= wid // Will skip this in outer loop
- break
+ if len(obuf) == 0 {
+ if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
+ // Number or word character starts an interesting word
+ // Now we slurp up all non-space runes and aggregate it as
+ // a single word
+
+ // Buffer the initial token, normalizing to lower case if needed
+ if normalize {
+ r = unicode.ToLower(r)
+ }
+ obuf = utf8.AppendRune(obuf, r)
}
+ continue
}
- if pos > start {
- if start >= 2 && text[start-2] == '.' && text[start-1] == ' ' {
- // Insert a "soft EOL" that helps detect header-looking entries that
- // follow this text. This resolves problems with licenses that are a
- // very long line of text, motivated by
- // https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109
- //
- // Don't do this if the previous token was already an EOL
- if len(doc.Tokens) > 0 && doc.Tokens[len(doc.Tokens)-1].Text != eol {
- doc.Tokens = append(doc.Tokens, &token{
- Text: eol,
- Line: i + 1})
- }
+ // At this point, len(obuf) > 0 and we are accumulating more runes
+ // to complete a word.
+ if unicode.IsSpace(r) {
+ // If we have a deferred EOL, we need to pick up a non-space character
+ // to resume the hyphenated word, so we just consume spaces until that
+ // happens
+ if deferredEOL {
+ continue
}
- tok := token{
- Text: text[start:pos],
- Line: i + 1,
+ // This is a space between word characters, so we assemble the word as a
+ // token and flush it out.
+ idx -= n
+
+ linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+ if deferredWord {
+ appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+ linebuf = nil
+ deferredWord = false
+ // Increment the line count now so the remainder token is credited
+ // to the previous line number.
+ line++
}
- if firstInLine {
- // Store the prefix material, it is useful to discern some corner cases
- tok.Previous = text[0:start]
+ obuf = make([]byte, 0)
+ continue
+ }
+
+ if deferredEOL {
+ deferredEOL = false
+ deferredWord = true
+ }
+ // perform token mappings for punctuation to emulate
+ // normalizePunctuation. this returns a string and each rune needs to be
+ // injected.
+ if rep, found := punctuationMappings[r]; found {
+ for _, t := range rep {
+ obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
}
- doc.Tokens = append(doc.Tokens, &tok)
- firstInLine = false
+ continue
}
+
+ // if it's not punctuation, lowercase and buffer the token
+ obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
+ }
+
+ // Break out if we have consumed all read bytes
+ if isEOF(err) {
+ break
}
+
+ // Copy the unconsumed bytes at the end of the buffer to the start
+ // of the buffer so the next read appends after them.
+ n = copy(rbuf, rbuf[idx:])
+ idx = n
+ }
+
+ // Process the remaining bytes in the buffer
+ if len(obuf) > 0 {
+ linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+ }
+ if len(linebuf) > 0 {
+ appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
}
- doc.Tokens = cleanupTokens(doc.Tokens, removeEol)
- return &doc
+ doc.dict = dict
+ doc.generateFrequencies()
+ doc.runes = diffWordsToRunes(&doc, 0, doc.size())
+ doc.Norm = doc.normalized()
+ return &doc, nil
}
-func cleanupTokens(in []*token, removeEol bool) []*token {
- // This routine performs sanitization of tokens. If it is a header-looking
- // token (but not a version number) starting a line, it is removed.
- // Hyphenated words are reassembled.
- partialWord := ""
- var out []*token
- tokIdx := 0
- firstInLine := true
- for i, tok := range in {
- if firstInLine && header(tok) {
+func appendToDoc(doc *indexedDocument, dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool, linebuf []tokenID) {
+ tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
+ if tokens != nil {
+ doc.Tokens = append(doc.Tokens, tokens...)
+ } else if m != nil {
+ doc.Matches = append(doc.Matches, m)
+ }
+}
+
+func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
+ if len(in) == 0 {
+ return nil, nil
+ }
+ var sb strings.Builder
+ for i, r := range in {
+ out := ld.getWord(r)
+ if out == "" {
continue
}
- if tok.Text == eol {
- firstInLine = true
- if removeEol {
- continue
- }
- // If we are reconstructing a hyphenated word, don't append the EOL
- // now, do it when the word is reconstructed.
- if partialWord == "" {
- out = append(out, &token{Text: eol, Line: tok.Line})
- tokIdx++
- }
- continue
+ sb.WriteString(out)
+ if i < len(in)-1 {
+ sb.WriteByte(' ')
}
- firstInLine = false
- t := cleanupToken(tok.Text)
- // If this is the last token in a line, and it looks like a hyphenated
- // word, store it for reassembly.
- if strings.HasSuffix(tok.Text, "-") && i+1 < len(in) && in[i+1].Text == eol {
- partialWord = t
- } else if partialWord != "" {
- // Repair hyphenated words
- tp := in[i-1]
- tp.Text = partialWord + t
- tp.Previous = ""
- out = append(out, tp)
- tokIdx++
- if !removeEol {
- // Append the EOL now that the whole word is recovered
- out = append(out, &token{Text: eol, Line: tp.Line})
- tokIdx++
- }
+ }
- partialWord = ""
- } else {
- tok.Text = t
- tok.Previous = ""
- out = append(out, tok)
- tokIdx++
+ out := sb.String()
+
+ for _, re := range ignorableTexts {
+ if re.MatchString(out) {
+ return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
}
}
- return out
-}
-// interchangeablePunctutation is punctuation that can be normalized.
-var interchangeablePunctuation = []struct {
- interchangeable string
- substitute string
-}{
- // Hyphen, Dash, En Dash, and Em Dash.
- {`-‒–—‐`, "-"},
- // Single, Double, Curly Single, and Curly Double.
- {"'\"`‘’“”", "'"},
- // Copyright.
- {"©", "(c)"},
- // Currency and Section. (Different copies of the CDDL use each marker.)
- {"§¤", "(s)"},
- // Middle Dot
- {"·", " "},
- {"*", " "},
-}
-
-// normalizePunctuation takes all hyphens and quotes and normalizes them.
-func normalizePunctuation(s string) string {
- for _, iw := range interchangeablePunctuation {
- for _, in := range strings.Split(iw.interchangeable, "") {
- s = strings.ReplaceAll(s, in, iw.substitute)
+ var tokens []indexedToken
+ for i, r := range in {
+ txt := cleanupToken(i, ld.getWord(r), normalize)
+ if txt != "" {
+ var tokID tokenID
+ if updateDict {
+ tokID = dict.add(txt)
+ } else {
+ tokID = dict.getIndex(txt)
+ }
+ tokens = append(tokens, indexedToken{
+ Line: line,
+ ID: tokID,
+ })
}
}
- return s
+
+ return tokens, nil
}
-// interchangeableWords are words we can substitute for a normalized form
-// without changing the meaning of the license. See
-// https://spdx.org/spdx-license-list/matching-guidelines for the list.
-var interchangeableWords = []struct {
- interchangeable *regexp.Regexp
- substitute string
-}{
- {regexp.MustCompile("acknowledgement"), "acknowledgment"},
- {regexp.MustCompile("analogue"), "analog"},
- {regexp.MustCompile("analyse"), "analyze"},
- {regexp.MustCompile("artefact"), "artifact"},
- {regexp.MustCompile("authorisation"), "authorization"},
- {regexp.MustCompile("authorised"), "authorized"},
- {regexp.MustCompile("calibre"), "caliber"},
- {regexp.MustCompile("cancelled"), "canceled"},
- {regexp.MustCompile("capitalisations"), "capitalizations"},
- {regexp.MustCompile("catalogue"), "catalog"},
- {regexp.MustCompile("categorise"), "categorize"},
- {regexp.MustCompile("centre"), "center"},
- {regexp.MustCompile("emphasised"), "emphasized"},
- {regexp.MustCompile("favour"), "favor"},
- {regexp.MustCompile("favourite"), "favorite"},
- {regexp.MustCompile("fulfil\\b"), "fulfill"},
- {regexp.MustCompile("fulfilment"), "fulfillment"},
- {regexp.MustCompile("https"), "http"},
- {regexp.MustCompile("initialise"), "initialize"},
- {regexp.MustCompile("judgment"), "judgement"},
- {regexp.MustCompile("labelling"), "labeling"},
- {regexp.MustCompile("labour"), "labor"},
- {regexp.MustCompile("licence"), "license"},
- {regexp.MustCompile("maximise"), "maximize"},
- {regexp.MustCompile("modelled"), "modeled"},
- {regexp.MustCompile("modelling"), "modeling"},
- {regexp.MustCompile("offence"), "offense"},
- {regexp.MustCompile("optimise"), "optimize"},
- {regexp.MustCompile("organisation"), "organization"},
- {regexp.MustCompile("organise"), "organize"},
- {regexp.MustCompile("practise"), "practice"},
- {regexp.MustCompile("programme"), "program"},
- {regexp.MustCompile("realise"), "realize"},
- {regexp.MustCompile("recognise"), "recognize"},
- {regexp.MustCompile("signalling"), "signaling"},
- {regexp.MustCompile("sub[ -]license"), "sublicense"},
- {regexp.MustCompile("utilisation"), "utilization"},
- {regexp.MustCompile("whilst"), "while"},
- {regexp.MustCompile("wilful"), "wilfull"},
- {regexp.MustCompile("non[ -]commercial"), "noncommercial"},
- {regexp.MustCompile("per cent"), "percent"},
+func normalizeToken(in string) string {
+ // This performs some preprocessing on the token.
+ // This is different than cleanupToken in that fixups here
+ // are not exact match on the token.
+ // Normalizing URLs from https to http is an example of a fix applied
+ // here.
+ return strings.ReplaceAll(in, "https", "http")
}
-// normalizeWords remaps equivalent words that are interchangeable and lowercases
-// the word to allow for exact matching.
-func normalizeWords(s string) string {
- s = strings.ToLower(s)
- for _, iw := range interchangeableWords {
- s = iw.interchangeable.ReplaceAllString(s, iw.substitute)
- }
- return s
+func flushBuf(pos int, obuf []byte, normalizeWord bool, ld *dictionary) tokenID {
+ // clean up the contents of the rune buffer
+ token := string(obuf)
+ // escape sequences can occur anywhere in the string, not just the beginning
+ // so always attempt to unescape the word's content.
+ token = html.UnescapeString(token)
+
+ clean := normalizeToken(token)
+
+ return ld.add(clean)
}
-func header(tok *token) bool {
- in := tok.Text
- p, e := in[:len(in)-1], in[len(in)-1]
- switch e {
- case '.', ':', ')':
- if listMarker[p] {
- if e != ')' {
- return true
+func cleanupToken(pos int, in string, normalizeWord bool) string {
+ r, _ := utf8.DecodeRuneInString(in)
+ var out strings.Builder
+ if pos == 0 && header(in) {
+ return ""
+ }
+
+ if !unicode.IsLetter(r) {
+ if unicode.IsDigit(r) {
+ // Based on analysis of the license corpus, the characters that are
+ // significant are numbers, periods, and dashes. Anything else can be
+ // safely discarded, and helps avoid matching failures due to inconsistent
+ // whitespacing and formatting.
+ for _, c := range in {
+ if unicode.IsDigit(c) || c == '.' || c == '-' {
+ out.WriteRune(c)
+ }
}
- // Sometimes an internal reference like "(ii)" from NPL-1.02.txt
- // endds up at the beginning of a line. In that case, it's
- // not actually a header.
- if e == ')' && !strings.HasSuffix(tok.Previous, "(") {
- return true
+
+ // Numbers should not end in a . since that doesn't indicate a version
+ // number, but usually an end of a line.
+ res := out.String()
+ for strings.HasSuffix(res, ".") {
+ res = res[0 : len(res)-1]
}
+ return res
}
- // Check for patterns like 1.2.3
- for _, r := range p {
- if unicode.IsDigit(r) || r == '.' {
- continue
- }
- return false
+ }
+
+ // Remove internal hyphenization or URL constructs to better normalize strings
+ // for matching.
+
+ for _, c := range in {
+ if unicode.IsLetter(c) {
+ out.WriteRune(c)
}
- return true
}
- return false
-}
-var listMarker = func() map[string]bool {
- const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
- l := map[string]bool{}
- for _, marker := range strings.Split(allListMarkers, " ") {
- l[marker] = true
+ tok := out.String()
+ if !normalizeWord {
+ return tok
}
- return l
-}()
-// ignorableTexts is a list of lines at the start of the string we can remove
-// to get a cleaner match.
-var ignorableTexts = []*regexp.Regexp{
- regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
- regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
- regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
+ if iw, ok := interchangeableWords[tok]; ok && normalizeWord {
+ return iw
+ }
+ return tok
}
-// removeIgnorableTexts removes common text, which is not important for
-// classification
-func removeIgnorableTexts(s string) (string, Matches) {
- var out []string
- var matches Matches
- lines := strings.Split(s, "\n")
- for i, l := range lines {
- line := strings.TrimSpace(l)
- var match bool
- for _, re := range ignorableTexts {
- if re.MatchString(line) {
- match = true
- }
- }
- if !match {
- out = append(out, l)
- } else {
- // We want to preserve line presence for the positional information
- out = append(out, "")
- matches = append(matches, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: i + 1, EndLine: i + 1})
- }
- }
- return strings.Join(out, "\n"), matches
+var interchangeableWords = map[string]string{
+ "analyse": "analyze",
+ "artefact": "artifact",
+ "authorisation": "authorization",
+ "authorised": "authorized",
+ "calibre": "caliber",
+ "cancelled": "canceled",
+ "capitalisations": "capitalizations",
+ "catalogue": "catalog",
+ "categorise": "categorize",
+ "centre": "center",
+ "emphasised": "emphasized",
+ "favour": "favor",
+ "favourite": "favorite",
+ "fulfil": "fulfill",
+ "fulfilment": "fulfillment",
+ "https": "http",
+ "initialise": "initialize",
+ "judgment": "judgement",
+ "labelling": "labeling",
+ "labour": "labor",
+ "licence": "license",
+ "maximise": "maximize",
+ "modelled": "modeled",
+ "modelling": "modeling",
+ "offence": "offense",
+ "optimise": "optimize",
+ "organisation": "organization",
+ "organise": "organize",
+ "practise": "practice",
+ "programme": "program",
+ "realise": "realize",
+ "recognise": "recognize",
+ "signalling": "signaling",
+ "utilisation": "utilization",
+ "whilst": "while",
+ "wilful": "wilfull",
+ // TODO: These three need tokenizer magic
+ "non commercial": "noncommercial",
+ "per cent": "percent",
+ "sub license": "sublicense",
+}
+
+var punctuationMappings = map[rune]string{
+ '-': "-",
+ '‒': "-",
+ '–': "-",
+ '—': "-",
+ '‐': "-",
+ '©': "(c)",
+ '§': "(s)",
+ '¤': "(s)",
+ '·': " ",
+ '*': " ",
}
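Word normalization that used to be a set of regexp passes over the whole document (interchangeableWords as regexps, normalizePunctuation, normalizeWords) is now applied per token: punctuation is remapped as runes are buffered, normalizeToken rewrites https to http, and cleanupToken strips insignificant characters before a plain map lookup in interchangeableWords. A small package-internal sketch of the intended behavior; the inputs are illustrative and the expected values follow from the code above:

// Package-internal sketch of the per-token normalization helpers (not part of
// this change). pos > 0 avoids the header/list-marker check that only applies
// to the first token on a line.
fmt.Println(cleanupToken(1, "licence", true))      // "license" via interchangeableWords
fmt.Println(cleanupToken(1, "1.2.3.", true))       // "1.2.3" (trailing dot dropped)
fmt.Println(normalizeToken("https://example.com")) // "http://example.com"

Note that the three multi-word entries at the end of interchangeableWords ("non commercial", "per cent", "sub license") are flagged TODO because tokens are now normalized one at a time, so those substitutions are not yet applied.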
diff --git a/v2/tokenizer_test.go b/v2/tokenizer_test.go
index 662685c..6ddab4c 100644
--- a/v2/tokenizer_test.go
+++ b/v2/tokenizer_test.go
@@ -15,6 +15,7 @@
package classifier
import (
+ "io"
"strings"
"testing"
@@ -56,7 +57,7 @@ func TestCleanupToken(t *testing.T) {
},
}
for _, test := range tests {
- if got := cleanupToken(test.input); got != test.output {
+ if got := cleanupToken(0, test.input, true); got != test.output {
t.Errorf("%q: got %q want %q", test.input, got, test.output)
}
}
@@ -66,8 +67,21 @@ func TestTokenize(t *testing.T) {
tests := []struct {
name string
input string
- output *document
+ output *indexedDocument
}{
+ {name: "hyphenization recovery",
+ input: `basket-
+ball`,
+ output: &indexedDocument{
+ Tokens: []indexedToken{
+ {
+ ID: 1,
+ Line: 1,
+ },
+ },
+ Norm: "basketball",
+ },
+ },
{
name: "basic scenario",
input: `The AWESOME Project LICENSE
@@ -80,63 +94,112 @@ Copyright 1996-2002, 2006 by A. Developer
Introduction
The AWESOME Project`,
- output: &document{
- Tokens: []*token{
+ output: &indexedDocument{
+ Tokens: []indexedToken{
{
- Text: "the",
+ ID: 1,
Line: 1,
},
{
- Text: "awesome",
+ ID: 2,
Line: 1,
},
{
- Text: "project",
+ ID: 3,
Line: 1,
},
{
- Text: "license",
+ ID: 4,
Line: 1,
},
{
- Text: "modifications",
+ ID: 5,
Line: 3,
},
{
- Text: "prohibited",
+ ID: 6,
Line: 4,
},
{
- Text: "introduction",
+ ID: 7,
Line: 8,
},
{
- Text: "the",
+ ID: 1,
Line: 10,
},
{
- Text: "awesome",
+ ID: 2,
Line: 10,
},
{
- Text: "project",
+ ID: 3,
Line: 10,
},
},
Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}},
+ Norm: "the awesome project license modifications prohibited introduction the awesome project",
},
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- d := tokenize([]byte(test.input))
- if !cmp.Equal(d, test.output, cmpopts.IgnoreUnexported(document{})) {
- t.Errorf("%s failed: %s", test.name, cmp.Diff(d, test.output))
+ d := tokenize([]byte(test.input), newDictionary(), true)
+ if diff := cmp.Diff(d, test.output, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" {
+ t.Errorf("%s failed:\nDiff(+got,-want): %s", test.name, diff)
}
})
}
}
+type mockReader struct {
+ t *testing.T
+ schedule []int
+ cur int
+}
+
+func (m *mockReader) Read(buf []byte) (int, error) {
+ if m.cur > len(m.schedule) {
+ m.t.Fatal("Unexpected read on mock")
+ }
+
+ if m.cur == len(m.schedule) {
+ return 0, io.EOF
+ }
+
+ if len(buf) != m.schedule[m.cur] {
+ m.t.Fatalf("step %d: got %d, want %d", m.cur, len(buf), m.schedule[m.cur])
+ }
+ m.cur++
+
+ for i := range buf {
+ buf[i] = 'a'
+ }
+
+ return len(buf), nil
+}
+
+func TestTokenizerBuffering(t *testing.T) {
+ dict := newDictionary()
+ mr := mockReader{
+ t: t,
+ schedule: []int{1024, 1020, 1020},
+ }
+ d, err := tokenizeStream(&mr, true, dict, true)
+ if err != nil {
+ t.Errorf("Read returned unexpected error: %v", err)
+ }
+
+ // Do a basic test to make sure the data returned is sound
+ if len(d.Tokens) != 1 {
+ t.Errorf("Got %d tokens, expected 1", len(d.Tokens))
+ }
+
+ if len(d.Norm) != 3064 {
+ t.Errorf("Got %d bytes, expected 3064", len(d.Norm))
+ }
+}
+
func TestTokenizer(t *testing.T) {
// This test focuses primarily on the textual content extracted and does not look
// at the other parts of the document.
@@ -229,10 +292,11 @@ The FreeType Project`,
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- d := tokenize([]byte(test.input))
+ dict := newDictionary()
+ d := tokenize([]byte(test.input), dict, true)
var b strings.Builder
for _, tok := range d.Tokens {
- b.WriteString(tok.Text)
+ b.WriteString(dict.getWord(tok.ID))
b.WriteString(" ")
}
actual := strings.TrimSpace(b.String())