 v2/assets/License/Apache-1.1/log4j.txt |  48 ++++
 v2/classifier.go                       |  18 +-
 v2/document.go                         |  46 +---
 v2/scoring.go                          |   2 +-
 v2/searchset_test.go                   |   2 +-
 v2/tokenizer.go                        | 634 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 v2/tokenizer_test.go                   | 102 +++++++++---
 7 files changed, 488 insertions(+), 364 deletions(-)
diff --git a/v2/assets/License/Apache-1.1/log4j.txt b/v2/assets/License/Apache-1.1/log4j.txt
new file mode 100644
index 0000000..f3506ce
--- /dev/null
+++ b/v2/assets/License/Apache-1.1/log4j.txt
@@ -0,0 +1,48 @@
+/*
+ * ============================================================================
+ * The Apache Software License, Version 1.1
+ * ============================================================================
+ *
+ * Copyright (C) 1999 The Apache Software Foundation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. The end-user documentation included with the redistribution, if any, must
+ *    include the following acknowledgment: "This product includes software
+ *    developed by the Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself, if
+ *    and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "log4j" and "Apache Software Foundation" must not be used to
+ *    endorse or promote products derived from this software without prior
+ *    written permission. For written permission, please contact
+ *    apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache", nor may
+ *    "Apache" appear in their name, without prior written permission of the
+ *    Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLU-
+ * DING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This software consists of voluntary contributions made by many individuals
+ * on behalf of the Apache Software Foundation. For more information on the
+ * Apache Software Foundation, please see <http://www.apache.org/>.
+ *
+ */
diff --git a/v2/classifier.go b/v2/classifier.go
index b2f4d76..f163030 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -230,8 +230,10 @@ func NewClassifier(threshold float64) *Classifier {
 // It is an invariant of the classifier that calling Match(Normalize(in)) will
 // return the same results as Match(in).
 func (c *Classifier) Normalize(in []byte) []byte {
-	text, _ := normalizeDoc(in, false)
-	doc := extractDoc(text, false, nil)
+	doc, err := tokenizeStream(bytes.NewReader(in), false, c.dict, true)
+	if err != nil {
+		panic("should not be reachable, since bytes.NewReader().Read() should never fail")
+	}
 
 	var buf bytes.Buffer
 
@@ -239,26 +241,28 @@ func (c *Classifier) Normalize(in []byte) []byte {
 	case 0:
 		return nil
 	case 1:
-		buf.WriteString(doc.Tokens[0].Text)
+		buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
 		return buf.Bytes()
 	}
 
 	prevLine := 1
-	buf.WriteString(doc.Tokens[0].Text)
+	buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
 	for _, t := range doc.Tokens[1:] {
 		// Only write out an EOL token that incremented the line
 		if t.Line == prevLine+1 {
-			buf.WriteString("\n")
+			buf.WriteString(eol)
 		}
 
 		// Only write tokens that aren't EOL
-		if t.Text != eol {
+		txt := c.dict.getWord(t.ID)
+
+		if txt != eol {
 			// Only put a space between tokens if the previous token was on the same
 			// line. This prevents spaces after an EOL
 			if t.Line == prevLine {
 				buf.WriteString(" ")
 			}
-			buf.WriteString(t.Text)
+			buf.WriteString(txt)
 		}
 
 		prevLine = t.Line
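Note: with Normalize now rebuilding text from dictionary IDs, the documented invariant is easy to exercise from the public API. A minimal sketch, assuming the v2 module import path and a hypothetical threshold value (neither is part of this change):

```go
package main

import (
	"bytes"
	"fmt"

	classifier "github.com/google/licenseclassifier/v2" // assumed import path
)

func main() {
	c := classifier.NewClassifier(0.8) // hypothetical threshold

	in := []byte("The AWESOME Project LICENSE\n\nModifications\nprohibited\n")

	// Normalize tokenizes through the classifier dictionary and rebuilds the
	// text from token IDs, as in the hunk above.
	once := c.Normalize(in)

	// Because Match(Normalize(in)) must equal Match(in), normalizing an
	// already-normalized document should be a fixed point.
	twice := c.Normalize(once)
	fmt.Println(bytes.Equal(once, twice)) // expected: true
}
```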
diff --git a/v2/document.go b/v2/document.go
index 73ccaab..6f3c1b5 100644
--- a/v2/document.go
+++ b/v2/document.go
@@ -30,25 +30,19 @@ type token struct {
 	Previous string // for the first token in a line, any previous text.
 }
 
-// document is the representation of the input text for downstream filtering and matching.
-type document struct {
-	Tokens  []*token // ordered tokens of the document
-	Matches Matches  // these are matches identified while processing the original, untokenized text via regexp matching
-}
-
 type indexedToken struct {
 	Line int     // line position of this token in the source
 	ID   tokenID // identifier of the text in the dictionary
 }
 
 type indexedDocument struct {
+	Norm    string          // The normalized token sequence
 	Tokens  []indexedToken  // ordered tokens of the document
 	Matches Matches         // these are matches identified while processing the original, untokenized text via regexp matching
 	f       *frequencyTable // frequencies computed for this document
 	dict    *dictionary     // The corpus dictionary for this document
 	s       *searchSet      // The searchset for this document
 	runes   []rune
-	norm    string // The normalized token sequence
 }
 
 func (d *indexedDocument) generateSearchSet(q int) {
@@ -101,58 +95,26 @@ func max(a, b int) int {
 // AddContent incorporates the provided textual content into the classifier for
 // matching. This will not modify the supplied content.
 func (c *Classifier) AddContent(category, name, variant string, content []byte) {
-	doc := tokenize(content)
+	doc := tokenize(content, c.dict, true)
 	c.addDocument(category, name, variant, doc)
 }
 
 // addDocument takes a textual document and incorporates it into the classifier for matching.
-func (c *Classifier) addDocument(category, name, variant string, doc *document) {
+func (c *Classifier) addDocument(category, name, variant string, id *indexedDocument) {
 	// For documents that are part of the corpus, we add them to the dictionary and
 	// compute their associated search data eagerly so they are ready for matching against
 	// candidates.
 	indexName := c.generateDocName(category, name, variant)
-	id := c.generateIndexedDocument(doc, true)
-	id.generateFrequencies()
 	id.generateSearchSet(c.q)
 	id.s.origin = indexName
 	c.docs[indexName] = id
 }
 
-// generateIndexedDocument creates an indexedDocument from the supplied document. if addWords
-// is true, the classifier dictionary is updated with new tokens encountered in the document.
-func (c *Classifier) generateIndexedDocument(d *document, addWords bool) *indexedDocument {
-	id := &indexedDocument{
-		Tokens:  make([]indexedToken, 0, len(d.Tokens)),
-		dict:    c.dict,
-		Matches: d.Matches,
-	}
-
-	for _, t := range d.Tokens {
-		var tokID tokenID
-		if addWords {
-			tokID = id.dict.add(t.Text)
-		} else {
-			tokID = id.dict.getIndex(t.Text)
-		}
-
-		id.Tokens = append(id.Tokens, indexedToken{
-			Line: t.Line,
-			ID:   tokID,
-		})
-
-	}
-	id.generateFrequencies()
-	id.runes = diffWordsToRunes(id, 0, id.size())
-	id.norm = id.normalized()
-	return id
-}
-
 // createTargetIndexedDocument creates an indexed document without adding the
 // words to the classifier dictionary. This should be used for matching targets, not
 // populating the corpus.
 func (c *Classifier) createTargetIndexedDocument(in []byte) *indexedDocument {
-	doc := tokenize(in)
-	return c.generateIndexedDocument(doc, false)
+	return tokenize(in, c.dict, false)
 }
 
 func (c *Classifier) generateDocName(category, name, variant string) string {
diff --git a/v2/scoring.go b/v2/scoring.go
index 34dffb5..616ea78 100644
--- a/v2/scoring.go
+++ b/v2/scoring.go
@@ -41,7 +41,7 @@ func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownSt
 	knownLength := known.size()
 	diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength)
 
-	start, end := diffRange(known.norm, diffs)
+	start, end := diffRange(known.Norm, diffs)
 	distance := scoreDiffs(id, diffs[start:end])
 
 	if c.tc.traceScoring(known.s.origin) {
diff --git a/v2/searchset_test.go b/v2/searchset_test.go
index ccaa3c3..accbc15 100644
--- a/v2/searchset_test.go
+++ b/v2/searchset_test.go
@@ -63,7 +63,7 @@ func TestSearchSet_New(t *testing.T) {
 			text: "",
 			q:    4,
 			want: &searchSet{
-				Tokens:         []indexedToken{},
+				Tokens:         nil,
 				Hashes:         make(hash),
 				Checksums:      nil,
 				ChecksumRanges: nil,
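Note: the addDocument/createTargetIndexedDocument split above is what keeps the shared dictionary one-directional: corpus ingestion tokenizes with updateDict=true and adds new words, while target indexing passes updateDict=false so unseen words resolve to the unknown token instead of growing the dictionary. A usage sketch against the public API (import path, threshold, and the category/name/variant strings are illustrative assumptions):

```go
package main

import (
	"fmt"

	classifier "github.com/google/licenseclassifier/v2" // assumed import path
)

func main() {
	c := classifier.NewClassifier(0.8) // hypothetical threshold

	// Corpus ingestion: AddContent tokenizes with updateDict=true, so every
	// token in this text enters the classifier's shared dictionary.
	c.AddContent("License", "Apache-1.1", "log4j",
		[]byte("Redistribution and use in source and binary forms"))

	// Targets are indexed via createTargetIndexedDocument (updateDict=false),
	// so matching never mutates the dictionary. Normalize, by contrast, does
	// update the dictionary, as its hunk above shows.
	fmt.Printf("%s\n", c.Normalize([]byte("Redistribution and use")))
}
```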
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 875cc7e..0d3917e 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -15,366 +15,412 @@ package classifier
 
 import (
+	"bytes"
 	"html"
+	"io"
 	"regexp"
 	"strings"
 	"unicode"
 	"unicode/utf8"
 )
 
-// isSignificant looks for runes that are likely to be the part of English language content
-// of interest in licenses. Notably, it skips over punctuation, looking only for letters
-// or numbers that consistitute the tokens of most interest.
-func isSignificant(r rune) bool {
-	return unicode.IsLetter(r) || unicode.IsDigit(r)
-}
-
 var eol = "\n"
 
-func cleanupToken(in string) string {
-	r, _ := utf8.DecodeRuneInString(in)
-	var out strings.Builder
-	if !unicode.IsLetter(r) {
-		if unicode.IsDigit(r) {
-			// Based on analysis of the license corpus, the characters
-			// that are significant are numbers, periods, and dashes. Anything
-			// else can be safely discarded, and helps avoid matching failures
-			// due to inconsistent whitespacing and formatting.
-			for _, c := range in {
-				if unicode.IsDigit(c) || c == '.' || c == '-' {
-					out.WriteRune(c)
-				}
+func header(in string) bool {
+	if len(in) == 0 {
+		return false
+	}
+	p, e := in[:len(in)-1], in[len(in)-1]
+	switch e {
+	case '.', ':', ')':
+		if listMarker[p] {
+			if e != ')' {
+				return true
 			}
-
-			// Numbers should not end in a . since that doesn't indicate a version
-			// number, but usually an end of a line.
-			res := out.String()
-			for strings.HasSuffix(res, ".") {
-				res = res[0 : len(res)-1]
+		}
+		// Check for patterns like 1.2.3
+		for _, r := range p {
+			if unicode.IsDigit(r) || r == '.' {
+				continue
 			}
-			return res
+			return false
 		}
+		return true
 	}
+	return false
+}
 
-	// Remove internal hyphenization or URL constructs to better normalize
-	// strings for matching.
-	for _, c := range in {
-		if unicode.IsLetter(c) {
-			out.WriteRune(c)
-		}
+var listMarker = func() map[string]bool {
+	const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
+	l := map[string]bool{}
+	for _, marker := range strings.Split(allListMarkers, " ") {
+		l[marker] = true
 	}
-	return out.String()
+	return l
+}()
+
+// ignorableTexts is a list of lines at the start of the string we can remove
+// to get a cleaner match.
+var ignorableTexts = []*regexp.Regexp{
+	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
+	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
+	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
 }
 
-func normalizeDoc(in []byte, normWords bool) (string, Matches) {
-	// Apply the global transforms described in SPDX
+func tokenize(in []byte, dict *dictionary, updateDict bool) *indexedDocument {
+	// Since bytes.NewReader().Read() will never return an error, tokenizeStream
+	// will never return an error so it's okay to ignore the return value in this
+	// case.
+	id, _ := tokenizeStream(bytes.NewReader(in), true, dict, updateDict)
+	return id
+}
 
-	norm := string(in)
-	norm = html.UnescapeString(norm)
-	norm = normalizePunctuation(norm)
-	norm, matches := removeIgnorableTexts(norm)
+// tokenizeStream reads bytes from src and produces an indexedDocument of its
+// content. tokenizeStream will never return an error of its own, it can only
+// return an error from the provided Reader. If the provided Reader never
+// returns an error, it is safe to assume that tokenizeStream will not return an
+// error.
+func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
+	const bufSize = 1024
+	// The longest UTF-8 encoded rune is 4 bytes, so we keep enough leftover bytes
+	// in the buffer to ensure we never run out of bytes trying to finish
+	// constructing a rune. These leftover 4 bytes will be copied to the start of
+	// the buffer before additional bytes are read.
+	tgt := bufSize - 4
 
-	if normWords {
-		norm = normalizeWords(norm)
-	}
-	return norm, matches
-}
+	rbuf := make([]byte, bufSize)
+	obuf := make([]byte, 0)
+	linebuf := make([]tokenID, 0)
+	idx := 0
+	line := 1 // 1-based count
+	deferredEOL := false
+	deferredWord := false
+	// the tokenizer uses a local dictionary to conserve memory while
+	// analyzing the input doc to avoid polluting the global dictionary
+	ld := newDictionary()
 
-func tokenize(in []byte) *document {
-	// tokenize produces a document from the input content.
-	text, matches := normalizeDoc(in, true)
-	return extractDoc(text, true, matches)
-}
+	var doc indexedDocument
 
-func extractDoc(text string, removeEol bool, matches Matches) *document {
-	var doc document
-	doc.Matches = matches
-	// Iterate on a line-by-line basis.
-	i := 0
-	pos := 0
-	for {
-		// Scan the text for the first likely textual content. The scan ignores punctuation
-		// artifacts that include visual boxes for layout as well as comment characters in
-		// source files.
-		firstInLine := true
-		var wid int
-		var r rune
-
-		if pos == len(text) {
-			break
-		}
+	isEOF := func(in error) bool {
+		return in == io.EOF || in == io.ErrUnexpectedEOF
+	}
 
-		next := func() {
-			r, wid = utf8.DecodeRuneInString(text[pos:])
-			pos += wid
+	// Read out the stream in chunks
+	for {
+		// Fill up the buffer with bytes to extract runes from
+		// idx is offset to hold any bytes left over from previous reads
+		n, err := io.ReadFull(src, rbuf[idx:])
+		if isEOF(err) {
+			// There are no more bytes to read, so we must now consume all bytes in the
+			// buffer.
+			tgt = idx + n
+		} else if err != nil {
+			return nil, err
 		}
 
-		for pos < len(text) {
-			start := pos
-			next()
+		for idx = 0; idx < tgt; {
+			r, n := utf8.DecodeRune(rbuf[idx:])
+			idx += n
 
 			if r == '\n' {
-				doc.Tokens = append(doc.Tokens, &token{
-					Text: eol,
-					Line: i + 1})
-				i++
-			}
+				// Deal with carriage return
 
-			if !isSignificant(r) {
+				// If we are in a word (len(obuf) > 0) and the last rune is a -,
+				// strike that rune and keep accumulating.
+				// Otherwise we treat it like a space and
+				// flush the word
+
+				if len(obuf) > 0 {
+					if obuf[len(obuf)-1] == '-' {
+						obuf = obuf[0 : len(obuf)-1]
+						deferredEOL = true
+						continue
+					}
+
+					// Append the word fragment to the line buffer
+					linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+				}
+
+				// If there is something in the line to process, do so now
+				if len(linebuf) > 0 {
+					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+					linebuf = nil
+					obuf = nil
+				}
+				if !normalize {
+					tokID := dict.getIndex(eol)
+					if tokID == unknownIndex {
+						tokID = dict.add(eol)
+					}
+					doc.Tokens = append(doc.Tokens, indexedToken{
+						ID:   tokID,
+						Line: line})
+				}
+				line++
 				continue
 			}
 
-			// We're at a word/number character.
-			for pos < len(text) {
-				next()
-				if unicode.IsSpace(r) {
-					pos -= wid // Will skip this in outer loop
-					break
+			if len(obuf) == 0 {
+				if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
+					// Number or word character starts an interesting word
+					// Now we slurp up all non-space runes and aggregate it as
+					// a single word
+
+					// Buffer the initial token, normalizing to lower case if needed
+					if normalize {
+						r = unicode.ToLower(r)
+					}
+					obuf = utf8.AppendRune(obuf, r)
 				}
+				continue
 			}
 
-			if pos > start {
-				if start >= 2 && text[start-2] == '.' && text[start-1] == ' ' {
-					// Insert a "soft EOL" that helps detect header-looking entries that
-					// follow this text. This resolves problems with licenses that are a
-					// very long line of text, motivated by
-					// https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109
-					//
-					// Don't do this if the previous token was already an EOL
-					if len(doc.Tokens) > 0 && doc.Tokens[len(doc.Tokens)-1].Text != eol {
-						doc.Tokens = append(doc.Tokens, &token{
-							Text: eol,
-							Line: i + 1})
-					}
+			// At this point, len(obuf) > 0 and we are accumulating more runes
+			// to complete a word.
+			if unicode.IsSpace(r) {
+				// If we have a deferred EOL, we need to pick up a non-space character
+				// to resume the hyphenated word, so we just consume spaces until that
+				// happens
+				if deferredEOL {
+					continue
 				}
 
-				tok := token{
-					Text: text[start:pos],
-					Line: i + 1,
+				// This is a space between word characters, so we assemble the word as a
+				// token and flush it out.
+				idx -= n
+
+				linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+				if deferredWord {
+					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+					linebuf = nil
+					deferredWord = false
+					// Increment the line count now so the remainder token is credited
+					// to the previous line number.
+					line++
 				}
-				if firstInLine {
-					// Store the prefix material, it is useful to discern some corner cases
-					tok.Previous = text[0:start]
+				obuf = make([]byte, 0)
+				continue
+			}
+
+			if deferredEOL {
+				deferredEOL = false
+				deferredWord = true
+			}
+			// perform token mappings for punctuation to emulate
+			// normalizePunctuation. this returns a string and each rune needs to be
+			// injected.
+			if rep, found := punctuationMappings[r]; found {
+				for _, t := range rep {
+					obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
 				}
-				doc.Tokens = append(doc.Tokens, &tok)
-				firstInLine = false
+				continue
 			}
+
+			// if it's not punctuation, lowercase and buffer the token
+			obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
+		}
+
+		// Break out if we have consumed all read bytes
+		if isEOF(err) {
+			break
 		}
+
+		// Copy the unconsumed bytes at the end of the buffer to the start
+		// of the buffer so the next read appends after them.
+		n = copy(rbuf, rbuf[idx:])
+		idx = n
+	}
+
+	// Process the remaining bytes in the buffer
+	if len(obuf) > 0 {
+		linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+	}
+	if len(linebuf) > 0 {
+		appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
 	}
 
-	doc.Tokens = cleanupTokens(doc.Tokens, removeEol)
-	return &doc
+	doc.dict = dict
+	doc.generateFrequencies()
+	doc.runes = diffWordsToRunes(&doc, 0, doc.size())
+	doc.Norm = doc.normalized()
+	return &doc, nil
 }
 
-func cleanupTokens(in []*token, removeEol bool) []*token {
-	// This routine performs sanitization of tokens. If it is a header-looking
-	// token (but not a version number) starting a line, it is removed.
-	// Hyphenated words are reassembled.
-	partialWord := ""
-	var out []*token
-	tokIdx := 0
-	firstInLine := true
-	for i, tok := range in {
-		if firstInLine && header(tok) {
+func appendToDoc(doc *indexedDocument, dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool, linebuf []tokenID) {
+	tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
+	if tokens != nil {
+		doc.Tokens = append(doc.Tokens, tokens...)
+	} else if m != nil {
+		doc.Matches = append(doc.Matches, m)
+	}
+}
+
+func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
+	if len(in) == 0 {
+		return nil, nil
+	}
+	var sb strings.Builder
+	for i, r := range in {
+		out := ld.getWord(r)
+		if out == "" {
 			continue
 		}
-		if tok.Text == eol {
-			firstInLine = true
-			if removeEol {
-				continue
-			}
-			// If we are reconstructing a hyphenated word, don't append the EOL
-			// now, do it when the word is reconstructed.
-			if partialWord == "" {
-				out = append(out, &token{Text: eol, Line: tok.Line})
-				tokIdx++
-			}
-			continue
+		sb.WriteString(out)
+		if i < len(in)-1 {
+			sb.WriteByte(' ')
 		}
-		firstInLine = false
-		t := cleanupToken(tok.Text)
-		// If this is the last token in a line, and it looks like a hyphenated
-		// word, store it for reassembly.
-		if strings.HasSuffix(tok.Text, "-") && i+1 < len(in) && in[i+1].Text == eol {
-			partialWord = t
-		} else if partialWord != "" {
-			// Repair hyphenated words
-			tp := in[i-1]
-			tp.Text = partialWord + t
-			tp.Previous = ""
-			out = append(out, tp)
-			tokIdx++
-			if !removeEol {
-				// Append the EOL now that the whole word is recovered
-				out = append(out, &token{Text: eol, Line: tp.Line})
-				tokIdx++
-			}
+	}
 
-			partialWord = ""
-		} else {
-			tok.Text = t
-			tok.Previous = ""
-			out = append(out, tok)
-			tokIdx++
+	out := sb.String()
+
+	for _, re := range ignorableTexts {
+		if re.MatchString(out) {
+			return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
 		}
 	}
-	return out
-}
 
-// interchangeablePunctutation is punctuation that can be normalized.
-var interchangeablePunctuation = []struct {
-	interchangeable string
-	substitute      string
-}{
-	// Hyphen, Dash, En Dash, and Em Dash.
-	{`-‒–—‐`, "-"},
-	// Single, Double, Curly Single, and Curly Double.
-	{"'\"`‘’“”", "'"},
-	// Copyright.
-	{"©", "(c)"},
-	// Currency and Section. (Different copies of the CDDL use each marker.)
-	{"§¤", "(s)"},
-	// Middle Dot
-	{"·", " "},
-	{"*", " "},
-}
-
-// normalizePunctuation takes all hyphens and quotes and normalizes them.
-func normalizePunctuation(s string) string {
-	for _, iw := range interchangeablePunctuation {
-		for _, in := range strings.Split(iw.interchangeable, "") {
-			s = strings.ReplaceAll(s, in, iw.substitute)
+	var tokens []indexedToken
+	for i, r := range in {
+		txt := cleanupToken(i, ld.getWord(r), normalize)
+		if txt != "" {
+			var tokID tokenID
+			if updateDict {
+				tokID = dict.add(txt)
+			} else {
+				tokID = dict.getIndex(txt)
+			}
+			tokens = append(tokens, indexedToken{
+				Line: line,
+				ID:   tokID,
+			})
 		}
 	}
-	return s
+
+	return tokens, nil
 }
 
-// interchangeableWords are words we can substitute for a normalized form
-// without changing the meaning of the license. See
-// https://spdx.org/spdx-license-list/matching-guidelines for the list.
-var interchangeableWords = []struct {
-	interchangeable *regexp.Regexp
-	substitute      string
-}{
-	{regexp.MustCompile("acknowledgement"), "acknowledgment"},
-	{regexp.MustCompile("analogue"), "analog"},
-	{regexp.MustCompile("analyse"), "analyze"},
-	{regexp.MustCompile("artefact"), "artifact"},
-	{regexp.MustCompile("authorisation"), "authorization"},
-	{regexp.MustCompile("authorised"), "authorized"},
-	{regexp.MustCompile("calibre"), "caliber"},
-	{regexp.MustCompile("cancelled"), "canceled"},
-	{regexp.MustCompile("capitalisations"), "capitalizations"},
-	{regexp.MustCompile("catalogue"), "catalog"},
-	{regexp.MustCompile("categorise"), "categorize"},
-	{regexp.MustCompile("centre"), "center"},
-	{regexp.MustCompile("emphasised"), "emphasized"},
-	{regexp.MustCompile("favour"), "favor"},
-	{regexp.MustCompile("favourite"), "favorite"},
-	{regexp.MustCompile("fulfil\\b"), "fulfill"},
-	{regexp.MustCompile("fulfilment"), "fulfillment"},
-	{regexp.MustCompile("https"), "http"},
-	{regexp.MustCompile("initialise"), "initialize"},
-	{regexp.MustCompile("judgment"), "judgement"},
-	{regexp.MustCompile("labelling"), "labeling"},
-	{regexp.MustCompile("labour"), "labor"},
-	{regexp.MustCompile("licence"), "license"},
-	{regexp.MustCompile("maximise"), "maximize"},
-	{regexp.MustCompile("modelled"), "modeled"},
-	{regexp.MustCompile("modelling"), "modeling"},
-	{regexp.MustCompile("offence"), "offense"},
-	{regexp.MustCompile("optimise"), "optimize"},
-	{regexp.MustCompile("organisation"), "organization"},
-	{regexp.MustCompile("organise"), "organize"},
-	{regexp.MustCompile("practise"), "practice"},
-	{regexp.MustCompile("programme"), "program"},
-	{regexp.MustCompile("realise"), "realize"},
-	{regexp.MustCompile("recognise"), "recognize"},
-	{regexp.MustCompile("signalling"), "signaling"},
-	{regexp.MustCompile("sub[ -]license"), "sublicense"},
-	{regexp.MustCompile("utilisation"), "utilization"},
-	{regexp.MustCompile("whilst"), "while"},
-	{regexp.MustCompile("wilful"), "wilfull"},
-	{regexp.MustCompile("non[ -]commercial"), "noncommercial"},
-	{regexp.MustCompile("per cent"), "percent"},
+func normalizeToken(in string) string {
+	// This performs some preprocessing on the token.
+	// This is different than cleanupToken in that fixups here
+	// are not exact match on the token.
+	// Normalizing URLs from https to http is an example of a fix applied
+	// here.
+	return strings.ReplaceAll(in, "https", "http")
 }
 
-// normalizeWords remaps equivalent words that are interchangeable and lowercases
-// the word to allow for exact matching.
-func normalizeWords(s string) string {
-	s = strings.ToLower(s)
-	for _, iw := range interchangeableWords {
-		s = iw.interchangeable.ReplaceAllString(s, iw.substitute)
-	}
-	return s
+func flushBuf(pos int, obuf []byte, normalizeWord bool, ld *dictionary) tokenID {
+	// clean up the contents of the rune buffer
+	token := string(obuf)
+	// escape sequences can occur anywhere in the string, not just the beginning
+	// so always attempt to unescape the word's content.
+	token = html.UnescapeString(token)
+
+	clean := normalizeToken(token)
+
+	return ld.add(clean)
 }
 
-func header(tok *token) bool {
-	in := tok.Text
-	p, e := in[:len(in)-1], in[len(in)-1]
-	switch e {
-	case '.', ':', ')':
-		if listMarker[p] {
-			if e != ')' {
-				return true
+func cleanupToken(pos int, in string, normalizeWord bool) string {
+	r, _ := utf8.DecodeRuneInString(in)
+	var out strings.Builder
+	if pos == 0 && header(in) {
+		return ""
+	}
+
+	if !unicode.IsLetter(r) {
+		if unicode.IsDigit(r) {
+			// Based on analysis of the license corpus, the characters that are
+			// significant are numbers, periods, and dashes. Anything else can be
+			// safely discarded, and helps avoid matching failures due to inconsistent
+			// whitespacing and formatting.
+			for _, c := range in {
+				if unicode.IsDigit(c) || c == '.' || c == '-' {
+					out.WriteRune(c)
+				}
 			}
-			// Sometimes an internal reference like "(ii)" from NPL-1.02.txt
-			// endds up at the beginning of a line. In that case, it's
-			// not actually a header.
-			if e == ')' && !strings.HasSuffix(tok.Previous, "(") {
-				return true
+
+			// Numbers should not end in a . since that doesn't indicate a version
+			// number, but usually an end of a line.
+			res := out.String()
+			for strings.HasSuffix(res, ".") {
+				res = res[0 : len(res)-1]
 			}
+			return res
 		}
-		// Check for patterns like 1.2.3
-		for _, r := range p {
-			if unicode.IsDigit(r) || r == '.' {
-				continue
-			}
-			return false
+	}
+
+	// Remove internal hyphenization or URL constructs to better normalize strings
+	// for matching.
+
+	for _, c := range in {
+		if unicode.IsLetter(c) {
+			out.WriteRune(c)
 		}
-		return true
 	}
-	return false
-}
 
-var listMarker = func() map[string]bool {
-	const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
-	l := map[string]bool{}
-	for _, marker := range strings.Split(allListMarkers, " ") {
-		l[marker] = true
+	tok := out.String()
+	if !normalizeWord {
+		return tok
 	}
-	return l
-}()
 
-// ignorableTexts is a list of lines at the start of the string we can remove
-// to get a cleaner match.
-var ignorableTexts = []*regexp.Regexp{
-	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
-	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
-	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
+	if iw, ok := interchangeableWords[tok]; ok && normalizeWord {
+		return iw
+	}
+	return tok
 }
 
-// removeIgnorableTexts removes common text, which is not important for
-// classification
-func removeIgnorableTexts(s string) (string, Matches) {
-	var out []string
-	var matches Matches
-	lines := strings.Split(s, "\n")
-	for i, l := range lines {
-		line := strings.TrimSpace(l)
-		var match bool
-		for _, re := range ignorableTexts {
-			if re.MatchString(line) {
-				match = true
-			}
-		}
-		if !match {
-			out = append(out, l)
-		} else {
-			// We want to preserve line presence for the positional information
-			out = append(out, "")
-			matches = append(matches, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: i + 1, EndLine: i + 1})
-		}
-	}
-	return strings.Join(out, "\n"), matches
+var interchangeableWords = map[string]string{
+	"analyse":         "analyze",
+	"artefact":        "artifact",
+	"authorisation":   "authorization",
+	"authorised":      "authorized",
+	"calibre":         "caliber",
+	"cancelled":       "canceled",
+	"capitalisations": "capitalizations",
+	"catalogue":       "catalog",
+	"categorise":      "categorize",
+	"centre":          "center",
+	"emphasised":      "emphasized",
+	"favour":          "favor",
+	"favourite":       "favorite",
+	"fulfil":          "fulfill",
+	"fulfilment":      "fulfillment",
+	"https":           "http",
+	"initialise":      "initialize",
+	"judgment":        "judgement",
+	"labelling":       "labeling",
+	"labour":          "labor",
+	"licence":         "license",
+	"maximise":        "maximize",
+	"modelled":        "modeled",
+	"modelling":       "modeling",
+	"offence":         "offense",
+	"optimise":        "optimize",
+	"organisation":    "organization",
+	"organise":        "organize",
+	"practise":        "practice",
+	"programme":       "program",
+	"realise":         "realize",
+	"recognise":       "recognize",
+	"signalling":      "signaling",
+	"utilisation":     "utilization",
+	"whilst":          "while",
+	"wilful":          "wilfull",
+	// TODO: These three need tokenizer magic
+	"non commercial": "noncommercial",
+	"per cent":       "percent",
+	"sub license":    "sublicense",
+}
+
+var punctuationMappings = map[rune]string{
+	'-': "-",
+	'‒': "-",
+	'–': "-",
+	'—': "-",
+	'‐': "-",
+	'©': "(c)",
+	'§': "(s)",
+	'¤': "(s)",
+	'·': " ",
+	'*': " ",
 }
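Note: the read loop in tokenizeStream is the core of this change: a fixed 1024-byte buffer, with utf8.UTFMax (4) leftover bytes slid to the front so a rune split across two reads still decodes intact. A self-contained sketch of the same pattern, shrunk to a 16-byte buffer to force splits (this is the technique only, not the classifier's code):

```go
package main

import (
	"fmt"
	"io"
	"strings"
	"unicode/utf8"
)

func main() {
	const bufSize = 16 // the tokenizer uses 1024; tiny here to force splits
	src := strings.NewReader("héllo wörld — ©1999")

	rbuf := make([]byte, bufSize)
	tgt := bufSize - utf8.UTFMax // reserve room for a rune split across reads
	idx := 0
	var out []rune

	for {
		n, err := io.ReadFull(src, rbuf[idx:])
		eof := err == io.EOF || err == io.ErrUnexpectedEOF
		if eof {
			tgt = idx + n // final pass: consume everything that is left
		} else if err != nil {
			panic(err)
		}

		// Decode up to tgt; a rune that starts just before tgt may safely
		// finish in the reserved tail, which ReadFull has already filled.
		for idx = 0; idx < tgt; {
			r, w := utf8.DecodeRune(rbuf[idx:])
			idx += w
			out = append(out, r)
		}

		if eof {
			break
		}
		// Slide the unconsumed tail (at most utf8.UTFMax bytes) to the front.
		idx = copy(rbuf, rbuf[idx:])
	}

	fmt.Println(string(out)) // héllo wörld — ©1999, reassembled across reads
}
```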
diff --git a/v2/tokenizer_test.go b/v2/tokenizer_test.go
index 662685c..6ddab4c 100644
--- a/v2/tokenizer_test.go
+++ b/v2/tokenizer_test.go
@@ -15,6 +15,7 @@ package classifier
 
 import (
+	"io"
 	"strings"
 	"testing"
 
@@ -56,7 +57,7 @@ func TestCleanupToken(t *testing.T) {
 		},
 	}
 	for _, test := range tests {
-		if got := cleanupToken(test.input); got != test.output {
+		if got := cleanupToken(0, test.input, true); got != test.output {
 			t.Errorf("%q: got %q want %q", test.input, got, test.output)
 		}
 	}
@@ -66,8 +67,21 @@ func TestTokenize(t *testing.T) {
 	tests := []struct {
 		name   string
 		input  string
-		output *document
+		output *indexedDocument
 	}{
+		{name: "hyphenization recovery",
+			input: `basket-
+ball`,
+			output: &indexedDocument{
+				Tokens: []indexedToken{
+					{
+						ID:   1,
+						Line: 1,
+					},
+				},
+				Norm: "basketball",
+			},
+		},
 		{name: "basic scenario",
 			input: `The AWESOME Project LICENSE
 
@@ -80,63 +94,112 @@ Copyright 1996-2002, 2006 by A. Developer
 
 Introduction
 
 The AWESOME Project`,
-			output: &document{
-				Tokens: []*token{
+			output: &indexedDocument{
+				Tokens: []indexedToken{
 					{
-						Text: "the",
+						ID:   1,
 						Line: 1,
 					},
 					{
-						Text: "awesome",
+						ID:   2,
 						Line: 1,
 					},
 					{
-						Text: "project",
+						ID:   3,
 						Line: 1,
 					},
 					{
-						Text: "license",
+						ID:   4,
 						Line: 1,
 					},
 					{
-						Text: "modifications",
+						ID:   5,
 						Line: 3,
 					},
 					{
-						Text: "prohibited",
+						ID:   6,
 						Line: 4,
 					},
 					{
-						Text: "introduction",
+						ID:   7,
 						Line: 8,
 					},
 					{
-						Text: "the",
+						ID:   1,
 						Line: 10,
 					},
 					{
-						Text: "awesome",
+						ID:   2,
 						Line: 10,
 					},
 					{
-						Text: "project",
+						ID:   3,
 						Line: 10,
 					},
 				},
 				Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}},
+				Norm:    "the awesome project license modifications prohibited introduction the awesome project",
 			},
 		},
 	}
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			d := tokenize([]byte(test.input))
-			if !cmp.Equal(d, test.output, cmpopts.IgnoreUnexported(document{})) {
-				t.Errorf("%s failed: %s", test.name, cmp.Diff(d, test.output))
+			d := tokenize([]byte(test.input), newDictionary(), true)
+			if diff := cmp.Diff(d, test.output, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" {
+				t.Errorf("%s failed:\nDiff(+got,-want): %s", test.name, diff)
 			}
 		})
 	}
 }
 
+type mockReader struct {
+	t        *testing.T
+	schedule []int
+	cur      int
+}
+
+func (m *mockReader) Read(buf []byte) (int, error) {
+	if m.cur > len(m.schedule) {
+		m.t.Fatal("Unexpected read on mock")
+	}
+
+	if m.cur == len(m.schedule) {
+		return 0, io.EOF
+	}
+
+	if len(buf) != m.schedule[m.cur] {
+		m.t.Fatalf("step %d: got %d, want %d", m.cur, len(buf), m.schedule[m.cur])
+	}
+	m.cur++
+
+	for i := range buf {
+		buf[i] = 'a'
+	}
+
+	return len(buf), nil
+}
+
+func TestTokenizerBuffering(t *testing.T) {
+	dict := newDictionary()
+	mr := mockReader{
+		t:        t,
+		schedule: []int{1024, 1020, 1020},
+	}
+	d, err := tokenizeStream(&mr, true, dict, true)
+	if err != nil {
+		t.Errorf("Read returned unexpected error: %v", err)
+	}
+
+	// Do a basic test to make sure the data returned is sound
+	if len(d.Tokens) != 1 {
+		t.Errorf("Got %d tokens, expected 1", len(d.Tokens))
+	}
+
+	if len(d.Norm) != 3064 {
+		t.Errorf("Got %d bytes, expected 3064", len(d.Norm))
+	}
+}
+
 func TestTokenizer(t *testing.T) {
 	// This test focuses primarily on the textual content extracted and does not look
 	// at the other parts of the document.
@@ -229,10 +292,11 @@ The FreeType Project`,
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			d := tokenize([]byte(test.input))
+			dict := newDictionary()
+			d := tokenize([]byte(test.input), dict, true)
 			var b strings.Builder
 			for _, tok := range d.Tokens {
-				b.WriteString(tok.Text)
+				b.WriteString(dict.getWord(tok.ID))
 				b.WriteString(" ")
 			}
 			actual := strings.TrimSpace(b.String())
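Note: the mock schedule {1024, 1020, 1020} in TestTokenizerBuffering follows from the reserved tail: the first ReadFull sees the whole 1024-byte buffer, and each later read refills all but the 4 carried-over bytes, so the stream totals 1024 + 1020 + 1020 = 3064 bytes of 'a', which is exactly the Norm length the test asserts. The hyphenization-recovery case is also observable through the public API; a sketch, assuming the v2 import path and a hypothetical threshold:

```go
package main

import (
	"fmt"

	classifier "github.com/google/licenseclassifier/v2" // assumed import path
)

func main() {
	c := classifier.NewClassifier(0.8) // hypothetical threshold

	// A word hyphenated across a line break is reassembled by the deferredEOL
	// path, mirroring the "hyphenization recovery" test case above.
	fmt.Printf("%s\n", c.Normalize([]byte("basket-\nball"))) // expected: basketball
}
```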