Diffstat (limited to 'v2/tokenizer.go')
-rw-r--r-- | v2/tokenizer.go | 634
1 file changed, 340 insertions, 294 deletions
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 875cc7e..0d3917e 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -15,366 +15,412 @@ package classifier
 
 import (
+	"bytes"
 	"html"
+	"io"
 	"regexp"
 	"strings"
 	"unicode"
 	"unicode/utf8"
 )
 
-// isSignificant looks for runes that are likely to be the part of English language content
-// of interest in licenses. Notably, it skips over punctuation, looking only for letters
-// or numbers that consistitute the tokens of most interest.
-func isSignificant(r rune) bool {
-	return unicode.IsLetter(r) || unicode.IsDigit(r)
-}
-
 var eol = "\n"
 
-func cleanupToken(in string) string {
-	r, _ := utf8.DecodeRuneInString(in)
-	var out strings.Builder
-	if !unicode.IsLetter(r) {
-		if unicode.IsDigit(r) {
-			// Based on analysis of the license corpus, the characters
-			// that are significant are numbers, periods, and dashes. Anything
-			// else can be safely discarded, and helps avoid matching failures
-			// due to inconsistent whitespacing and formatting.
-			for _, c := range in {
-				if unicode.IsDigit(c) || c == '.' || c == '-' {
-					out.WriteRune(c)
-				}
+func header(in string) bool {
+	if len(in) == 0 {
+		return false
+	}
+	p, e := in[:len(in)-1], in[len(in)-1]
+	switch e {
+	case '.', ':', ')':
+		if listMarker[p] {
+			if e != ')' {
+				return true
 			}
-
-			// Numbers should not end in a . since that doesn't indicate a version
-			// number, but usually an end of a line.
-			res := out.String()
-			for strings.HasSuffix(res, ".") {
-				res = res[0 : len(res)-1]
+		}
+		// Check for patterns like 1.2.3
+		for _, r := range p {
+			if unicode.IsDigit(r) || r == '.' {
+				continue
 			}
-			return res
+			return false
 		}
+		return true
 	}
+	return false
+}
 
-	// Remove internal hyphenization or URL constructs to better normalize
-	// strings for matching.
-	for _, c := range in {
-		if unicode.IsLetter(c) {
-			out.WriteRune(c)
-		}
+var listMarker = func() map[string]bool {
+	const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
+	l := map[string]bool{}
+	for _, marker := range strings.Split(allListMarkers, " ") {
+		l[marker] = true
 	}
-	return out.String()
+	return l
+}()
+
+// ignorableTexts is a list of lines at the start of the string we can remove
+// to get a cleaner match.
+var ignorableTexts = []*regexp.Regexp{
+	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
+	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
+	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
 }
 
-func normalizeDoc(in []byte, normWords bool) (string, Matches) {
-	// Apply the global transforms described in SPDX
+func tokenize(in []byte, dict *dictionary, updateDict bool) *indexedDocument {
+	// Since bytes.NewReader().Read() will never return an error, tokenizeStream
+	// will never return an error, so it's okay to ignore the return value in
+	// this case.
+	id, _ := tokenizeStream(bytes.NewReader(in), true, dict, updateDict)
+	return id
+}
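The new header predicate and listMarker table above drive the removal of list markers such as "1.", "a:", or "iv." from the start of a line. Here is a minimal, self-contained sketch of that heuristic; isHeader is a hypothetical standalone rename of the unexported header function above, with the same logic:

```go
package main

import (
	"fmt"
	"strings"
	"unicode"
)

var listMarker = func() map[string]bool {
	const all = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
	l := map[string]bool{}
	for _, m := range strings.Split(all, " ") {
		l[m] = true
	}
	return l
}()

// isHeader mirrors the header predicate in this change: a token is a list
// header when it is a known list marker followed by '.' or ':', or when it
// is a dotted number such as "1.2.3" followed by '.', ':', or ')'.
func isHeader(in string) bool {
	if len(in) == 0 {
		return false
	}
	p, e := in[:len(in)-1], in[len(in)-1]
	switch e {
	case '.', ':', ')':
		if listMarker[p] && e != ')' {
			return true
		}
		// Check for patterns like 1.2.3
		for _, r := range p {
			if unicode.IsDigit(r) || r == '.' {
				continue
			}
			return false
		}
		return true
	}
	return false
}

func main() {
	for _, tok := range []string{"1.", "iv.", "a:", "1.2.3:", "ii)", "copyright"} {
		fmt.Printf("%-10q header=%v\n", tok, isHeader(tok))
	}
}
```

Note that "ii)" is rejected: a marker ending in ')' falls through to the digit check, which matches the old code's concern about internal references like "(ii)" landing at the start of a line.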
 
-	norm := string(in)
-	norm = html.UnescapeString(norm)
-	norm = normalizePunctuation(norm)
-	norm, matches := removeIgnorableTexts(norm)
+// tokenizeStream reads bytes from src and produces an indexedDocument of its
+// content. tokenizeStream will never return an error of its own; it can only
+// return an error from the provided Reader. If the provided Reader never
+// returns an error, it is safe to assume that tokenizeStream will not return
+// an error.
+func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
+	const bufSize = 1024
+	// The longest UTF-8 encoded rune is 4 bytes, so we keep enough leftover bytes
+	// in the buffer to ensure we never run out of bytes trying to finish
+	// constructing a rune. These leftover 4 bytes will be copied to the start of
+	// the buffer before additional bytes are read.
+	tgt := bufSize - 4
 
-	if normWords {
-		norm = normalizeWords(norm)
-	}
-	return norm, matches
-}
+	rbuf := make([]byte, bufSize)
+	obuf := make([]byte, 0)
+	linebuf := make([]tokenID, 0)
+	idx := 0
+	line := 1 // 1-based count
+	deferredEOL := false
+	deferredWord := false
+	// The tokenizer uses a local dictionary to conserve memory while analyzing
+	// the input doc, to avoid polluting the global dictionary.
+	ld := newDictionary()
 
-func tokenize(in []byte) *document {
-	// tokenize produces a document from the input content.
-	text, matches := normalizeDoc(in, true)
-	return extractDoc(text, true, matches)
-}
+	var doc indexedDocument
 
-func extractDoc(text string, removeEol bool, matches Matches) *document {
-	var doc document
-	doc.Matches = matches
-	// Iterate on a line-by-line basis.
-	i := 0
-	pos := 0
-	for {
-		// Scan the text for the first likely textual content. The scan ignores punctuation
-		// artifacts that include visual boxes for layout as well as comment characters in
-		// source files.
-		firstInLine := true
-		var wid int
-		var r rune
-
-		if pos == len(text) {
-			break
-		}
+	isEOF := func(in error) bool {
+		return in == io.EOF || in == io.ErrUnexpectedEOF
+	}
 
-		next := func() {
-			r, wid = utf8.DecodeRuneInString(text[pos:])
-			pos += wid
+	// Read out the stream in chunks
+	for {
+		// Fill up the buffer with bytes to extract runes from.
+		// idx is the offset that holds any bytes left over from previous reads.
+		n, err := io.ReadFull(src, rbuf[idx:])
+		if isEOF(err) {
+			// There are no more bytes to read, so we must now consume all bytes
+			// in the buffer.
+			tgt = idx + n
+		} else if err != nil {
+			return nil, err
 		}
 
-		for pos < len(text) {
-			start := pos
-			next()
+		for idx = 0; idx < tgt; {
+			r, n := utf8.DecodeRune(rbuf[idx:])
+			idx += n
 
 			if r == '\n' {
-				doc.Tokens = append(doc.Tokens, &token{
-					Text: eol,
-					Line: i + 1})
-				i++
-			}
+				// Deal with the end of a line.
 
-			if !isSignificant(r) {
+				// If we are in a word (len(obuf) > 0) and the last rune is a '-',
+				// strike that rune and keep accumulating. Otherwise we treat it
+				// like a space and flush the word.
+
+				if len(obuf) > 0 {
+					if obuf[len(obuf)-1] == '-' {
+						obuf = obuf[0 : len(obuf)-1]
+						deferredEOL = true
+						continue
+					}
+
+					// Append the word fragment to the line buffer
+					linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+				}
+
+				// If there is something in the line to process, do so now
+				if len(linebuf) > 0 {
+					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+					linebuf = nil
+					obuf = nil
+				}
+				if !normalize {
+					tokID := dict.getIndex(eol)
+					if tokID == unknownIndex {
+						tokID = dict.add(eol)
+					}
+					doc.Tokens = append(doc.Tokens, indexedToken{
+						ID:   tokID,
+						Line: line})
+				}
+				line++
 				continue
 			}
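The branch above sets deferredEOL when a word ends in '-' at a line break, so a word wrapped across lines is stitched back together. A simplified illustration of that idea on its own, outside the streaming loop (joinHyphenated is a hypothetical helper, not part of this package):

```go
package main

import (
	"fmt"
	"strings"
)

// joinHyphenated strikes a trailing '-' at the end of a line and appends the
// next word fragment to it, mirroring the deferredEOL behavior above.
func joinHyphenated(lines []string) []string {
	var tokens []string
	carry := ""
	for _, line := range lines {
		words := strings.Fields(line)
		for i, w := range words {
			if carry != "" {
				w = carry + w // resume the hyphenated word
				carry = ""
			}
			if i == len(words)-1 && strings.HasSuffix(w, "-") {
				carry = strings.TrimSuffix(w, "-") // strike the hyphen, defer the word
				continue
			}
			tokens = append(tokens, w)
		}
	}
	if carry != "" {
		tokens = append(tokens, carry)
	}
	return tokens
}

func main() {
	fmt.Println(joinHyphenated([]string{"permission is here-", "by granted"}))
	// Output: [permission is hereby granted]
}
```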
-			// We're at a word/number character.
-			for pos < len(text) {
-				next()
-				if unicode.IsSpace(r) {
-					pos -= wid // Will skip this in outer loop
-					break
+			if len(obuf) == 0 {
+				if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
+					// A number or word character starts an interesting word.
+					// Now we slurp up all non-space runes and aggregate them as
+					// a single word.
+
+					// Buffer the initial token, normalizing to lower case if needed
+					if normalize {
+						r = unicode.ToLower(r)
+					}
+					obuf = utf8.AppendRune(obuf, r)
 				}
+				continue
 			}
 
-			if pos > start {
-				if start >= 2 && text[start-2] == '.' && text[start-1] == ' ' {
-					// Insert a "soft EOL" that helps detect header-looking entries that
-					// follow this text. This resolves problems with licenses that are a
-					// very long line of text, motivated by
-					// https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109
-					//
-					// Don't do this if the previous token was already an EOL
-					if len(doc.Tokens) > 0 && doc.Tokens[len(doc.Tokens)-1].Text != eol {
-						doc.Tokens = append(doc.Tokens, &token{
-							Text: eol,
-							Line: i + 1})
-					}
+			// At this point, len(obuf) > 0 and we are accumulating more runes
+			// to complete a word.
+			if unicode.IsSpace(r) {
+				// If we have a deferred EOL, we need to pick up a non-space
+				// character to resume the hyphenated word, so we just consume
+				// spaces until that happens.
+				if deferredEOL {
+					continue
 				}
 
-				tok := token{
-					Text: text[start:pos],
-					Line: i + 1,
+				// This is a space between word characters, so we assemble the
+				// word as a token and flush it out.
+				idx -= n
+
+				linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+				if deferredWord {
+					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+					linebuf = nil
+					deferredWord = false
+					// Increment the line count now so the remainder token is
+					// credited to the previous line number.
+					line++
 				}
-				if firstInLine {
-					// Store the prefix material, it is useful to discern some corner cases
-					tok.Previous = text[0:start]
+				obuf = make([]byte, 0)
+				continue
+			}
+
+			if deferredEOL {
+				deferredEOL = false
+				deferredWord = true
+			}
+			// Perform token mappings for punctuation to emulate
+			// normalizePunctuation. This returns a string, and each rune needs
+			// to be injected.
+			if rep, found := punctuationMappings[r]; found {
+				for _, t := range rep {
+					obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
 				}
-				doc.Tokens = append(doc.Tokens, &tok)
-				firstInLine = false
+				continue
 			}
+
+			// If it's not punctuation, lowercase and buffer the rune.
+			obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
+		}
+
+		// Break out if we have consumed all read bytes
+		if isEOF(err) {
+			break
 		}
+
+		// Copy the unconsumed bytes at the end of the buffer to the start
+		// of the buffer so the next read appends after them.
+		n = copy(rbuf, rbuf[idx:])
+		idx = n
+	}
+
+	// Process the remaining bytes in the buffer
+	if len(obuf) > 0 {
+		linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+	}
+	if len(linebuf) > 0 {
+		appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
 	}
 
-	doc.Tokens = cleanupTokens(doc.Tokens, removeEol)
-	return &doc
+	doc.dict = dict
+	doc.generateFrequencies()
+	doc.runes = diffWordsToRunes(&doc, 0, doc.size())
+	doc.Norm = doc.normalized()
+	return &doc, nil
 }
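The core buffering pattern in tokenizeStream — read fixed-size chunks with io.ReadFull, decode only up to four bytes short of the buffer end so a multi-byte rune split across reads can be completed, and copy the leftover tail to the front before the next read — can be exercised on its own. A self-contained sketch of that pattern follows; decodeStream is a hypothetical name, and the buffer is shrunk to 16 bytes (the real code uses 1024) so the refill path actually runs on short inputs:

```go
package main

import (
	"fmt"
	"io"
	"strings"
	"unicode/utf8"
)

// decodeStream decodes runes from src without ever splitting a multi-byte
// UTF-8 sequence across reads: it decodes up to len(rbuf)-4 bytes, then
// copies the unconsumed tail to the front of the buffer before refilling.
func decodeStream(src io.Reader) ([]rune, error) {
	const bufSize = 16 // deliberately tiny; the change above uses 1024
	rbuf := make([]byte, bufSize)
	tgt := bufSize - 4
	idx := 0
	var runes []rune
	for {
		n, err := io.ReadFull(src, rbuf[idx:])
		eof := err == io.EOF || err == io.ErrUnexpectedEOF
		if eof {
			tgt = idx + n // no more input: consume everything left in the buffer
		} else if err != nil {
			return nil, err
		}
		for idx = 0; idx < tgt; {
			r, w := utf8.DecodeRune(rbuf[idx:])
			idx += w
			runes = append(runes, r)
		}
		if eof {
			return runes, nil
		}
		idx = copy(rbuf, rbuf[idx:]) // keep leftover bytes for the next read
	}
}

func main() {
	runes, _ := decodeStream(strings.NewReader("naïve — déjà vu ©"))
	fmt.Println(string(runes))
}
```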
-func cleanupTokens(in []*token, removeEol bool) []*token {
-	// This routine performs sanitization of tokens. If it is a header-looking
-	// token (but not a version number) starting a line, it is removed.
-	// Hyphenated words are reassembled.
-	partialWord := ""
-	var out []*token
-	tokIdx := 0
-	firstInLine := true
-	for i, tok := range in {
-		if firstInLine && header(tok) {
+func appendToDoc(doc *indexedDocument, dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool, linebuf []tokenID) {
+	tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
+	if tokens != nil {
+		doc.Tokens = append(doc.Tokens, tokens...)
+	} else if m != nil {
+		doc.Matches = append(doc.Matches, m)
+	}
+}
+
+func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
+	if len(in) == 0 {
+		return nil, nil
+	}
+	var sb strings.Builder
+	for i, r := range in {
+		out := ld.getWord(r)
+		if out == "" {
 			continue
 		}
-		if tok.Text == eol {
-			firstInLine = true
-			if removeEol {
-				continue
-			}
-			// If we are reconstructing a hyphenated word, don't append the EOL
-			// now, do it when the word is reconstructed.
-			if partialWord == "" {
-				out = append(out, &token{Text: eol, Line: tok.Line})
-				tokIdx++
-			}
-			continue
+		sb.WriteString(out)
+		if i < len(in)-1 {
+			sb.WriteByte(' ')
 		}
-		firstInLine = false
-		t := cleanupToken(tok.Text)
-		// If this is the last token in a line, and it looks like a hyphenated
-		// word, store it for reassembly.
-		if strings.HasSuffix(tok.Text, "-") && i+1 < len(in) && in[i+1].Text == eol {
-			partialWord = t
-		} else if partialWord != "" {
-			// Repair hyphenated words
-			tp := in[i-1]
-			tp.Text = partialWord + t
-			tp.Previous = ""
-			out = append(out, tp)
-			tokIdx++
-			if !removeEol {
-				// Append the EOL now that the whole word is recovered
-				out = append(out, &token{Text: eol, Line: tp.Line})
-				tokIdx++
-			}
+	}
 
-			partialWord = ""
-		} else {
-			tok.Text = t
-			tok.Previous = ""
-			out = append(out, tok)
-			tokIdx++
+	out := sb.String()
+
+	for _, re := range ignorableTexts {
+		if re.MatchString(out) {
+			return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
 		}
 	}
-	return out
-}
 
-// interchangeablePunctutation is punctuation that can be normalized.
-var interchangeablePunctuation = []struct {
-	interchangeable string
-	substitute      string
-}{
-	// Hyphen, Dash, En Dash, and Em Dash.
-	{`-‒–—‐`, "-"},
-	// Single, Double, Curly Single, and Curly Double.
-	{"'\"`‘’“”", "'"},
-	// Copyright.
-	{"©", "(c)"},
-	// Currency and Section. (Different copies of the CDDL use each marker.)
-	{"§¤", "(s)"},
-	// Middle Dot
-	{"·", " "},
-	{"*", " "},
-}
-
-// normalizePunctuation takes all hyphens and quotes and normalizes them.
-func normalizePunctuation(s string) string {
-	for _, iw := range interchangeablePunctuation {
-		for _, in := range strings.Split(iw.interchangeable, "") {
-			s = strings.ReplaceAll(s, in, iw.substitute)
+	var tokens []indexedToken
+	for i, r := range in {
+		txt := cleanupToken(i, ld.getWord(r), normalize)
+		if txt != "" {
+			var tokID tokenID
+			if updateDict {
+				tokID = dict.add(txt)
+			} else {
+				tokID = dict.getIndex(txt)
+			}
+			tokens = append(tokens, indexedToken{
+				Line: line,
+				ID:   tokID,
+			})
 		}
 	}
-	return s
+
+	return tokens, nil
 }
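stringifyLineBuf assembles the whole (already lowercased and normalized) line and, if it matches one of the ignorableTexts patterns, emits a Copyright match instead of tokens. The patterns can be tried in isolation; this sketch copies the three regexps from above into a standalone program with a hypothetical ignorable helper:

```go
package main

import (
	"fmt"
	"regexp"
)

// The same patterns the tokenizer uses to treat a line as a copyright
// notice rather than license text.
var ignorableTexts = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}

func ignorable(line string) bool {
	for _, re := range ignorableTexts {
		if re.MatchString(line) {
			return true
		}
	}
	return false
}

func main() {
	for _, l := range []string{
		"copyright 2021 the authors",
		"copyright (c) [yyyy] [name of copyright owner]",
		"2021-03-15",
		"licensed under the apache license",
	} {
		fmt.Printf("%-48q ignorable=%v\n", l, ignorable(l))
	}
}
```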
-// interchangeableWords are words we can substitute for a normalized form
-// without changing the meaning of the license. See
-// https://spdx.org/spdx-license-list/matching-guidelines for the list.
-var interchangeableWords = []struct {
-	interchangeable *regexp.Regexp
-	substitute      string
-}{
-	{regexp.MustCompile("acknowledgement"), "acknowledgment"},
-	{regexp.MustCompile("analogue"), "analog"},
-	{regexp.MustCompile("analyse"), "analyze"},
-	{regexp.MustCompile("artefact"), "artifact"},
-	{regexp.MustCompile("authorisation"), "authorization"},
-	{regexp.MustCompile("authorised"), "authorized"},
-	{regexp.MustCompile("calibre"), "caliber"},
-	{regexp.MustCompile("cancelled"), "canceled"},
-	{regexp.MustCompile("capitalisations"), "capitalizations"},
-	{regexp.MustCompile("catalogue"), "catalog"},
-	{regexp.MustCompile("categorise"), "categorize"},
-	{regexp.MustCompile("centre"), "center"},
-	{regexp.MustCompile("emphasised"), "emphasized"},
-	{regexp.MustCompile("favour"), "favor"},
-	{regexp.MustCompile("favourite"), "favorite"},
-	{regexp.MustCompile("fulfil\\b"), "fulfill"},
-	{regexp.MustCompile("fulfilment"), "fulfillment"},
-	{regexp.MustCompile("https"), "http"},
-	{regexp.MustCompile("initialise"), "initialize"},
-	{regexp.MustCompile("judgment"), "judgement"},
-	{regexp.MustCompile("labelling"), "labeling"},
-	{regexp.MustCompile("labour"), "labor"},
-	{regexp.MustCompile("licence"), "license"},
-	{regexp.MustCompile("maximise"), "maximize"},
-	{regexp.MustCompile("modelled"), "modeled"},
-	{regexp.MustCompile("modelling"), "modeling"},
-	{regexp.MustCompile("offence"), "offense"},
-	{regexp.MustCompile("optimise"), "optimize"},
-	{regexp.MustCompile("organisation"), "organization"},
-	{regexp.MustCompile("organise"), "organize"},
-	{regexp.MustCompile("practise"), "practice"},
-	{regexp.MustCompile("programme"), "program"},
-	{regexp.MustCompile("realise"), "realize"},
-	{regexp.MustCompile("recognise"), "recognize"},
-	{regexp.MustCompile("signalling"), "signaling"},
-	{regexp.MustCompile("sub[ -]license"), "sublicense"},
-	{regexp.MustCompile("utilisation"), "utilization"},
-	{regexp.MustCompile("whilst"), "while"},
-	{regexp.MustCompile("wilful"), "wilfull"},
-	{regexp.MustCompile("non[ -]commercial"), "noncommercial"},
-	{regexp.MustCompile("per cent"), "percent"},
+func normalizeToken(in string) string {
+	// This performs some preprocessing on the token. It is different from
+	// cleanupToken in that the fixups here are not an exact match on the
+	// token. Normalizing URLs from https to http is an example of a fix
+	// applied here.
+	return strings.ReplaceAll(in, "https", "http")
 }
 
-// normalizeWords remaps equivalent words that are interchangeable and lowercases
-// the word to allow for exact matching.
-func normalizeWords(s string) string {
-	s = strings.ToLower(s)
-	for _, iw := range interchangeableWords {
-		s = iw.interchangeable.ReplaceAllString(s, iw.substitute)
-	}
-	return s
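Together, flushBuf (below) and normalizeToken give each buffered word two inexact fixups: HTML entities are unescaped wherever they occur in the word, and https URLs are collapsed to http so both spellings yield the same token. A small sketch of that pipeline, with a hypothetical normalizeWordBuf standing in for the flushBuf/normalizeToken pair:

```go
package main

import (
	"fmt"
	"html"
	"strings"
)

// normalizeWordBuf mimics the flushBuf/normalizeToken pipeline: unescape HTML
// entities anywhere in the word, then apply inexact fixups such as mapping
// https to http.
func normalizeWordBuf(word string) string {
	word = html.UnescapeString(word)
	return strings.ReplaceAll(word, "https", "http")
}

func main() {
	fmt.Println(normalizeWordBuf("https://example.com")) // http://example.com
	fmt.Println(normalizeWordBuf("ben&amp;jerry"))        // ben&jerry
	// &nbsp; decodes to U+00A0 (a non-breaking space), not an ASCII space:
	fmt.Printf("%q\n", normalizeWordBuf("copyright&nbsp;2021"))
}
```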
+func flushBuf(pos int, obuf []byte, normalizeWord bool, ld *dictionary) tokenID {
+	// Clean up the contents of the rune buffer.
+	token := string(obuf)
+	// Escape sequences can occur anywhere in the string, not just the
+	// beginning, so always attempt to unescape the word's content.
+	token = html.UnescapeString(token)
+
+	clean := normalizeToken(token)
+
+	return ld.add(clean)
 }
 
-func header(tok *token) bool {
-	in := tok.Text
-	p, e := in[:len(in)-1], in[len(in)-1]
-	switch e {
-	case '.', ':', ')':
-		if listMarker[p] {
-			if e != ')' {
-				return true
+func cleanupToken(pos int, in string, normalizeWord bool) string {
+	r, _ := utf8.DecodeRuneInString(in)
+	var out strings.Builder
+	if pos == 0 && header(in) {
+		return ""
+	}
+
+	if !unicode.IsLetter(r) {
+		if unicode.IsDigit(r) {
+			// Based on analysis of the license corpus, the characters that are
+			// significant are numbers, periods, and dashes. Anything else can be
+			// safely discarded, and helps avoid matching failures due to
+			// inconsistent whitespacing and formatting.
+			for _, c := range in {
+				if unicode.IsDigit(c) || c == '.' || c == '-' {
+					out.WriteRune(c)
+				}
 			}
-			// Sometimes an internal reference like "(ii)" from NPL-1.02.txt
-			// endds up at the beginning of a line. In that case, it's
-			// not actually a header.
-			if e == ')' && !strings.HasSuffix(tok.Previous, "(") {
-				return true
+
+			// Numbers should not end in a . since that doesn't indicate a version
+			// number, but usually an end of a line.
+			res := out.String()
+			for strings.HasSuffix(res, ".") {
+				res = res[0 : len(res)-1]
 			}
+			return res
 		}
-		// Check for patterns like 1.2.3
-		for _, r := range p {
-			if unicode.IsDigit(r) || r == '.' {
-				continue
-			}
-			return false
+	}
+
+	// Remove internal hyphenization or URL constructs to better normalize
+	// strings for matching.
+
+	for _, c := range in {
+		if unicode.IsLetter(c) {
+			out.WriteRune(c)
 		}
-		return true
 	}
-	return false
-}
 
-var listMarker = func() map[string]bool {
-	const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
-	l := map[string]bool{}
-	for _, marker := range strings.Split(allListMarkers, " ") {
-		l[marker] = true
+	tok := out.String()
+	if !normalizeWord {
+		return tok
 	}
-	return l
-}()
 
-// ignorableTexts is a list of lines at the start of the string we can remove
-// to get a cleaner match.
-var ignorableTexts = []*regexp.Regexp{
-	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
-	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
-	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
+	if iw, ok := interchangeableWords[tok]; ok && normalizeWord {
+		return iw
+	}
+	return tok
 }
 
-// removeIgnorableTexts removes common text, which is not important for
-// classification
-func removeIgnorableTexts(s string) (string, Matches) {
-	var out []string
-	var matches Matches
-	lines := strings.Split(s, "\n")
-	for i, l := range lines {
-		line := strings.TrimSpace(l)
-		var match bool
-		for _, re := range ignorableTexts {
-			if re.MatchString(line) {
-				match = true
-			}
-		}
-		if !match {
-			out = append(out, l)
-		} else {
-			// We want to preserve line presence for the positional information
-			out = append(out, "")
-			matches = append(matches, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: i + 1, EndLine: i + 1})
-		}
-	}
-	return strings.Join(out, "\n"), matches
+var interchangeableWords = map[string]string{
+	"analyse":         "analyze",
+	"artefact":        "artifact",
+	"authorisation":   "authorization",
+	"authorised":      "authorized",
+	"calibre":         "caliber",
+	"cancelled":       "canceled",
+	"capitalisations": "capitalizations",
+	"catalogue":       "catalog",
+	"categorise":      "categorize",
+	"centre":          "center",
+	"emphasised":      "emphasized",
+	"favour":          "favor",
+	"favourite":       "favorite",
+	"fulfil":          "fulfill",
+	"fulfilment":      "fulfillment",
+	"https":           "http",
+	"initialise":      "initialize",
+	"judgment":        "judgement",
+	"labelling":       "labeling",
+	"labour":          "labor",
+	"licence":         "license",
+	"maximise":        "maximize",
+	"modelled":        "modeled",
+	"modelling":       "modeling",
+	"offence":         "offense",
+	"optimise":        "optimize",
+	"organisation":    "organization",
+	"organise":        "organize",
+	"practise":        "practice",
+	"programme":       "program",
+	"realise":         "realize",
+	"recognise":       "recognize",
+	"signalling":      "signaling",
+	"utilisation":     "utilization",
+	"whilst":          "while",
+	"wilful":          "wilfull",
+	// TODO: These three need tokenizer magic
+	"non commercial": "noncommercial",
+	"per cent":       "percent",
+	"sub license":    "sublicense",
+}
+
+var punctuationMappings = map[rune]string{
+	'-': "-",
+	'‒': "-",
+	'–': "-",
+	'—': "-",
+	'‐': "-",
+	'©': "(c)",
+	'§': "(s)",
+	'¤': "(s)",
+	'·': " ",
+	'*': " ",
 }
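To see how cleanupToken interacts with the two tables above, here is a condensed, self-contained sketch of the per-token cleanup path. cleanToken is a hypothetical standalone variant: it omits the pos/header check and the normalizeWord flag, and the tables are trimmed to a few entries; the real code also threads tokens through the local and global dictionaries.

```go
package main

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

var interchangeableWords = map[string]string{
	"licence": "license", "organisation": "organization", "optimise": "optimize",
}

var punctuationMappings = map[rune]string{
	'‒': "-", '–': "-", '—': "-", '©': "(c)", '·': " ", '*': " ",
}

// cleanToken mirrors cleanupToken: tokens led by a digit keep only digits,
// periods, and dashes (with trailing periods stripped); other tokens keep
// letters only, then get folded through the interchangeableWords table.
func cleanToken(in string) string {
	r, _ := utf8.DecodeRuneInString(in)
	var out strings.Builder
	if !unicode.IsLetter(r) && unicode.IsDigit(r) {
		for _, c := range in {
			if unicode.IsDigit(c) || c == '.' || c == '-' {
				out.WriteRune(c)
			}
		}
		res := out.String()
		for strings.HasSuffix(res, ".") {
			res = res[:len(res)-1]
		}
		return res
	}
	for _, c := range in {
		if unicode.IsLetter(c) {
			out.WriteRune(c)
		}
	}
	tok := out.String()
	if iw, ok := interchangeableWords[tok]; ok {
		return iw
	}
	return tok
}

func main() {
	fmt.Println(cleanToken("2.0."))                // 2.0 (version kept, trailing dot dropped)
	fmt.Println(cleanToken("licence,"))            // license (punctuation stripped, spelling folded)
	fmt.Println(punctuationMappings['©'])          // (c)
}
```

The punctuationMappings table replaces the old string-rewriting normalizePunctuation pass: the mapping is now applied rune by rune while the word is being buffered, which is what the punctuation branch inside tokenizeStream does.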