-rw-r--r--   v2/assets/License/Apache-1.1/log4j.txt    48
-rw-r--r--   v2/classifier.go                           18
-rw-r--r--   v2/document.go                             46
-rw-r--r--   v2/scoring.go                               2
-rw-r--r--   v2/searchset_test.go                        2
-rw-r--r--   v2/tokenizer.go                           634
-rw-r--r--   v2/tokenizer_test.go                      102
7 files changed, 488 insertions, 364 deletions
diff --git a/v2/assets/License/Apache-1.1/log4j.txt b/v2/assets/License/Apache-1.1/log4j.txt
new file mode 100644
index 0000000..f3506ce
--- /dev/null
+++ b/v2/assets/License/Apache-1.1/log4j.txt
@@ -0,0 +1,48 @@
+/*
+ * ============================================================================
+ * The Apache Software License, Version 1.1
+ * ============================================================================
+ *
+ * Copyright (C) 1999 The Apache Software Foundation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. The end-user documentation included with the redistribution, if any, must
+ * include the following acknowledgment: "This product includes software
+ * developed by the Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself, if
+ * and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "log4j" and "Apache Software Foundation" must not be used to
+ * endorse or promote products derived from this software without prior
+ * written permission. For written permission, please contact
+ * apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache", nor may
+ * "Apache" appear in their name, without prior written permission of the
+ * Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLU-
+ * DING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This software consists of voluntary contributions made by many individuals
+ * on behalf of the Apache Software Foundation. For more information on the
+ * Apache Software Foundation, please see <http://www.apache.org/>.
+ *
+ */
diff --git a/v2/classifier.go b/v2/classifier.go
index b2f4d76..f163030 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -230,8 +230,10 @@ func NewClassifier(threshold float64) *Classifier {
// It is an invariant of the classifier that calling Match(Normalize(in)) will
// return the same results as Match(in).
func (c *Classifier) Normalize(in []byte) []byte {
- text, _ := normalizeDoc(in, false)
- doc := extractDoc(text, false, nil)
+ doc, err := tokenizeStream(bytes.NewReader(in), false, c.dict, true)
+ if err != nil {
+ panic("should not be reachable, since bytes.NewReader().Read() should never fail")
+ }
var buf bytes.Buffer
@@ -239,26 +241,28 @@ func (c *Classifier) Normalize(in []byte) []byte {
case 0:
return nil
case 1:
- buf.WriteString(doc.Tokens[0].Text)
+ buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
return buf.Bytes()
}
prevLine := 1
- buf.WriteString(doc.Tokens[0].Text)
+ buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
for _, t := range doc.Tokens[1:] {
// Only write out an EOL token that incremented the line
if t.Line == prevLine+1 {
- buf.WriteString("\n")
+ buf.WriteString(eol)
}
// Only write tokens that aren't EOL
- if t.Text != eol {
+ txt := c.dict.getWord(t.ID)
+
+ if txt != eol {
// Only put a space between tokens if the previous token was on the same
// line. This prevents spaces after an EOL
if t.Line == prevLine {
buf.WriteString(" ")
}
- buf.WriteString(t.Text)
+ buf.WriteString(txt)
}
prevLine = t.Line
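
A minimal usage sketch of the new Normalize path: tokens are now resolved through the shared classifier dictionary (c.dict.getWord) instead of token.Text, and the rebuilt byte stream is what Match operates on. The import path, threshold, and input below are illustrative assumptions, not part of this change.

package main

import (
	"fmt"

	classifier "github.com/google/licenseclassifier/v2"
)

func main() {
	// Normalize tokenizes the input against the classifier dictionary and
	// rebuilds a normalized byte stream from the token IDs, preserving the
	// invariant that Match(Normalize(in)) equals Match(in).
	c := classifier.NewClassifier(0.8)
	norm := c.Normalize([]byte("The  AWESOME   Project\nLICENSE"))
	fmt.Printf("%q\n", norm)
}
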
diff --git a/v2/document.go b/v2/document.go
index 73ccaab..6f3c1b5 100644
--- a/v2/document.go
+++ b/v2/document.go
@@ -30,25 +30,19 @@ type token struct {
Previous string // for the first token in a line, any previous text.
}
-// document is the representation of the input text for downstream filtering and matching.
-type document struct {
- Tokens []*token // ordered tokens of the document
- Matches Matches // these are matches identified while processing the original, untokenized text via regexp matching
-}
-
type indexedToken struct {
Line int // line position of this token in the source
ID tokenID // identifier of the text in the dictionary
}
type indexedDocument struct {
+ Norm string // The normalized token sequence
Tokens []indexedToken // ordered tokens of the document
Matches Matches // these are matches identified while processing the original, untokenized text via regexp matching
f *frequencyTable // frequencies computed for this document
dict *dictionary // The corpus dictionary for this document
s *searchSet // The searchset for this document
runes []rune
- norm string // The normalized token sequence
}
func (d *indexedDocument) generateSearchSet(q int) {
@@ -101,58 +95,26 @@ func max(a, b int) int {
// AddContent incorporates the provided textual content into the classifier for
// matching. This will not modify the supplied content.
func (c *Classifier) AddContent(category, name, variant string, content []byte) {
- doc := tokenize(content)
+ doc := tokenize(content, c.dict, true)
c.addDocument(category, name, variant, doc)
}
// addDocument takes a textual document and incorporates it into the classifier for matching.
-func (c *Classifier) addDocument(category, name, variant string, doc *document) {
+func (c *Classifier) addDocument(category, name, variant string, id *indexedDocument) {
// For documents that are part of the corpus, we add them to the dictionary and
// compute their associated search data eagerly so they are ready for matching against
// candidates.
indexName := c.generateDocName(category, name, variant)
- id := c.generateIndexedDocument(doc, true)
- id.generateFrequencies()
id.generateSearchSet(c.q)
id.s.origin = indexName
c.docs[indexName] = id
}
-// generateIndexedDocument creates an indexedDocument from the supplied document. if addWords
-// is true, the classifier dictionary is updated with new tokens encountered in the document.
-func (c *Classifier) generateIndexedDocument(d *document, addWords bool) *indexedDocument {
- id := &indexedDocument{
- Tokens: make([]indexedToken, 0, len(d.Tokens)),
- dict: c.dict,
- Matches: d.Matches,
- }
-
- for _, t := range d.Tokens {
- var tokID tokenID
- if addWords {
- tokID = id.dict.add(t.Text)
- } else {
- tokID = id.dict.getIndex(t.Text)
- }
-
- id.Tokens = append(id.Tokens, indexedToken{
- Line: t.Line,
- ID: tokID,
- })
-
- }
- id.generateFrequencies()
- id.runes = diffWordsToRunes(id, 0, id.size())
- id.norm = id.normalized()
- return id
-}
-
// createTargetIndexedDocument creates an indexed document without adding the
// words to the classifier dictionary. This should be used for matching targets, not
// populating the corpus.
func (c *Classifier) createTargetIndexedDocument(in []byte) *indexedDocument {
- doc := tokenize(in)
- return c.generateIndexedDocument(doc, false)
+ return tokenize(in, c.dict, false)
}
func (c *Classifier) generateDocName(category, name, variant string) string {
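
The corpus/target split above hinges on the updateDict flag: AddContent tokenizes with updateDict == true so corpus tokens are added to the shared dictionary, while createTargetIndexedDocument passes false so target tokens never grow the dictionary and unseen words resolve to the unknown index. A hedged sketch, written as if inside package classifier (names, threshold, and inputs are illustrative):

func exampleCorpusVersusTarget(corpusText, targetText []byte) {
	c := NewClassifier(0.8)

	// Corpus documents update the shared dictionary, so every token they
	// contain receives a stable ID and a search set for later matching.
	c.AddContent("License", "Apache-1.1", "log4j", corpusText)

	// Target documents are tokenized against the existing dictionary only;
	// tokens never seen in the corpus map to the unknown index.
	target := c.createTargetIndexedDocument(targetText)
	_ = target
}
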
diff --git a/v2/scoring.go b/v2/scoring.go
index 34dffb5..616ea78 100644
--- a/v2/scoring.go
+++ b/v2/scoring.go
@@ -41,7 +41,7 @@ func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownSt
knownLength := known.size()
diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength)
- start, end := diffRange(known.norm, diffs)
+ start, end := diffRange(known.Norm, diffs)
distance := scoreDiffs(id, diffs[start:end])
if c.tc.traceScoring(known.s.origin) {
diff --git a/v2/searchset_test.go b/v2/searchset_test.go
index ccaa3c3..accbc15 100644
--- a/v2/searchset_test.go
+++ b/v2/searchset_test.go
@@ -63,7 +63,7 @@ func TestSearchSet_New(t *testing.T) {
text: "",
q: 4,
want: &searchSet{
- Tokens: []indexedToken{},
+ Tokens: nil,
Hashes: make(hash),
Checksums: nil,
ChecksumRanges: nil,
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 875cc7e..0d3917e 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -15,366 +15,412 @@
package classifier
import (
+ "bytes"
"html"
+ "io"
"regexp"
"strings"
"unicode"
"unicode/utf8"
)
-// isSignificant looks for runes that are likely to be the part of English language content
-// of interest in licenses. Notably, it skips over punctuation, looking only for letters
-// or numbers that consistitute the tokens of most interest.
-func isSignificant(r rune) bool {
- return unicode.IsLetter(r) || unicode.IsDigit(r)
-}
-
var eol = "\n"
-func cleanupToken(in string) string {
- r, _ := utf8.DecodeRuneInString(in)
- var out strings.Builder
- if !unicode.IsLetter(r) {
- if unicode.IsDigit(r) {
- // Based on analysis of the license corpus, the characters
- // that are significant are numbers, periods, and dashes. Anything
- // else can be safely discarded, and helps avoid matching failures
- // due to inconsistent whitespacing and formatting.
- for _, c := range in {
- if unicode.IsDigit(c) || c == '.' || c == '-' {
- out.WriteRune(c)
- }
+func header(in string) bool {
+ if len(in) == 0 {
+ return false
+ }
+ p, e := in[:len(in)-1], in[len(in)-1]
+ switch e {
+ case '.', ':', ')':
+ if listMarker[p] {
+ if e != ')' {
+ return true
}
-
- // Numbers should not end in a . since that doesn't indicate a version
- // number, but usually an end of a line.
- res := out.String()
- for strings.HasSuffix(res, ".") {
- res = res[0 : len(res)-1]
+ }
+ // Check for patterns like 1.2.3
+ for _, r := range p {
+ if unicode.IsDigit(r) || r == '.' {
+ continue
}
- return res
+ return false
}
+ return true
}
+ return false
+}
- // Remove internal hyphenization or URL constructs to better normalize
- // strings for matching.
- for _, c := range in {
- if unicode.IsLetter(c) {
- out.WriteRune(c)
- }
+var listMarker = func() map[string]bool {
+ const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
+ l := map[string]bool{}
+ for _, marker := range strings.Split(allListMarkers, " ") {
+ l[marker] = true
}
- return out.String()
+ return l
+}()
+
+// ignorableTexts is a list of lines at the start of the string we can remove
+// to get a cleaner match.
+var ignorableTexts = []*regexp.Regexp{
+ regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
+ regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
+ regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}
-func normalizeDoc(in []byte, normWords bool) (string, Matches) {
- // Apply the global transforms described in SPDX
+func tokenize(in []byte, dict *dictionary, updateDict bool) *indexedDocument {
+	// Since bytes.NewReader().Read() never returns an error, tokenizeStream
+	// will not return an error either, so it is okay to ignore the return
+	// value here.
+ id, _ := tokenizeStream(bytes.NewReader(in), true, dict, updateDict)
+ return id
+}
- norm := string(in)
- norm = html.UnescapeString(norm)
- norm = normalizePunctuation(norm)
- norm, matches := removeIgnorableTexts(norm)
+// tokenizeStream reads bytes from src and produces an indexedDocument of its
+// content. tokenizeStream never returns an error of its own; it can only
+// return an error from the provided Reader. If the provided Reader never
+// returns an error, it is safe to assume that tokenizeStream will not return an
+// error.
+func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
+ const bufSize = 1024
+	// The longest UTF-8 encoded rune is 4 bytes, so we reserve that much slack
+	// at the end of the buffer to ensure we never run out of bytes while
+	// decoding a rune. Any unconsumed trailing bytes are copied to the start
+	// of the buffer before additional bytes are read.
+ tgt := bufSize - 4
- if normWords {
- norm = normalizeWords(norm)
- }
- return norm, matches
-}
+ rbuf := make([]byte, bufSize)
+ obuf := make([]byte, 0)
+ linebuf := make([]tokenID, 0)
+ idx := 0
+	line := 1 // 1-based line count
+ deferredEOL := false
+ deferredWord := false
+	// The tokenizer uses a local dictionary while analyzing the input doc to
+	// conserve memory and to avoid polluting the global dictionary.
+ ld := newDictionary()
-func tokenize(in []byte) *document {
- // tokenize produces a document from the input content.
- text, matches := normalizeDoc(in, true)
- return extractDoc(text, true, matches)
-}
+ var doc indexedDocument
-func extractDoc(text string, removeEol bool, matches Matches) *document {
- var doc document
- doc.Matches = matches
- // Iterate on a line-by-line basis.
- i := 0
- pos := 0
- for {
- // Scan the text for the first likely textual content. The scan ignores punctuation
- // artifacts that include visual boxes for layout as well as comment characters in
- // source files.
- firstInLine := true
- var wid int
- var r rune
-
- if pos == len(text) {
- break
- }
+ isEOF := func(in error) bool {
+ return in == io.EOF || in == io.ErrUnexpectedEOF
+ }
- next := func() {
- r, wid = utf8.DecodeRuneInString(text[pos:])
- pos += wid
+ // Read out the stream in chunks
+ for {
+		// Fill up the buffer with bytes to extract runes from. The read is
+		// offset by idx to preserve any bytes left over from the previous read.
+ n, err := io.ReadFull(src, rbuf[idx:])
+ if isEOF(err) {
+ // There are no more bytes to read, so we must now consume all bytes in the
+ // buffer.
+ tgt = idx + n
+ } else if err != nil {
+ return nil, err
}
- for pos < len(text) {
- start := pos
- next()
+ for idx = 0; idx < tgt; {
+ r, n := utf8.DecodeRune(rbuf[idx:])
+ idx += n
if r == '\n' {
- doc.Tokens = append(doc.Tokens, &token{
- Text: eol,
- Line: i + 1})
- i++
- }
+				// Deal with the end of a line
- if !isSignificant(r) {
+				// If we are in a word (len(obuf) > 0) and the last rune is a '-',
+				// strike that rune and keep accumulating so the hyphenated word
+				// can be reassembled across the line break. Otherwise we treat
+				// the newline like a space and flush the word.
+
+ if len(obuf) > 0 {
+ if obuf[len(obuf)-1] == '-' {
+ obuf = obuf[0 : len(obuf)-1]
+ deferredEOL = true
+ continue
+ }
+
+ // Append the word fragment to the line buffer
+ linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+ }
+
+ // If there is something in the line to process, do so now
+ if len(linebuf) > 0 {
+ appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+ linebuf = nil
+ obuf = nil
+ }
+ if !normalize {
+ tokID := dict.getIndex(eol)
+ if tokID == unknownIndex {
+ tokID = dict.add(eol)
+ }
+ doc.Tokens = append(doc.Tokens, indexedToken{
+ ID: tokID,
+ Line: line})
+ }
+ line++
continue
}
- // We're at a word/number character.
- for pos < len(text) {
- next()
- if unicode.IsSpace(r) {
- pos -= wid // Will skip this in outer loop
- break
+ if len(obuf) == 0 {
+ if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
+ // Number or word character starts an interesting word
+					// Now we slurp up all non-space runes and aggregate them
+					// into a single word.
+
+ // Buffer the initial token, normalizing to lower case if needed
+ if normalize {
+ r = unicode.ToLower(r)
+ }
+ obuf = utf8.AppendRune(obuf, r)
}
+ continue
}
- if pos > start {
- if start >= 2 && text[start-2] == '.' && text[start-1] == ' ' {
- // Insert a "soft EOL" that helps detect header-looking entries that
- // follow this text. This resolves problems with licenses that are a
- // very long line of text, motivated by
- // https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109
- //
- // Don't do this if the previous token was already an EOL
- if len(doc.Tokens) > 0 && doc.Tokens[len(doc.Tokens)-1].Text != eol {
- doc.Tokens = append(doc.Tokens, &token{
- Text: eol,
- Line: i + 1})
- }
+ // At this point, len(obuf) > 0 and we are accumulating more runes
+ // to complete a word.
+ if unicode.IsSpace(r) {
+ // If we have a deferred EOL, we need to pick up a non-space character
+ // to resume the hyphenated word, so we just consume spaces until that
+ // happens
+ if deferredEOL {
+ continue
}
- tok := token{
- Text: text[start:pos],
- Line: i + 1,
+ // This is a space between word characters, so we assemble the word as a
+ // token and flush it out.
+ idx -= n
+
+ linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+ if deferredWord {
+ appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+ linebuf = nil
+ deferredWord = false
+					// Increment the line count now: it was deferred when the
+					// hyphen was struck, so the reassembled word stays credited
+					// to the line on which it started.
+ line++
}
- if firstInLine {
- // Store the prefix material, it is useful to discern some corner cases
- tok.Previous = text[0:start]
+ obuf = make([]byte, 0)
+ continue
+ }
+
+ if deferredEOL {
+ deferredEOL = false
+ deferredWord = true
+ }
+			// Perform token mappings for punctuation to emulate the old
+			// normalizePunctuation. The mapping produces a string, so each of
+			// its runes needs to be injected into the buffer.
+ if rep, found := punctuationMappings[r]; found {
+ for _, t := range rep {
+ obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
}
- doc.Tokens = append(doc.Tokens, &tok)
- firstInLine = false
+ continue
}
+
+			// If it's not mapped punctuation, lowercase the rune and buffer it.
+ obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
+ }
+
+ // Break out if we have consumed all read bytes
+ if isEOF(err) {
+ break
}
+
+ // Copy the unconsumed bytes at the end of the buffer to the start
+ // of the buffer so the next read appends after them.
+ n = copy(rbuf, rbuf[idx:])
+ idx = n
+ }
+
+ // Process the remaining bytes in the buffer
+ if len(obuf) > 0 {
+ linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+ }
+ if len(linebuf) > 0 {
+ appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
}
- doc.Tokens = cleanupTokens(doc.Tokens, removeEol)
- return &doc
+ doc.dict = dict
+ doc.generateFrequencies()
+ doc.runes = diffWordsToRunes(&doc, 0, doc.size())
+ doc.Norm = doc.normalized()
+ return &doc, nil
}
-func cleanupTokens(in []*token, removeEol bool) []*token {
- // This routine performs sanitization of tokens. If it is a header-looking
- // token (but not a version number) starting a line, it is removed.
- // Hyphenated words are reassembled.
- partialWord := ""
- var out []*token
- tokIdx := 0
- firstInLine := true
- for i, tok := range in {
- if firstInLine && header(tok) {
+func appendToDoc(doc *indexedDocument, dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool, linebuf []tokenID) {
+ tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
+ if tokens != nil {
+ doc.Tokens = append(doc.Tokens, tokens...)
+ } else if m != nil {
+ doc.Matches = append(doc.Matches, m)
+ }
+}
+
+func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
+ if len(in) == 0 {
+ return nil, nil
+ }
+ var sb strings.Builder
+ for i, r := range in {
+ out := ld.getWord(r)
+ if out == "" {
continue
}
- if tok.Text == eol {
- firstInLine = true
- if removeEol {
- continue
- }
- // If we are reconstructing a hyphenated word, don't append the EOL
- // now, do it when the word is reconstructed.
- if partialWord == "" {
- out = append(out, &token{Text: eol, Line: tok.Line})
- tokIdx++
- }
- continue
+ sb.WriteString(out)
+ if i < len(in)-1 {
+ sb.WriteByte(' ')
}
- firstInLine = false
- t := cleanupToken(tok.Text)
- // If this is the last token in a line, and it looks like a hyphenated
- // word, store it for reassembly.
- if strings.HasSuffix(tok.Text, "-") && i+1 < len(in) && in[i+1].Text == eol {
- partialWord = t
- } else if partialWord != "" {
- // Repair hyphenated words
- tp := in[i-1]
- tp.Text = partialWord + t
- tp.Previous = ""
- out = append(out, tp)
- tokIdx++
- if !removeEol {
- // Append the EOL now that the whole word is recovered
- out = append(out, &token{Text: eol, Line: tp.Line})
- tokIdx++
- }
+ }
- partialWord = ""
- } else {
- tok.Text = t
- tok.Previous = ""
- out = append(out, tok)
- tokIdx++
+ out := sb.String()
+
+ for _, re := range ignorableTexts {
+ if re.MatchString(out) {
+ return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
}
}
- return out
-}
-// interchangeablePunctutation is punctuation that can be normalized.
-var interchangeablePunctuation = []struct {
- interchangeable string
- substitute string
-}{
- // Hyphen, Dash, En Dash, and Em Dash.
- {`-‒–—‐`, "-"},
- // Single, Double, Curly Single, and Curly Double.
- {"'\"`‘’“”", "'"},
- // Copyright.
- {"©", "(c)"},
- // Currency and Section. (Different copies of the CDDL use each marker.)
- {"§¤", "(s)"},
- // Middle Dot
- {"·", " "},
- {"*", " "},
-}
-
-// normalizePunctuation takes all hyphens and quotes and normalizes them.
-func normalizePunctuation(s string) string {
- for _, iw := range interchangeablePunctuation {
- for _, in := range strings.Split(iw.interchangeable, "") {
- s = strings.ReplaceAll(s, in, iw.substitute)
+ var tokens []indexedToken
+ for i, r := range in {
+ txt := cleanupToken(i, ld.getWord(r), normalize)
+ if txt != "" {
+ var tokID tokenID
+ if updateDict {
+ tokID = dict.add(txt)
+ } else {
+ tokID = dict.getIndex(txt)
+ }
+ tokens = append(tokens, indexedToken{
+ Line: line,
+ ID: tokID,
+ })
}
}
- return s
+
+ return tokens, nil
}
-// interchangeableWords are words we can substitute for a normalized form
-// without changing the meaning of the license. See
-// https://spdx.org/spdx-license-list/matching-guidelines for the list.
-var interchangeableWords = []struct {
- interchangeable *regexp.Regexp
- substitute string
-}{
- {regexp.MustCompile("acknowledgement"), "acknowledgment"},
- {regexp.MustCompile("analogue"), "analog"},
- {regexp.MustCompile("analyse"), "analyze"},
- {regexp.MustCompile("artefact"), "artifact"},
- {regexp.MustCompile("authorisation"), "authorization"},
- {regexp.MustCompile("authorised"), "authorized"},
- {regexp.MustCompile("calibre"), "caliber"},
- {regexp.MustCompile("cancelled"), "canceled"},
- {regexp.MustCompile("capitalisations"), "capitalizations"},
- {regexp.MustCompile("catalogue"), "catalog"},
- {regexp.MustCompile("categorise"), "categorize"},
- {regexp.MustCompile("centre"), "center"},
- {regexp.MustCompile("emphasised"), "emphasized"},
- {regexp.MustCompile("favour"), "favor"},
- {regexp.MustCompile("favourite"), "favorite"},
- {regexp.MustCompile("fulfil\\b"), "fulfill"},
- {regexp.MustCompile("fulfilment"), "fulfillment"},
- {regexp.MustCompile("https"), "http"},
- {regexp.MustCompile("initialise"), "initialize"},
- {regexp.MustCompile("judgment"), "judgement"},
- {regexp.MustCompile("labelling"), "labeling"},
- {regexp.MustCompile("labour"), "labor"},
- {regexp.MustCompile("licence"), "license"},
- {regexp.MustCompile("maximise"), "maximize"},
- {regexp.MustCompile("modelled"), "modeled"},
- {regexp.MustCompile("modelling"), "modeling"},
- {regexp.MustCompile("offence"), "offense"},
- {regexp.MustCompile("optimise"), "optimize"},
- {regexp.MustCompile("organisation"), "organization"},
- {regexp.MustCompile("organise"), "organize"},
- {regexp.MustCompile("practise"), "practice"},
- {regexp.MustCompile("programme"), "program"},
- {regexp.MustCompile("realise"), "realize"},
- {regexp.MustCompile("recognise"), "recognize"},
- {regexp.MustCompile("signalling"), "signaling"},
- {regexp.MustCompile("sub[ -]license"), "sublicense"},
- {regexp.MustCompile("utilisation"), "utilization"},
- {regexp.MustCompile("whilst"), "while"},
- {regexp.MustCompile("wilful"), "wilfull"},
- {regexp.MustCompile("non[ -]commercial"), "noncommercial"},
- {regexp.MustCompile("per cent"), "percent"},
+func normalizeToken(in string) string {
+	// This performs some preprocessing on the token. It differs from
+	// cleanupToken in that the fixups here are not exact matches on the
+	// whole token. Normalizing URLs from https to http is an example of a
+	// fix applied here.
+ return strings.ReplaceAll(in, "https", "http")
}
-// normalizeWords remaps equivalent words that are interchangeable and lowercases
-// the word to allow for exact matching.
-func normalizeWords(s string) string {
- s = strings.ToLower(s)
- for _, iw := range interchangeableWords {
- s = iw.interchangeable.ReplaceAllString(s, iw.substitute)
- }
- return s
+func flushBuf(pos int, obuf []byte, normalizeWord bool, ld *dictionary) tokenID {
+	// Clean up the contents of the rune buffer.
+	token := string(obuf)
+	// Escape sequences can occur anywhere in the string, not just at the
+	// beginning, so always attempt to unescape the word's content.
+ token = html.UnescapeString(token)
+
+ clean := normalizeToken(token)
+
+ return ld.add(clean)
}
-func header(tok *token) bool {
- in := tok.Text
- p, e := in[:len(in)-1], in[len(in)-1]
- switch e {
- case '.', ':', ')':
- if listMarker[p] {
- if e != ')' {
- return true
+func cleanupToken(pos int, in string, normalizeWord bool) string {
+ r, _ := utf8.DecodeRuneInString(in)
+ var out strings.Builder
+ if pos == 0 && header(in) {
+ return ""
+ }
+
+ if !unicode.IsLetter(r) {
+ if unicode.IsDigit(r) {
+ // Based on analysis of the license corpus, the characters that are
+ // significant are numbers, periods, and dashes. Anything else can be
+ // safely discarded, and helps avoid matching failures due to inconsistent
+ // whitespacing and formatting.
+ for _, c := range in {
+ if unicode.IsDigit(c) || c == '.' || c == '-' {
+ out.WriteRune(c)
+ }
}
- // Sometimes an internal reference like "(ii)" from NPL-1.02.txt
- // endds up at the beginning of a line. In that case, it's
- // not actually a header.
- if e == ')' && !strings.HasSuffix(tok.Previous, "(") {
- return true
+
+			// Numbers should not end in a '.', since a trailing '.' usually
+			// marks the end of a sentence or line rather than a version number.
+ res := out.String()
+ for strings.HasSuffix(res, ".") {
+ res = res[0 : len(res)-1]
}
+ return res
}
- // Check for patterns like 1.2.3
- for _, r := range p {
- if unicode.IsDigit(r) || r == '.' {
- continue
- }
- return false
+ }
+
+	// Remove internal hyphenation or URL constructs to better normalize
+	// strings for matching.
+
+ for _, c := range in {
+ if unicode.IsLetter(c) {
+ out.WriteRune(c)
}
- return true
}
- return false
-}
-var listMarker = func() map[string]bool {
- const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
- l := map[string]bool{}
- for _, marker := range strings.Split(allListMarkers, " ") {
- l[marker] = true
+ tok := out.String()
+ if !normalizeWord {
+ return tok
}
- return l
-}()
-// ignorableTexts is a list of lines at the start of the string we can remove
-// to get a cleaner match.
-var ignorableTexts = []*regexp.Regexp{
- regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
- regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
- regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
+ if iw, ok := interchangeableWords[tok]; ok && normalizeWord {
+ return iw
+ }
+ return tok
}
-// removeIgnorableTexts removes common text, which is not important for
-// classification
-func removeIgnorableTexts(s string) (string, Matches) {
- var out []string
- var matches Matches
- lines := strings.Split(s, "\n")
- for i, l := range lines {
- line := strings.TrimSpace(l)
- var match bool
- for _, re := range ignorableTexts {
- if re.MatchString(line) {
- match = true
- }
- }
- if !match {
- out = append(out, l)
- } else {
- // We want to preserve line presence for the positional information
- out = append(out, "")
- matches = append(matches, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: i + 1, EndLine: i + 1})
- }
- }
- return strings.Join(out, "\n"), matches
+var interchangeableWords = map[string]string{
+ "analyse": "analyze",
+ "artefact": "artifact",
+ "authorisation": "authorization",
+ "authorised": "authorized",
+ "calibre": "caliber",
+ "cancelled": "canceled",
+ "capitalisations": "capitalizations",
+ "catalogue": "catalog",
+ "categorise": "categorize",
+ "centre": "center",
+ "emphasised": "emphasized",
+ "favour": "favor",
+ "favourite": "favorite",
+ "fulfil": "fulfill",
+ "fulfilment": "fulfillment",
+ "https": "http",
+ "initialise": "initialize",
+ "judgment": "judgement",
+ "labelling": "labeling",
+ "labour": "labor",
+ "licence": "license",
+ "maximise": "maximize",
+ "modelled": "modeled",
+ "modelling": "modeling",
+ "offence": "offense",
+ "optimise": "optimize",
+ "organisation": "organization",
+ "organise": "organize",
+ "practise": "practice",
+ "programme": "program",
+ "realise": "realize",
+ "recognise": "recognize",
+ "signalling": "signaling",
+ "utilisation": "utilization",
+ "whilst": "while",
+ "wilful": "wilfull",
+ // TODO: These three need tokenizer magic
+ "non commercial": "noncommercial",
+ "per cent": "percent",
+ "sub license": "sublicense",
+}
+
+var punctuationMappings = map[rune]string{
+ '-': "-",
+ '‒': "-",
+ '–': "-",
+ '—': "-",
+ '‐': "-",
+ '©': "(c)",
+ '§': "(s)",
+ '¤': "(s)",
+ '·': " ",
+ '*': " ",
}
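
For reference, a standalone sketch of the chunked-read scheme tokenizeStream uses: each read fills a 1024-byte buffer, runes only begin decoding within the first bufSize-4 bytes, and the undecoded tail is copied to the front of the buffer before the next read, so a UTF-8 rune split across reads is never mangled. The helper below is illustrative, not part of the package.

package main

import (
	"fmt"
	"io"
	"strings"
	"unicode/utf8"
)

func decodeChunked(src io.Reader) ([]rune, error) {
	const bufSize = 1024
	tgt := bufSize - 4 // the longest UTF-8 encoded rune is 4 bytes
	buf := make([]byte, bufSize)
	var out []rune
	idx := 0
	for {
		n, err := io.ReadFull(src, buf[idx:])
		eof := err == io.EOF || err == io.ErrUnexpectedEOF
		if eof {
			// No more input: consume every byte remaining in the buffer.
			tgt = idx + n
		} else if err != nil {
			return nil, err
		}
		for idx = 0; idx < tgt; {
			r, w := utf8.DecodeRune(buf[idx:])
			out = append(out, r)
			idx += w
		}
		if eof {
			return out, nil
		}
		// Carry the undecoded tail (at most 4 bytes) to the next read.
		idx = copy(buf, buf[idx:])
	}
}

func main() {
	runes, err := decodeChunked(strings.NewReader(strings.Repeat("é", 2000)))
	if err != nil {
		panic(err)
	}
	fmt.Println(len(runes)) // 2000
}
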
diff --git a/v2/tokenizer_test.go b/v2/tokenizer_test.go
index 662685c..6ddab4c 100644
--- a/v2/tokenizer_test.go
+++ b/v2/tokenizer_test.go
@@ -15,6 +15,7 @@
package classifier
import (
+ "io"
"strings"
"testing"
@@ -56,7 +57,7 @@ func TestCleanupToken(t *testing.T) {
},
}
for _, test := range tests {
- if got := cleanupToken(test.input); got != test.output {
+ if got := cleanupToken(0, test.input, true); got != test.output {
t.Errorf("%q: got %q want %q", test.input, got, test.output)
}
}
@@ -66,8 +67,21 @@ func TestTokenize(t *testing.T) {
tests := []struct {
name string
input string
- output *document
+ output *indexedDocument
}{
+ {name: "hyphenization recovery",
+ input: `basket-
+ball`,
+ output: &indexedDocument{
+ Tokens: []indexedToken{
+ {
+ ID: 1,
+ Line: 1,
+ },
+ },
+ Norm: "basketball",
+ },
+ },
{
name: "basic scenario",
input: `The AWESOME Project LICENSE
@@ -80,63 +94,112 @@ Copyright 1996-2002, 2006 by A. Developer
Introduction
The AWESOME Project`,
- output: &document{
- Tokens: []*token{
+ output: &indexedDocument{
+ Tokens: []indexedToken{
{
- Text: "the",
+ ID: 1,
Line: 1,
},
{
- Text: "awesome",
+ ID: 2,
Line: 1,
},
{
- Text: "project",
+ ID: 3,
Line: 1,
},
{
- Text: "license",
+ ID: 4,
Line: 1,
},
{
- Text: "modifications",
+ ID: 5,
Line: 3,
},
{
- Text: "prohibited",
+ ID: 6,
Line: 4,
},
{
- Text: "introduction",
+ ID: 7,
Line: 8,
},
{
- Text: "the",
+ ID: 1,
Line: 10,
},
{
- Text: "awesome",
+ ID: 2,
Line: 10,
},
{
- Text: "project",
+ ID: 3,
Line: 10,
},
},
Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}},
+ Norm: "the awesome project license modifications prohibited introduction the awesome project",
},
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- d := tokenize([]byte(test.input))
- if !cmp.Equal(d, test.output, cmpopts.IgnoreUnexported(document{})) {
- t.Errorf("%s failed: %s", test.name, cmp.Diff(d, test.output))
+ d := tokenize([]byte(test.input), newDictionary(), true)
+ if diff := cmp.Diff(d, test.output, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" {
+ t.Errorf("%s failed:\nDiff(+got,-want): %s", test.name, diff)
}
})
}
}
+type mockReader struct {
+ t *testing.T
+ schedule []int
+ cur int
+}
+
+func (m *mockReader) Read(buf []byte) (int, error) {
+ if m.cur > len(m.schedule) {
+ m.t.Fatal("Unexpected read on mock")
+ }
+
+ if m.cur == len(m.schedule) {
+ return 0, io.EOF
+ }
+
+ if len(buf) != m.schedule[m.cur] {
+ m.t.Fatalf("step %d: got %d, want %d", m.cur, len(buf), m.schedule[m.cur])
+ }
+ m.cur++
+
+ for i := range buf {
+ buf[i] = 'a'
+ }
+
+ return len(buf), nil
+}
+
+func TestTokenizerBuffering(t *testing.T) {
+ dict := newDictionary()
+ mr := mockReader{
+ t: t,
+ schedule: []int{1024, 1020, 1020},
+ }
+ d, err := tokenizeStream(&mr, true, dict, true)
+ if err != nil {
+ t.Errorf("Read returned unexpected error: %v", err)
+ }
+
+ // Do a basic test to make sure the data returned is sound
+ if len(d.Tokens) != 1 {
+ t.Errorf("Got %d tokens, expected 1", len(d.Tokens))
+ }
+
+ if len(d.Norm) != 3064 {
+ t.Errorf("Got %d bytes, expected 3064", len(d.Norm))
+ }
+}
+
func TestTokenizer(t *testing.T) {
// This test focuses primarily on the textual content extracted and does not look
// at the other parts of the document.
@@ -229,10 +292,11 @@ The FreeType Project`,
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- d := tokenize([]byte(test.input))
+ dict := newDictionary()
+ d := tokenize([]byte(test.input), dict, true)
var b strings.Builder
for _, tok := range d.Tokens {
- b.WriteString(tok.Text)
+ b.WriteString(dict.getWord(tok.ID))
b.WriteString(" ")
}
actual := strings.TrimSpace(b.String())