 v2/assets/License/Apache-1.1/log4j.txt |  48 ++++
 v2/classifier.go                       |  18 +-
 v2/document.go                         |  46 +---
 v2/scoring.go                          |   2 +-
 v2/searchset_test.go                   |   2 +-
 v2/tokenizer.go                        | 634 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 v2/tokenizer_test.go                   | 102 +++++++++---
 7 files changed, 488 insertions(+), 364 deletions(-)
diff --git a/v2/assets/License/Apache-1.1/log4j.txt b/v2/assets/License/Apache-1.1/log4j.txt
new file mode 100644
index 0000000..f3506ce
--- /dev/null
+++ b/v2/assets/License/Apache-1.1/log4j.txt
@@ -0,0 +1,48 @@
+/*
+ * ============================================================================
+ * The Apache Software License, Version 1.1
+ * ============================================================================
+ *
+ * Copyright (C) 1999 The Apache Software Foundation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. The end-user documentation included with the redistribution, if any, must
+ *    include the following acknowledgment: "This product includes software
+ *    developed by the Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself, if
+ *    and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "log4j" and "Apache Software Foundation" must not be used to
+ *    endorse or promote products derived from this software without prior
+ *    written permission. For written permission, please contact
+ *    apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache", nor may
+ *    "Apache" appear in their name, without prior written permission of the
+ *    Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLU-
+ * DING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This software consists of voluntary contributions made by many individuals
+ * on behalf of the Apache Software Foundation. For more information on the
+ * Apache Software Foundation, please see <http://www.apache.org/>.
+ *
+ */
diff --git a/v2/classifier.go b/v2/classifier.go
index b2f4d76..f163030 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -230,8 +230,10 @@ func NewClassifier(threshold float64) *Classifier {
 // It is an invariant of the classifier that calling Match(Normalize(in)) will
 // return the same results as Match(in).
 func (c *Classifier) Normalize(in []byte) []byte {
-	text, _ := normalizeDoc(in, false)
-	doc := extractDoc(text, false, nil)
+	doc, err := tokenizeStream(bytes.NewReader(in), false, c.dict, true)
+	if err != nil {
+		panic("should not be reachable, since bytes.NewReader().Read() should never fail")
+	}
 
 	var buf bytes.Buffer
 
@@ -239,26 +241,28 @@ func (c *Classifier) Normalize(in []byte) []byte {
 	case 0:
 		return nil
 	case 1:
-		buf.WriteString(doc.Tokens[0].Text)
+		buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
 		return buf.Bytes()
 	}
 
 	prevLine := 1
-	buf.WriteString(doc.Tokens[0].Text)
+	buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
 	for _, t := range doc.Tokens[1:] {
 		// Only write out an EOL token that incremented the line
 		if t.Line == prevLine+1 {
-			buf.WriteString("\n")
+			buf.WriteString(eol)
 		}
 
 		// Only write tokens that aren't EOL
-		if t.Text != eol {
+		txt := c.dict.getWord(t.ID)
+
+		if txt != eol {
 			// Only put a space between tokens if the previous token was on the same
 			// line. This prevents spaces after an EOL
 			if t.Line == prevLine {
 				buf.WriteString(" ")
 			}
-			buf.WriteString(t.Text)
+			buf.WriteString(txt)
 		}
 
 		prevLine = t.Line
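Note: with Normalize now rebuilding text from dictionary IDs, the documented invariant is easy to exercise from the public API. A minimal sketch, assuming the v2 module import path and a hypothetical threshold value (neither is part of this change):

```go
package main

import (
	"bytes"
	"fmt"

	classifier "github.com/google/licenseclassifier/v2" // assumed import path
)

func main() {
	c := classifier.NewClassifier(0.8) // hypothetical threshold

	in := []byte("The AWESOME Project LICENSE\n\nModifications\nprohibited\n")

	// Normalize tokenizes through the classifier dictionary and rebuilds the
	// text from token IDs, as in the hunk above.
	once := c.Normalize(in)

	// Because Match(Normalize(in)) must equal Match(in), normalizing an
	// already-normalized document should be a fixed point.
	twice := c.Normalize(once)
	fmt.Println(bytes.Equal(once, twice)) // expected: true
}
```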
diff --git a/v2/document.go b/v2/document.go
index 73ccaab..6f3c1b5 100644
--- a/v2/document.go
+++ b/v2/document.go
@@ -30,25 +30,19 @@ type token struct {
 	Previous string // for the first token in a line, any previous text.
 }
 
-// document is the representation of the input text for downstream filtering and matching.
-type document struct {
-	Tokens  []*token // ordered tokens of the document
-	Matches Matches  // these are matches identified while processing the original, untokenized text via regexp matching
-}
-
 type indexedToken struct {
 	Line int     // line position of this token in the source
 	ID   tokenID // identifier of the text in the dictionary
 }
 
 type indexedDocument struct {
+	Norm    string          // The normalized token sequence
 	Tokens  []indexedToken  // ordered tokens of the document
 	Matches Matches         // these are matches identified while processing the original, untokenized text via regexp matching
 	f       *frequencyTable // frequencies computed for this document
 	dict    *dictionary     // The corpus dictionary for this document
 	s       *searchSet      // The searchset for this document
 	runes   []rune
-	norm    string // The normalized token sequence
 }
 
 func (d *indexedDocument) generateSearchSet(q int) {
@@ -101,58 +95,26 @@ func max(a, b int) int {
 // AddContent incorporates the provided textual content into the classifier for
 // matching. This will not modify the supplied content.
 func (c *Classifier) AddContent(category, name, variant string, content []byte) {
-	doc := tokenize(content)
+	doc := tokenize(content, c.dict, true)
 	c.addDocument(category, name, variant, doc)
 }
 
 // addDocument takes a textual document and incorporates it into the classifier for matching.
-func (c *Classifier) addDocument(category, name, variant string, doc *document) {
+func (c *Classifier) addDocument(category, name, variant string, id *indexedDocument) {
 	// For documents that are part of the corpus, we add them to the dictionary and
 	// compute their associated search data eagerly so they are ready for matching against
 	// candidates.
 	indexName := c.generateDocName(category, name, variant)
-	id := c.generateIndexedDocument(doc, true)
-	id.generateFrequencies()
 	id.generateSearchSet(c.q)
 	id.s.origin = indexName
 	c.docs[indexName] = id
 }
 
-// generateIndexedDocument creates an indexedDocument from the supplied document. if addWords
-// is true, the classifier dictionary is updated with new tokens encountered in the document.
-func (c *Classifier) generateIndexedDocument(d *document, addWords bool) *indexedDocument {
-	id := &indexedDocument{
-		Tokens:  make([]indexedToken, 0, len(d.Tokens)),
-		dict:    c.dict,
-		Matches: d.Matches,
-	}
-
-	for _, t := range d.Tokens {
-		var tokID tokenID
-		if addWords {
-			tokID = id.dict.add(t.Text)
-		} else {
-			tokID = id.dict.getIndex(t.Text)
-		}
-
-		id.Tokens = append(id.Tokens, indexedToken{
-			Line: t.Line,
-			ID:   tokID,
-		})
-
-	}
-	id.generateFrequencies()
-	id.runes = diffWordsToRunes(id, 0, id.size())
-	id.norm = id.normalized()
-	return id
-}
-
 // createTargetIndexedDocument creates an indexed document without adding the
 // words to the classifier dictionary. This should be used for matching targets, not
 // populating the corpus.
 func (c *Classifier) createTargetIndexedDocument(in []byte) *indexedDocument {
-	doc := tokenize(in)
-	return c.generateIndexedDocument(doc, false)
+	return tokenize(in, c.dict, false)
 }
 
 func (c *Classifier) generateDocName(category, name, variant string) string {
diff --git a/v2/scoring.go b/v2/scoring.go
index 34dffb5..616ea78 100644
--- a/v2/scoring.go
+++ b/v2/scoring.go
@@ -41,7 +41,7 @@ func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownSt
 	knownLength := known.size()
 	diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength)
 
-	start, end := diffRange(known.norm, diffs)
+	start, end := diffRange(known.Norm, diffs)
 	distance := scoreDiffs(id, diffs[start:end])
 
 	if c.tc.traceScoring(known.s.origin) {
diff --git a/v2/searchset_test.go b/v2/searchset_test.go
index ccaa3c3..accbc15 100644
--- a/v2/searchset_test.go
+++ b/v2/searchset_test.go
@@ -63,7 +63,7 @@ func TestSearchSet_New(t *testing.T) {
 			text: "",
 			q:    4,
 			want: &searchSet{
-				Tokens:         []indexedToken{},
+				Tokens:         nil,
 				Hashes:         make(hash),
 				Checksums:      nil,
 				ChecksumRanges: nil,
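Note: the addDocument/createTargetIndexedDocument split above is what keeps the shared dictionary one-directional: corpus ingestion tokenizes with updateDict=true and adds new words, while target indexing passes updateDict=false so unseen words resolve to the unknown token instead of growing the dictionary. A usage sketch against the public API (import path, threshold, and the category/name/variant strings are illustrative assumptions):

```go
package main

import (
	"fmt"

	classifier "github.com/google/licenseclassifier/v2" // assumed import path
)

func main() {
	c := classifier.NewClassifier(0.8) // hypothetical threshold

	// Corpus ingestion: AddContent tokenizes with updateDict=true, so every
	// token in this text enters the classifier's shared dictionary.
	c.AddContent("License", "Apache-1.1", "log4j",
		[]byte("Redistribution and use in source and binary forms"))

	// Targets are indexed via createTargetIndexedDocument (updateDict=false),
	// so matching never mutates the dictionary. Normalize, by contrast, does
	// update the dictionary, as its hunk above shows.
	fmt.Printf("%s\n", c.Normalize([]byte("Redistribution and use")))
}
```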
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 875cc7e..0d3917e 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -15,366 +15,412 @@ package classifier
 
 import (
+	"bytes"
 	"html"
+	"io"
 	"regexp"
 	"strings"
 	"unicode"
 	"unicode/utf8"
 )
 
-// isSignificant looks for runes that are likely to be the part of English language content
-// of interest in licenses. Notably, it skips over punctuation, looking only for letters
-// or numbers that consistitute the tokens of most interest.
-func isSignificant(r rune) bool {
-	return unicode.IsLetter(r) || unicode.IsDigit(r)
-}
-
 var eol = "\n"
 
-func cleanupToken(in string) string {
-	r, _ := utf8.DecodeRuneInString(in)
-	var out strings.Builder
-	if !unicode.IsLetter(r) {
-		if unicode.IsDigit(r) {
-			// Based on analysis of the license corpus, the characters
-			// that are significant are numbers, periods, and dashes. Anything
-			// else can be safely discarded, and helps avoid matching failures
-			// due to inconsistent whitespacing and formatting.
-			for _, c := range in {
-				if unicode.IsDigit(c) || c == '.' || c == '-' {
-					out.WriteRune(c)
-				}
+func header(in string) bool {
+	if len(in) == 0 {
+		return false
+	}
+	p, e := in[:len(in)-1], in[len(in)-1]
+	switch e {
+	case '.', ':', ')':
+		if listMarker[p] {
+			if e != ')' {
+				return true
 			}
-
-			// Numbers should not end in a . since that doesn't indicate a version
-			// number, but usually an end of a line.
-			res := out.String()
-			for strings.HasSuffix(res, ".") {
-				res = res[0 : len(res)-1]
+		}
+		// Check for patterns like 1.2.3
+		for _, r := range p {
+			if unicode.IsDigit(r) || r == '.' {
+				continue
 			}
-			return res
+			return false
 		}
+		return true
 	}
+	return false
+}
 
-	// Remove internal hyphenization or URL constructs to better normalize
-	// strings for matching.
-	for _, c := range in {
-		if unicode.IsLetter(c) {
-			out.WriteRune(c)
-		}
+var listMarker = func() map[string]bool {
+	const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
+	l := map[string]bool{}
+	for _, marker := range strings.Split(allListMarkers, " ") {
+		l[marker] = true
 	}
-	return out.String()
+	return l
+}()
+
+// ignorableTexts is a list of lines at the start of the string we can remove
+// to get a cleaner match.
+var ignorableTexts = []*regexp.Regexp{
+	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
+	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
+	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
 }
 
-func normalizeDoc(in []byte, normWords bool) (string, Matches) {
-	// Apply the global transforms described in SPDX
+func tokenize(in []byte, dict *dictionary, updateDict bool) *indexedDocument {
+	// Since bytes.NewReader().Read() will never return an error, tokenizeStream
+	// will never return an error so it's okay to ignore the return value in this
+	// case.
+	id, _ := tokenizeStream(bytes.NewReader(in), true, dict, updateDict)
+	return id
+}
 
-	norm := string(in)
-	norm = html.UnescapeString(norm)
-	norm = normalizePunctuation(norm)
-	norm, matches := removeIgnorableTexts(norm)
+// tokenizeStream reads bytes from src and produces an indexedDocument of its
+// content. tokenizeStream will never return an error of its own, it can only
+// return an error from the provided Reader. If the provided Reader never
+// returns an error, it is safe to assume that tokenizeStream will not return an
+// error.
+func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
+	const bufSize = 1024
+	// The longest UTF-8 encoded rune is 4 bytes, so we keep enough leftover bytes
+	// in the buffer to ensure we never run out of bytes trying to finish
+	// constructing a rune. These leftover 4 bytes will be copied to the start of
+	// the buffer before additional bytes are read.
+	tgt := bufSize - 4
 
-	if normWords {
-		norm = normalizeWords(norm)
-	}
-	return norm, matches
-}
+	rbuf := make([]byte, bufSize)
+	obuf := make([]byte, 0)
+	linebuf := make([]tokenID, 0)
+	idx := 0
+	line := 1 // 1-based count
+	deferredEOL := false
+	deferredWord := false
+	// the tokenizer uses a local dictionary to conserve memory while
+	// analyzing the input doc to avoid polluting the global dictionary
+	ld := newDictionary()
 
-func tokenize(in []byte) *document {
-	// tokenize produces a document from the input content.
-	text, matches := normalizeDoc(in, true)
-	return extractDoc(text, true, matches)
-}
+	var doc indexedDocument
 
-func extractDoc(text string, removeEol bool, matches Matches) *document {
-	var doc document
-	doc.Matches = matches
-	// Iterate on a line-by-line basis.
-	i := 0
-	pos := 0
-	for {
-		// Scan the text for the first likely textual content. The scan ignores punctuation
-		// artifacts that include visual boxes for layout as well as comment characters in
-		// source files.
-		firstInLine := true
-		var wid int
-		var r rune
-
-		if pos == len(text) {
-			break
-		}
+	isEOF := func(in error) bool {
+		return in == io.EOF || in == io.ErrUnexpectedEOF
+	}
 
-		next := func() {
-			r, wid = utf8.DecodeRuneInString(text[pos:])
-			pos += wid
+	// Read out the stream in chunks
+	for {
+		// Fill up the buffer with bytes to extract runes from
+		// idx is offset to hold any bytes left over from previous reads
+		n, err := io.ReadFull(src, rbuf[idx:])
+		if isEOF(err) {
+			// There are no more bytes to read, so we must now consume all bytes in the
+			// buffer.
+			tgt = idx + n
+		} else if err != nil {
+			return nil, err
 		}
 
-		for pos < len(text) {
-			start := pos
-			next()
+		for idx = 0; idx < tgt; {
+			r, n := utf8.DecodeRune(rbuf[idx:])
+			idx += n
 
 			if r == '\n' {
-				doc.Tokens = append(doc.Tokens, &token{
-					Text: eol,
-					Line: i + 1})
-				i++
-			}
+				// Deal with carriage return
 
-			if !isSignificant(r) {
+				// If we are in a word (len(obuf) > 0) and the last rune is a -,
+				// strike that rune and keep accumulating.
+				// Otherwise we treat it like a space and
+				// flush the word
+
+				if len(obuf) > 0 {
+					if obuf[len(obuf)-1] == '-' {
+						obuf = obuf[0 : len(obuf)-1]
+						deferredEOL = true
+						continue
+					}
+
+					// Append the word fragment to the line buffer
+					linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+				}
+
+				// If there is something in the line to process, do so now
+				if len(linebuf) > 0 {
+					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+					linebuf = nil
+					obuf = nil
+				}
+				if !normalize {
+					tokID := dict.getIndex(eol)
+					if tokID == unknownIndex {
+						tokID = dict.add(eol)
+					}
+					doc.Tokens = append(doc.Tokens, indexedToken{
+						ID:   tokID,
+						Line: line})
+				}
+				line++
 				continue
 			}
 
-			// We're at a word/number character.
-			for pos < len(text) {
-				next()
-				if unicode.IsSpace(r) {
-					pos -= wid // Will skip this in outer loop
-					break
+			if len(obuf) == 0 {
+				if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
+					// Number or word character starts an interesting word
+					// Now we slurp up all non-space runes and aggregate it as
+					// a single word
+
+					// Buffer the initial token, normalizing to lower case if needed
+					if normalize {
+						r = unicode.ToLower(r)
+					}
+					obuf = utf8.AppendRune(obuf, r)
 				}
+				continue
 			}
 
-			if pos > start {
-				if start >= 2 && text[start-2] == '.' && text[start-1] == ' ' {
-					// Insert a "soft EOL" that helps detect header-looking entries that
-					// follow this text. This resolves problems with licenses that are a
-					// very long line of text, motivated by
-					// https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109
-					//
-					// Don't do this if the previous token was already an EOL
-					if len(doc.Tokens) > 0 && doc.Tokens[len(doc.Tokens)-1].Text != eol {
-						doc.Tokens = append(doc.Tokens, &token{
-							Text: eol,
-							Line: i + 1})
-					}
+			// At this point, len(obuf) > 0 and we are accumulating more runes
+			// to complete a word.
+			if unicode.IsSpace(r) {
+				// If we have a deferred EOL, we need to pick up a non-space character
+				// to resume the hyphenated word, so we just consume spaces until that
+				// happens
+				if deferredEOL {
+					continue
 				}
 
-				tok := token{
-					Text: text[start:pos],
-					Line: i + 1,
+				// This is a space between word characters, so we assemble the word as a
+				// token and flush it out.
+				idx -= n
+
+				linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+				if deferredWord {
+					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
+					linebuf = nil
+					deferredWord = false
+					// Increment the line count now so the remainder token is credited
+					// to the previous line number.
+					line++
 				}
-				if firstInLine {
-					// Store the prefix material, it is useful to discern some corner cases
-					tok.Previous = text[0:start]
+				obuf = make([]byte, 0)
+				continue
+			}
+
+			if deferredEOL {
+				deferredEOL = false
+				deferredWord = true
+			}
+			// perform token mappings for punctuation to emulate
+			// normalizePunctuation. this returns a string and each rune needs to be
+			// injected.
+			if rep, found := punctuationMappings[r]; found {
+				for _, t := range rep {
+					obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
 				}
-				doc.Tokens = append(doc.Tokens, &tok)
-				firstInLine = false
+				continue
 			}
+
+			// if it's not punctuation, lowercase and buffer the token
+			obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
+		}
+
+		// Break out if we have consumed all read bytes
+		if isEOF(err) {
+			break
 		}
+
+		// Copy the unconsumed bytes at the end of the buffer to the start
+		// of the buffer so the next read appends after them.
+		n = copy(rbuf, rbuf[idx:])
+		idx = n
+	}
+
+	// Process the remaining bytes in the buffer
+	if len(obuf) > 0 {
+		linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
+	}
+	if len(linebuf) > 0 {
+		appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
 	}
 
-	doc.Tokens = cleanupTokens(doc.Tokens, removeEol)
-	return &doc
+	doc.dict = dict
+	doc.generateFrequencies()
+	doc.runes = diffWordsToRunes(&doc, 0, doc.size())
+	doc.Norm = doc.normalized()
+	return &doc, nil
 }
 
-func cleanupTokens(in []*token, removeEol bool) []*token {
-	// This routine performs sanitization of tokens. If it is a header-looking
-	// token (but not a version number) starting a line, it is removed.
-	// Hyphenated words are reassembled.
-	partialWord := ""
-	var out []*token
-	tokIdx := 0
-	firstInLine := true
-	for i, tok := range in {
-		if firstInLine && header(tok) {
+func appendToDoc(doc *indexedDocument, dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool, linebuf []tokenID) {
+	tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
+	if tokens != nil {
+		doc.Tokens = append(doc.Tokens, tokens...)
+	} else if m != nil {
+		doc.Matches = append(doc.Matches, m)
+	}
+}
+
+func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
+	if len(in) == 0 {
+		return nil, nil
+	}
+	var sb strings.Builder
+	for i, r := range in {
+		out := ld.getWord(r)
+		if out == "" {
 			continue
 		}
-		if tok.Text == eol {
-			firstInLine = true
-			if removeEol {
-				continue
-			}
-			// If we are reconstructing a hyphenated word, don't append the EOL
-			// now, do it when the word is reconstructed.
-			if partialWord == "" {
-				out = append(out, &token{Text: eol, Line: tok.Line})
-				tokIdx++
-			}
-			continue
+		sb.WriteString(out)
+		if i < len(in)-1 {
+			sb.WriteByte(' ')
 		}
-		firstInLine = false
-		t := cleanupToken(tok.Text)
-		// If this is the last token in a line, and it looks like a hyphenated
-		// word, store it for reassembly.
-		if strings.HasSuffix(tok.Text, "-") && i+1 < len(in) && in[i+1].Text == eol {
-			partialWord = t
-		} else if partialWord != "" {
-			// Repair hyphenated words
-			tp := in[i-1]
-			tp.Text = partialWord + t
-			tp.Previous = ""
-			out = append(out, tp)
-			tokIdx++
-			if !removeEol {
-				// Append the EOL now that the whole word is recovered
-				out = append(out, &token{Text: eol, Line: tp.Line})
-				tokIdx++
-			}
+	}
 
-			partialWord = ""
-		} else {
-			tok.Text = t
-			tok.Previous = ""
-			out = append(out, tok)
-			tokIdx++
+	out := sb.String()
+
+	for _, re := range ignorableTexts {
+		if re.MatchString(out) {
+			return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
 		}
 	}
-	return out
-}
 
-// interchangeablePunctutation is punctuation that can be normalized.
-var interchangeablePunctuation = []struct {
-	interchangeable string
-	substitute      string
-}{
-	// Hyphen, Dash, En Dash, and Em Dash.
-	{`-‒–—‐`, "-"},
-	// Single, Double, Curly Single, and Curly Double.
-	{"'\"`‘’“”", "'"},
-	// Copyright.
-	{"©", "(c)"},
-	// Currency and Section. (Different copies of the CDDL use each marker.)
-	{"§¤", "(s)"},
-	// Middle Dot
-	{"·", " "},
-	{"*", " "},
-}
-
-// normalizePunctuation takes all hyphens and quotes and normalizes them.
-func normalizePunctuation(s string) string {
-	for _, iw := range interchangeablePunctuation {
-		for _, in := range strings.Split(iw.interchangeable, "") {
-			s = strings.ReplaceAll(s, in, iw.substitute)
+	var tokens []indexedToken
+	for i, r := range in {
+		txt := cleanupToken(i, ld.getWord(r), normalize)
+		if txt != "" {
+			var tokID tokenID
+			if updateDict {
+				tokID = dict.add(txt)
+			} else {
+				tokID = dict.getIndex(txt)
+			}
+			tokens = append(tokens, indexedToken{
+				Line: line,
+				ID:   tokID,
+			})
 		}
 	}
-	return s
+
+	return tokens, nil
 }
 
-// interchangeableWords are words we can substitute for a normalized form
-// without changing the meaning of the license. See
-// https://spdx.org/spdx-license-list/matching-guidelines for the list.
-var interchangeableWords = []struct {
-	interchangeable *regexp.Regexp
-	substitute      string
-}{
-	{regexp.MustCompile("acknowledgement"), "acknowledgment"},
-	{regexp.MustCompile("analogue"), "analog"},
-	{regexp.MustCompile("analyse"), "analyze"},
-	{regexp.MustCompile("artefact"), "artifact"},
-	{regexp.MustCompile("authorisation"), "authorization"},
-	{regexp.MustCompile("authorised"), "authorized"},
-	{regexp.MustCompile("calibre"), "caliber"},
-	{regexp.MustCompile("cancelled"), "canceled"},
-	{regexp.MustCompile("capitalisations"), "capitalizations"},
-	{regexp.MustCompile("catalogue"), "catalog"},
-	{regexp.MustCompile("categorise"), "categorize"},
-	{regexp.MustCompile("centre"), "center"},
-	{regexp.MustCompile("emphasised"), "emphasized"},
-	{regexp.MustCompile("favour"), "favor"},
-	{regexp.MustCompile("favourite"), "favorite"},
-	{regexp.MustCompile("fulfil\\b"), "fulfill"},
-	{regexp.MustCompile("fulfilment"), "fulfillment"},
-	{regexp.MustCompile("https"), "http"},
-	{regexp.MustCompile("initialise"), "initialize"},
-	{regexp.MustCompile("judgment"), "judgement"},
-	{regexp.MustCompile("labelling"), "labeling"},
-	{regexp.MustCompile("labour"), "labor"},
-	{regexp.MustCompile("licence"), "license"},
-	{regexp.MustCompile("maximise"), "maximize"},
-	{regexp.MustCompile("modelled"), "modeled"},
-	{regexp.MustCompile("modelling"), "modeling"},
-	{regexp.MustCompile("offence"), "offense"},
-	{regexp.MustCompile("optimise"), "optimize"},
-	{regexp.MustCompile("organisation"), "organization"},
-	{regexp.MustCompile("organise"), "organize"},
-	{regexp.MustCompile("practise"), "practice"},
-	{regexp.MustCompile("programme"), "program"},
-	{regexp.MustCompile("realise"), "realize"},
-	{regexp.MustCompile("recognise"), "recognize"},
-	{regexp.MustCompile("signalling"), "signaling"},
-	{regexp.MustCompile("sub[ -]license"), "sublicense"},
-	{regexp.MustCompile("utilisation"), "utilization"},
-	{regexp.MustCompile("whilst"), "while"},
-	{regexp.MustCompile("wilful"), "wilfull"},
-	{regexp.MustCompile("non[ -]commercial"), "noncommercial"},
-	{regexp.MustCompile("per cent"), "percent"},
+func normalizeToken(in string) string {
+	// This performs some preprocessing on the token.
+	// This is different than cleanupToken in that fixups here
+	// are not exact match on the token.
+	// Normalizing URLs from https to http is an example of a fix applied
+	// here.
+	return strings.ReplaceAll(in, "https", "http")
 }
 
-// normalizeWords remaps equivalent words that are interchangeable and lowercases
-// the word to allow for exact matching.
-func normalizeWords(s string) string {
-	s = strings.ToLower(s)
-	for _, iw := range interchangeableWords {
-		s = iw.interchangeable.ReplaceAllString(s, iw.substitute)
-	}
-	return s
+func flushBuf(pos int, obuf []byte, normalizeWord bool, ld *dictionary) tokenID {
+	// clean up the contents of the rune buffer
+	token := string(obuf)
+	// escape sequences can occur anywhere in the string, not just the beginning
+	// so always attempt to unescape the word's content.
+	token = html.UnescapeString(token)
+
+	clean := normalizeToken(token)
+
+	return ld.add(clean)
 }
 
-func header(tok *token) bool {
-	in := tok.Text
-	p, e := in[:len(in)-1], in[len(in)-1]
-	switch e {
-	case '.', ':', ')':
-		if listMarker[p] {
-			if e != ')' {
-				return true
+func cleanupToken(pos int, in string, normalizeWord bool) string {
+	r, _ := utf8.DecodeRuneInString(in)
+	var out strings.Builder
+	if pos == 0 && header(in) {
+		return ""
+	}
+
+	if !unicode.IsLetter(r) {
+		if unicode.IsDigit(r) {
+			// Based on analysis of the license corpus, the characters that are
+			// significant are numbers, periods, and dashes. Anything else can be
+			// safely discarded, and helps avoid matching failures due to inconsistent
+			// whitespacing and formatting.
+			for _, c := range in {
+				if unicode.IsDigit(c) || c == '.' || c == '-' {
+					out.WriteRune(c)
+				}
 			}
-			// Sometimes an internal reference like "(ii)" from NPL-1.02.txt
-			// endds up at the beginning of a line. In that case, it's
-			// not actually a header.
-			if e == ')' && !strings.HasSuffix(tok.Previous, "(") {
-				return true
+
+			// Numbers should not end in a . since that doesn't indicate a version
+			// number, but usually an end of a line.
+			res := out.String()
+			for strings.HasSuffix(res, ".") {
+				res = res[0 : len(res)-1]
 			}
+			return res
 		}
-		// Check for patterns like 1.2.3
-		for _, r := range p {
-			if unicode.IsDigit(r) || r == '.' {
-				continue
-			}
-			return false
+	}
+
+	// Remove internal hyphenization or URL constructs to better normalize strings
+	// for matching.
+
+	for _, c := range in {
+		if unicode.IsLetter(c) {
+			out.WriteRune(c)
 		}
-		return true
 	}
-	return false
-}
 
-var listMarker = func() map[string]bool {
-	const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
-	l := map[string]bool{}
-	for _, marker := range strings.Split(allListMarkers, " ") {
-		l[marker] = true
+	tok := out.String()
+	if !normalizeWord {
+		return tok
 	}
-	return l
-}()
 
-// ignorableTexts is a list of lines at the start of the string we can remove
-// to get a cleaner match.
-var ignorableTexts = []*regexp.Regexp{
-	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
-	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
-	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
+	if iw, ok := interchangeableWords[tok]; ok && normalizeWord {
+		return iw
+	}
+	return tok
 }
 
-// removeIgnorableTexts removes common text, which is not important for
-// classification
-func removeIgnorableTexts(s string) (string, Matches) {
-	var out []string
-	var matches Matches
-	lines := strings.Split(s, "\n")
-	for i, l := range lines {
-		line := strings.TrimSpace(l)
-		var match bool
-		for _, re := range ignorableTexts {
-			if re.MatchString(line) {
-				match = true
-			}
-		}
-		if !match {
-			out = append(out, l)
-		} else {
-			// We want to preserve line presence for the positional information
-			out = append(out, "")
-			matches = append(matches, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: i + 1, EndLine: i + 1})
-		}
-	}
-	return strings.Join(out, "\n"), matches
+var interchangeableWords = map[string]string{
+	"analyse":         "analyze",
+	"artefact":        "artifact",
+	"authorisation":   "authorization",
+	"authorised":      "authorized",
+	"calibre":         "caliber",
+	"cancelled":       "canceled",
+	"capitalisations": "capitalizations",
+	"catalogue":       "catalog",
+	"categorise":      "categorize",
+	"centre":          "center",
+	"emphasised":      "emphasized",
+	"favour":          "favor",
+	"favourite":       "favorite",
+	"fulfil":          "fulfill",
+	"fulfilment":      "fulfillment",
+	"https":           "http",
+	"initialise":      "initialize",
+	"judgment":        "judgement",
+	"labelling":       "labeling",
+	"labour":          "labor",
+	"licence":         "license",
+	"maximise":        "maximize",
+	"modelled":        "modeled",
+	"modelling":       "modeling",
+	"offence":         "offense",
+	"optimise":        "optimize",
+	"organisation":    "organization",
+	"organise":        "organize",
+	"practise":        "practice",
+	"programme":       "program",
+	"realise":         "realize",
+	"recognise":       "recognize",
+	"signalling":      "signaling",
+	"utilisation":     "utilization",
+	"whilst":          "while",
+	"wilful":          "wilfull",
+	// TODO: These three need tokenizer magic
+	"non commercial": "noncommercial",
+	"per cent":       "percent",
+	"sub license":    "sublicense",
+}
+
+var punctuationMappings = map[rune]string{
+	'-': "-",
+	'‒': "-",
+	'–': "-",
+	'—': "-",
+	'‐': "-",
+	'©': "(c)",
+	'§': "(s)",
+	'¤': "(s)",
+	'·': " ",
+	'*': " ",
 }
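Note: the read loop in tokenizeStream is the core of this change: a fixed 1024-byte buffer, with utf8.UTFMax (4) leftover bytes slid to the front so a rune split across two reads still decodes intact. A self-contained sketch of the same pattern, shrunk to a 16-byte buffer to force splits (this is the technique only, not the classifier's code):

```go
package main

import (
	"fmt"
	"io"
	"strings"
	"unicode/utf8"
)

func main() {
	const bufSize = 16 // the tokenizer uses 1024; tiny here to force splits
	src := strings.NewReader("héllo wörld — ©1999")

	rbuf := make([]byte, bufSize)
	tgt := bufSize - utf8.UTFMax // reserve room for a rune split across reads
	idx := 0
	var out []rune

	for {
		n, err := io.ReadFull(src, rbuf[idx:])
		eof := err == io.EOF || err == io.ErrUnexpectedEOF
		if eof {
			tgt = idx + n // final pass: consume everything that is left
		} else if err != nil {
			panic(err)
		}

		// Decode up to tgt; a rune that starts just before tgt may safely
		// finish in the reserved tail, which ReadFull has already filled.
		for idx = 0; idx < tgt; {
			r, w := utf8.DecodeRune(rbuf[idx:])
			idx += w
			out = append(out, r)
		}

		if eof {
			break
		}
		// Slide the unconsumed tail (at most utf8.UTFMax bytes) to the front.
		idx = copy(rbuf, rbuf[idx:])
	}

	fmt.Println(string(out)) // héllo wörld — ©1999, reassembled across reads
}
```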
diff --git a/v2/tokenizer_test.go b/v2/tokenizer_test.go
index 662685c..6ddab4c 100644
--- a/v2/tokenizer_test.go
+++ b/v2/tokenizer_test.go
@@ -15,6 +15,7 @@ package classifier
 
 import (
+	"io"
 	"strings"
 	"testing"
 
@@ -56,7 +57,7 @@ func TestCleanupToken(t *testing.T) {
 		},
 	}
 	for _, test := range tests {
-		if got := cleanupToken(test.input); got != test.output {
+		if got := cleanupToken(0, test.input, true); got != test.output {
 			t.Errorf("%q: got %q want %q", test.input, got, test.output)
 		}
 	}
@@ -66,8 +67,21 @@ func TestTokenize(t *testing.T) {
 	tests := []struct {
 		name   string
 		input  string
-		output *document
+		output *indexedDocument
 	}{
+		{name: "hyphenization recovery",
+			input: `basket-
+ball`,
+			output: &indexedDocument{
+				Tokens: []indexedToken{
+					{
+						ID:   1,
+						Line: 1,
+					},
+				},
+				Norm: "basketball",
+			},
+		},
 		{name: "basic scenario",
 			input: `The AWESOME Project LICENSE
 
@@ -80,63 +94,112 @@ Copyright 1996-2002, 2006 by A. Developer
 
 Introduction
 
 The AWESOME Project`,
-			output: &document{
-				Tokens: []*token{
+			output: &indexedDocument{
+				Tokens: []indexedToken{
 					{
-						Text: "the",
+						ID:   1,
 						Line: 1,
 					},
 					{
-						Text: "awesome",
+						ID:   2,
 						Line: 1,
 					},
 					{
-						Text: "project",
+						ID:   3,
 						Line: 1,
 					},
 					{
-						Text: "license",
+						ID:   4,
 						Line: 1,
 					},
 					{
-						Text: "modifications",
+						ID:   5,
 						Line: 3,
 					},
 					{
-						Text: "prohibited",
+						ID:   6,
 						Line: 4,
 					},
 					{
-						Text: "introduction",
+						ID:   7,
 						Line: 8,
 					},
 					{
-						Text: "the",
+						ID:   1,
 						Line: 10,
 					},
 					{
-						Text: "awesome",
+						ID:   2,
 						Line: 10,
 					},
 					{
-						Text: "project",
+						ID:   3,
 						Line: 10,
 					},
 				},
 				Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}},
+				Norm:    "the awesome project license modifications prohibited introduction the awesome project",
 			},
 		},
 	}
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			d := tokenize([]byte(test.input))
-			if !cmp.Equal(d, test.output, cmpopts.IgnoreUnexported(document{})) {
-				t.Errorf("%s failed: %s", test.name, cmp.Diff(d, test.output))
+			d := tokenize([]byte(test.input), newDictionary(), true)
+			if diff := cmp.Diff(d, test.output, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" {
+				t.Errorf("%s failed:\nDiff(+got,-want): %s", test.name, diff)
 			}
 		})
 	}
 }
 
+type mockReader struct {
+	t        *testing.T
+	schedule []int
+	cur      int
+}
+
+func (m *mockReader) Read(buf []byte) (int, error) {
+	if m.cur > len(m.schedule) {
+		m.t.Fatal("Unexpected read on mock")
+	}
+
+	if m.cur == len(m.schedule) {
+		return 0, io.EOF
+	}
+
+	if len(buf) != m.schedule[m.cur] {
+		m.t.Fatalf("step %d: got %d, want %d", m.cur, len(buf), m.schedule[m.cur])
+	}
+	m.cur++
+
+	for i := range buf {
+		buf[i] = 'a'
+	}
+
+	return len(buf), nil
+}
+
+func TestTokenizerBuffering(t *testing.T) {
+	dict := newDictionary()
+	mr := mockReader{
+		t:        t,
+		schedule: []int{1024, 1020, 1020},
+	}
+	d, err := tokenizeStream(&mr, true, dict, true)
+	if err != nil {
+		t.Errorf("Read returned unexpected error: %v", err)
+	}
+
+	// Do a basic test to make sure the data returned is sound
+	if len(d.Tokens) != 1 {
+		t.Errorf("Got %d tokens, expected 1", len(d.Tokens))
+	}
+
+	if len(d.Norm) != 3064 {
+		t.Errorf("Got %d bytes, expected 3064", len(d.Norm))
+	}
+}
+
 func TestTokenizer(t *testing.T) {
 	// This test focuses primarily on the textual content extracted and does not look
 	// at the other parts of the document.
@@ -229,10 +292,11 @@ The FreeType Project`,
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			d := tokenize([]byte(test.input))
+			dict := newDictionary()
+			d := tokenize([]byte(test.input), dict, true)
 			var b strings.Builder
 			for _, tok := range d.Tokens {
-				b.WriteString(tok.Text)
+				b.WriteString(dict.getWord(tok.ID))
 				b.WriteString(" ")
 			}
 			actual := strings.TrimSpace(b.String())
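Note: the mock schedule {1024, 1020, 1020} in TestTokenizerBuffering follows from the reserved tail: the first ReadFull sees the whole 1024-byte buffer, and each later read refills all but the 4 carried-over bytes, so the stream totals 1024 + 1020 + 1020 = 3064 bytes of 'a', which is exactly the Norm length the test asserts. The hyphenization-recovery case is also observable through the public API; a sketch, assuming the v2 import path and a hypothetical threshold:

```go
package main

import (
	"fmt"

	classifier "github.com/google/licenseclassifier/v2" // assumed import path
)

func main() {
	c := classifier.NewClassifier(0.8) // hypothetical threshold

	// A word hyphenated across a line break is reassembled by the deferredEOL
	// path, mirroring the "hyphenization recovery" test case above.
	fmt.Printf("%s\n", c.Normalize([]byte("basket-\nball"))) // expected: basketball
}
```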