about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- v2/classifier.go 37
-rw-r--r-- v2/classifier_test.go 7
-rw-r--r-- v2/diff_test.go 2
-rw-r--r-- v2/document.go 38
-rw-r--r-- v2/frequencies_test.go 8
-rw-r--r-- v2/licenses/LGPL-2.0.txt 6
-rw-r--r-- v2/licenses/SGI-B-1.1.txt 28
-rw-r--r-- v2/scenarios/168158647 26
-rw-r--r-- v2/scoring.go 64
-rw-r--r-- v2/scoring_test.go 29
-rw-r--r-- v2/searchset.go 61
-rw-r--r-- v2/searchset_test.go 56
-rw-r--r-- v2/tokenizer.go 9
-rw-r--r-- v2/tokenizer_test.go 2
-rw-r--r-- v2/trace.go 111
-rw-r--r-- v2/trace_test.go 117
16 files changed, 409 insertions, 192 deletions
diff --git a/v2/classifier.go b/v2/classifier.go
index e9521f2..dbfed57 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -55,7 +55,7 @@ func (d Matches) Less(i, j int) bool {
}
// Match reports instances of the supplied content in the corpus.
-func (c *Corpus) Match(in string) Matches {
+func (c *Classifier) match(in string) Matches {
id := c.createTargetIndexedDocument(in)
firstPass := make(map[string]*indexedDocument)
@@ -75,11 +75,11 @@ func (c *Corpus) Match(in string) Matches {
var candidates Matches
for l, d := range firstPass {
- matches := findPotentialMatches(d.s, id.s, c.threshold)
+ matches := c.findPotentialMatches(d.s, id.s, c.threshold)
for _, m := range matches {
startIndex := m.TargetStart
endIndex := m.TargetEnd
- conf, startOffset, endOffset := score(l, id, d, startIndex, endIndex)
+ conf, startOffset, endOffset := c.score(l, id, d, startIndex, endIndex)
if conf >= c.threshold && (endIndex-startIndex-startOffset-endOffset) > 0 {
candidates = append(candidates, &Match{
Name: LicenseName(l),
@@ -161,11 +161,28 @@ func (c *Corpus) Match(in string) Matches {
// Classifier provides methods for identifying open source licenses in text
// content.
type Classifier struct {
- Corpus *Corpus
+ tc *TraceConfiguration
+ dict *dictionary
+ docs map[string]*indexedDocument
+ threshold float64
+ q int // The value of q for q-grams in this corpus
}
-// LoadLicenses adds the contents of the supplied directory to the corpus.
-func (c *Corpus) LoadLicenses(dir string) error {
+// NewClassifier creates a classifier with an empty corpus.
+func NewClassifier(threshold float64) *Classifier {
+ classifier := &Classifier{
+ tc: new(TraceConfiguration),
+ dict: newDictionary(),
+ docs: make(map[string]*indexedDocument),
+ threshold: threshold,
+ q: computeQ(threshold),
+ }
+ return classifier
+}
+
+// LoadLicenses adds the contents of the supplied directory to the corpus of the
+// classifier.
+func (c *Classifier) LoadLicenses(dir string) error {
var files []string
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if err != nil {
@@ -195,9 +212,15 @@ func (c *Corpus) LoadLicenses(dir string) error {
return nil
}
+// SetTraceConfiguration installs a tracing configuration for the classifier.
+func (c *Classifier) SetTraceConfiguration(in *TraceConfiguration) {
+ c.tc = in
+ c.tc.init()
+}
+
// Match finds matches within an unknown text.
func (c *Classifier) Match(in string) Matches {
- return c.Corpus.Match(in)
+ return c.match(in)
}
func detectionType(in string) string {
diff --git a/v2/classifier_test.go b/v2/classifier_test.go
index b4907b1..e980c8f 100644
--- a/v2/classifier_test.go
+++ b/v2/classifier_test.go
@@ -35,11 +35,8 @@ var defaultThreshold = .8
var baseLicenses = "./licenses"
func classifier() (*Classifier, error) {
- c := &Classifier{
- Corpus: NewCorpus(defaultThreshold),
- }
-
- return c, c.Corpus.LoadLicenses(baseLicenses)
+ c := NewClassifier(defaultThreshold)
+ return c, c.LoadLicenses(baseLicenses)
}
func TestScenarios(t *testing.T) {
diff --git a/v2/diff_test.go b/v2/diff_test.go
index 6ceceef..bd1d44d 100644
--- a/v2/diff_test.go
+++ b/v2/diff_test.go
@@ -269,7 +269,7 @@ func TestDiffing(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- c := NewCorpus(.8)
+ c := NewClassifier(.8)
c.AddContent("known", test.known)
kd := c.docs["known"]
ud := c.createTargetIndexedDocument(test.unknown)
diff --git a/v2/document.go b/v2/document.go
index eb366eb..1a3350d 100644
--- a/v2/document.go
+++ b/v2/document.go
@@ -71,28 +71,6 @@ func (d *indexedDocument) normalized() string {
return w.String()
}
-// Corpus is a collection of documents with a shared dictionary. Matching occurs
-// within all documents in the corpus.
-// TODO: This type may not be public in the long-term. I need to write the A/B classifier
-// facade and see how it best works out.
-type Corpus struct {
- dict *dictionary
- docs map[string]*indexedDocument
- threshold float64
- q int // The value of q for q-grams in this corpus
-}
-
-// NewCorpus creates an empty corpus.
-func NewCorpus(threshold float64) *Corpus {
- corpus := &Corpus{
- dict: newDictionary(),
- docs: make(map[string]*indexedDocument),
- threshold: threshold,
- q: computeQ(threshold),
- }
- return corpus
-}
-
func computeQ(threshold float64) int {
// q is the lower bound for token runs (q-grams) that must exist
// in content that can be recognized at the specified threshold.
@@ -117,14 +95,14 @@ func max(a, b int) int {
return b
}
-// AddContent incorporates the provided textual content into the corpus for matching.
-func (c *Corpus) AddContent(name, content string) {
+// AddContent incorporates the provided textual content into the classifier for matching.
+func (c *Classifier) AddContent(name, content string) {
doc := tokenize(content)
c.addDocument(name, doc)
}
-// addDocument takes a textual document and incorporates it into the corpus for matching.
-func (c *Corpus) addDocument(name string, doc *document) {
+// addDocument takes a textual document and incorporates it into the classifier for matching.
+func (c *Classifier) addDocument(name string, doc *document) {
// For documents that are part of the corpus, we add them to the dictionary and
// compute their associated search data eagerly so they are ready for matching against
// candidates.
@@ -136,8 +114,8 @@ func (c *Corpus) addDocument(name string, doc *document) {
}
// generateIndexedDocument creates an indexedDocument from the supplied document. if addWords
-// is true, the corpus dictionary is updated with new tokens encountered in the document.
-func (c *Corpus) generateIndexedDocument(d *document, addWords bool) *indexedDocument {
+// is true, the classifier dictionary is updated with new tokens encountered in the document.
+func (c *Classifier) generateIndexedDocument(d *document, addWords bool) *indexedDocument {
id := &indexedDocument{
Tokens: make([]indexedToken, 0, len(d.Tokens)),
dict: c.dict,
@@ -165,9 +143,9 @@ func (c *Corpus) generateIndexedDocument(d *document, addWords bool) *indexedDoc
}
// createTargetIndexedDocument creates an indexed document without adding the
-// words to the corpus dictionary. This should be used for matching targets, not
+// words to the classifier dictionary. This should be used for matching targets, not
// populating the corpus.
-func (c *Corpus) createTargetIndexedDocument(in string) *indexedDocument {
+func (c *Classifier) createTargetIndexedDocument(in string) *indexedDocument {
doc := tokenize(in)
return c.generateIndexedDocument(doc, false)
}
diff --git a/v2/frequencies_test.go b/v2/frequencies_test.go
index 01f9d2d..37092a1 100644
--- a/v2/frequencies_test.go
+++ b/v2/frequencies_test.go
@@ -44,10 +44,10 @@ func TestTokenSimilarity(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- corpus := NewCorpus(.8) // This value doesn't affect the test.
- corpus.AddContent("b", test.b)
- a := corpus.createTargetIndexedDocument(test.a)
- if actual := a.tokenSimilarity(corpus.docs["b"]); actual != test.sim {
+ c := NewClassifier(.8) // This value doesn't affect the test.
+ c.AddContent("b", test.b)
+ a := c.createTargetIndexedDocument(test.a)
+ if actual := a.tokenSimilarity(c.docs["b"]); actual != test.sim {
t.Errorf("got %v want %v", actual, test.sim)
}
})
diff --git a/v2/licenses/LGPL-2.0.txt b/v2/licenses/LGPL-2.0.txt
index cda4be3..6a58dcf 100644
--- a/v2/licenses/LGPL-2.0.txt
+++ b/v2/licenses/LGPL-2.0.txt
@@ -1,9 +1,3 @@
-GNU LIBRARY GENERAL PUBLIC LICENSE
-
-Version 2, June 1991 Copyright (C) 1991 Free Software Foundation, Inc.
-
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
-
Everyone is permitted to copy and distribute verbatim copies of this license
document, but changing it is not allowed.
diff --git a/v2/licenses/SGI-B-1.1.txt b/v2/licenses/SGI-B-1.1.txt
index f2b6e15..68903c5 100644
--- a/v2/licenses/SGI-B-1.1.txt
+++ b/v2/licenses/SGI-B-1.1.txt
@@ -220,31 +220,3 @@ fees and expenses. The application of the United Nations Convention on Contracts
for the International Sale of Goods is expressly excluded. Any law or regulation
that provides that the language of a contract shall be construed against the
drafter shall not apply to this License.
-
-Exhibit A
-
-License Applicability. Except to the extent portions of this file are made
-subject to an alternative license as permitted in the SGI Free Software License
-B, Version 1.1 (the "License"), the contents of this file are subject only to
-the provisions of the License. You may not use this file except in compliance
-with the License. You may obtain a copy of the License at Silicon Graphics,
-Inc., attn: Legal Services, 1600 Amphitheatre Parkway, Mountain View, CA
-94043-1351, or at:
-
-http://oss.sgi.com/projects/FreeB
-
-Note that, as provided in the License, the Software is distributed on an "AS IS"
-basis, with ALL EXPRESS AND IMPLIED WARRANTIES AND CONDITIONS DISCLAIMED,
-INCLUDING, WITHOUT LIMITATION, ANY IMPLIED WARRANTIES AND CONDITIONS OF
-MERCHANTABILITY, SATISFACTORY QUALITY, FITNESS FOR A PARTICULAR PURPOSE, AND
-NON-INFRINGEMENT.
-
-Original Code. The Original Code is: [name of software, version number, and
-release date], developed by Silicon Graphics, Inc. The Original Code is
-Copyright (c) [dates of first publication, as appearing in the Notice in the
-Original Code] Silicon Graphics, Inc. Copyright in any portions created by third
-parties is as indicated elsewhere herein. All Rights Reserved.
-
-Additional Notice Provisions: [such additional provisions, if any, as appear in
-the Notice in the Original Code under the heading "Additional Notice
-Provisions"]
diff --git a/v2/scenarios/168158647 b/v2/scenarios/168158647
new file mode 100644
index 0000000..eca4cb1
--- /dev/null
+++ b/v2/scenarios/168158647
@@ -0,0 +1,26 @@
+GPL versioning construct is different than existing templates.
+EXPECTED:GPL-2.0
+File src/zone.c
+ Copyright © 2011 Mathijs Mohlmann
+ License: GNU General Public License
+
+ This package is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation.
+
+ This package is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this package; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301,
+ USA.
+
+ Some files are licensed under version 2 only, while any others allow one to
+ use version 2 or (at your option) any later version.
+
+On Debian systems, the complete text of the GNU General Public License,
+version 2, can be found in `/usr/share/common-licenses/GPL-2'. The complete
+text of the latest version can be found in `/usr/share/common-licenses/GPL'.
diff --git a/v2/scoring.go b/v2/scoring.go
index 3d06255..3dae721 100644
--- a/v2/scoring.go
+++ b/v2/scoring.go
@@ -32,20 +32,20 @@ const (
// score computes a metric of similarity between the known and unknown
// document, including the offsets into the unknown that yield the content
// generating the computed similarity.
-func score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd int) (float64, int, int) {
- if traceScoring(known.s.origin) {
- Trace("Scoring %s: [%d-%d]\n", known.s.origin, unknownStart, unknownEnd)
+func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd int) (float64, int, int) {
+ if c.tc.traceScoring(known.s.origin) {
+ c.tc.trace("Scoring %s: [%d-%d]", known.s.origin, unknownStart, unknownEnd)
}
knownLength := known.size()
diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength)
start, end := diffRange(known.norm, diffs)
- distance := scoreDiffs(diffs[start:end])
+ distance := scoreDiffs(id, diffs[start:end])
if distance < 0 {
// If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match.
- if traceScoring(known.s.origin) {
- Trace("Distance result %v, rejected match", distance)
+ if c.tc.traceScoring(known.s.origin) {
+ c.tc.trace("Distance result %v, rejected match", distance)
}
return 0.0, 0, 0
}
@@ -64,8 +64,8 @@ func score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd
// target.
conf, so, eo := confidencePercentage(knownLength, distance), textLength(diffs[:start]), textLength(diffs[end:])
- if traceScoring(known.s.origin) {
- Trace("Score result: %v [%d-%d]\n", conf, so, eo)
+ if c.tc.traceScoring(known.s.origin) {
+ c.tc.trace("Score result: %v [%d-%d]", conf, so, eo)
}
return conf, so, eo
}
@@ -110,7 +110,7 @@ func diffLevenshteinWord(diffs []diffmatchpatch.Diff) int {
// negative value means that the changes represented by the diff are not an
// acceptable transformation since it would change the underlying license. A
// positive value indicates the Levenshtein word distance.
-func scoreDiffs(diffs []diffmatchpatch.Diff) int {
+func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int {
// We make a pass looking for unacceptable substitutions
// Delete diffs are always ordered before insert diffs. This is leveraged to
// analyze a change by checking an insert against the delete text that was
@@ -135,25 +135,33 @@ func scoreDiffs(diffs []diffmatchpatch.Diff) int {
// these are words or phrases that appear in a single/small number of
// licenses. Can we leverage frequency analysis to identify these
// interesting words/phrases and auto-extract them?
- for _, p := range []string{
- "autoconf exception",
- "class path exception",
- "gcc linking exception",
- "bison exception",
- "font exception",
- "imagemagick",
- "x consortium",
- "apache",
- "bsd",
- "affero",
- "sun standards",
- "silicon graphics",
- "php",
- "acknowledgment",
- "atmel",
- } {
- if strings.Index(text, p) != -1 {
- return introducedPhraseChange
+
+ inducedPhrases := map[string][]string{
+ "AGPL": {"affero"},
+ "Atmel": {"atmel"},
+ "Apache": {"apache"},
+ "BSD": {"bsd"},
+ "BSD-3-Clause-Attribution": {"acknowledgment"},
+ "GPL-2.0-with-GCC-exception": {"gcc linking exception"},
+ "GPL-2.0-with-autoconf-exception": {"autoconf exception"},
+ "GPL-2.0-with-bison-exception": {"bison exception"},
+ "GPL-2.0-with-classpath-exception": {"class path exception"},
+ "GPL-2.0-with-font-exception": {"font exception"},
+ "LGPL-2.0": {"library"},
+ "ImageMagick": {"imagemagick"},
+ "PHP": {"php"},
+ "SISSL": {"sun standards"},
+ "SGI-B": {"silicon graphics"},
+ "X11": {"x consortium"},
+ }
+
+ for k, ps := range inducedPhrases {
+ if strings.HasPrefix(id, k) {
+ for _, p := range ps {
+ if strings.Index(text, p) != -1 {
+ return introducedPhraseChange
+ }
+ }
}
}
diff --git a/v2/scoring_test.go b/v2/scoring_test.go
index 17f7ce5..d5d4592 100644
--- a/v2/scoring_test.go
+++ b/v2/scoring_test.go
@@ -15,6 +15,8 @@
package classifier
import (
+ "fmt"
+ "strings"
"testing"
"github.com/sergi/go-diff/diffmatchpatch"
@@ -93,6 +95,7 @@ func TestLevenshteinDiff(t *testing.T) {
func TestScoreDiffs(t *testing.T) {
tests := []struct {
name string
+ license string
diffs []diffmatchpatch.Diff
expected int
}{
@@ -162,7 +165,8 @@ func TestScoreDiffs(t *testing.T) {
expected: lesserGPLChange,
},
{
- name: "license name change by name insertion",
+ name: "license name change by name insertion",
+ license: "ImageMagick",
diffs: []diffmatchpatch.Diff{
{
Type: diffmatchpatch.DiffEqual,
@@ -179,7 +183,7 @@ func TestScoreDiffs(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- if got := scoreDiffs(test.diffs); got != test.expected {
+ if got := scoreDiffs(test.license, test.diffs); got != test.expected {
t.Errorf("got %d, want %d", got, test.expected)
}
})
@@ -258,19 +262,36 @@ func TestScore(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- c := NewCorpus(.8)
+ var trace strings.Builder
+ c := NewClassifier(.8)
+ c.SetTraceConfiguration(&TraceConfiguration{
+ TraceLicenses: "*",
+ TracePhases: "*",
+ Tracer: func(f string, args ...interface{}) {
+ trace.WriteString(fmt.Sprintf(f, args...))
+ },
+ })
c.AddContent("known", test.known)
kd := c.docs["known"]
ud := c.createTargetIndexedDocument(test.unknown)
- conf, so, eo := score(test.name, ud, kd, 0, ud.size())
+ conf, so, eo := c.score(test.name, ud, kd, 0, ud.size())
+
+ success := true
if conf != test.expectedConf {
t.Errorf("conf: got %v want %v", conf, test.expectedConf)
+ success = false
}
if so != test.expectedStart {
t.Errorf("start offset: got %v want %v", so, test.expectedStart)
+ success = false
}
if eo != test.expectedEnd {
t.Errorf("end offset: got %v want %v", so, test.expectedEnd)
+ success = false
+ }
+
+ if !success {
+ t.Errorf("Trace:\n%s", trace.String())
}
})
}
diff --git a/v2/searchset.go b/v2/searchset.go
index 094c9fe..f43c4f2 100644
--- a/v2/searchset.go
+++ b/v2/searchset.go
@@ -169,10 +169,10 @@ func (m matchRanges) Less(i, j int) bool {
// findPotentialMatches returns the ranges in the target (unknown) text that
// are best potential matches to the source (known) text.
-func findPotentialMatches(src, target *searchSet, confidence float64) matchRanges {
- matchedRanges := getMatchedRanges(src, target, confidence, src.q)
- if traceSearchset(src.origin) {
- Trace("matchedRanges = %s", spew.Sdump(matchedRanges))
+func (c *Classifier) findPotentialMatches(src, target *searchSet, confidence float64) matchRanges {
+ matchedRanges := c.getMatchedRanges(src, target, confidence, src.q)
+ if c.tc.traceSearchset(src.origin) {
+ c.tc.trace("matchedRanges = %s", spew.Sdump(matchedRanges))
}
if len(matchedRanges) == 0 {
return nil
@@ -199,7 +199,7 @@ func findPotentialMatches(src, target *searchSet, confidence float64) matchRange
// target document. This routine intentionally does not accurately track error
// contributions from merging runs, trading false positives (but not false
// negatives), for faster performance.
-func fuseRanges(origin string, matched matchRanges, confidence float64, size int, runs []matchRange, targetSize int) matchRanges {
+func (c *Classifier) fuseRanges(origin string, matched matchRanges, confidence float64, size int, runs []matchRange, targetSize int) matchRanges {
var claimed matchRanges
errorMargin := int(math.Round(float64(size) * (1.0 - confidence)))
@@ -297,15 +297,15 @@ func fuseRanges(origin string, matched matchRanges, confidence float64, size int
if unclaimed && m.TokensClaimed*10 > matched[0].TokensClaimed {
claimed = append(claimed, m)
}
- if traceSearchset(origin) {
- Trace("after %d ranges, claimed is %s\n", i, spew.Sdump(claimed))
+ if c.tc.traceSearchset(origin) {
+ c.tc.trace("after %d ranges, claimed is %s", i, spew.Sdump(claimed))
}
}
sort.Sort(claimed)
- if traceSearchset(origin) {
- Trace("filterPasses = %+v\n", filterPasses)
- Trace("filterDrops = %+v\n", filterDrops)
- Trace("claimed = %s", spew.Sdump(claimed))
+ if c.tc.traceSearchset(origin) {
+ c.tc.trace("filterPasses = %+v", filterPasses)
+ c.tc.trace("filterDrops = %+v", filterDrops)
+ c.tc.trace("claimed = %s", spew.Sdump(claimed))
}
return claimed
}
@@ -313,15 +313,17 @@ func fuseRanges(origin string, matched matchRanges, confidence float64, size int
// getMatchedRanges finds the ranges in the target text that match the source
// text. The ranges returned are ordered from the entries with the most matched
// tokens to the least.
-func getMatchedRanges(src, target *searchSet, confidence float64, q int) matchRanges {
- if traceSearchset(src.origin) {
- Trace("src.origin = %+v\n", src.origin)
+func (c *Classifier) getMatchedRanges(src, target *searchSet, confidence float64, q int) matchRanges {
+ shouldTrace := c.tc.traceSearchset(src.origin)
+
+ if shouldTrace {
+ c.tc.trace("src.origin = %+v", src.origin)
}
// Assemble a list of all the matched q-grams without any consideration to
// error tolerances.
matched := targetMatchedRanges(src, target)
- if traceSearchset(src.origin) {
- Trace("matched = %s", spew.Sdump(matched))
+ if shouldTrace {
+ c.tc.trace("matched = %s", spew.Sdump(matched))
}
if len(matched) == 0 {
return nil
@@ -340,10 +342,10 @@ func getMatchedRanges(src, target *searchSet, confidence float64, q int) matchRa
// significantly since processing token matches is an N^2 (or worse)
// operation, so reducing N is a big win.
- runs := detectRuns(src.origin, matched, len(target.Tokens), len(src.Tokens), confidence, q)
+ runs := c.detectRuns(src.origin, matched, len(target.Tokens), len(src.Tokens), confidence, q)
- if traceSearchset(src.origin) {
- Trace("runs = %d: %s\n", len(runs), spew.Sdump(runs))
+ if shouldTrace {
+ c.tc.trace("runs = %d: %s", len(runs), spew.Sdump(runs))
}
// If there are no target runs of source tokens, we're done.
@@ -355,14 +357,15 @@ func getMatchedRanges(src, target *searchSet, confidence float64, q int) matchRa
// match ranges into larger matches (with possible errors) to see if we can
// produce large enough runs that pass the confidence threshold.
- fr := fuseRanges(src.origin, matched, confidence, len(src.Tokens), runs, len(target.Tokens))
- if traceSearchset(src.origin) {
- Trace("fr = %s", spew.Sdump(fr))
+ fr := c.fuseRanges(src.origin, matched, confidence, len(src.Tokens), runs, len(target.Tokens))
+ if shouldTrace {
+ c.tc.trace("fr = %s", spew.Sdump(fr))
}
return fr
}
-func detectRuns(origin string, matched matchRanges, targetLength, subsetLength int, threshold float64, q int) []matchRange {
+func (c *Classifier) detectRuns(origin string, matched matchRanges, targetLength, subsetLength int, threshold float64, q int) []matchRange {
+ shouldTrace := c.tc.traceSearchset(origin)
hits := make([]bool, targetLength)
for _, m := range matched {
for idx := m.TargetStart; idx < m.TargetEnd; idx++ {
@@ -377,17 +380,17 @@ func detectRuns(origin string, matched matchRanges, targetLength, subsetLength i
total := 0
target := int(float64(subsetLength) * threshold)
- if traceSearchset(origin) {
- Trace("target = %+v\n", target)
- Trace("targetLength = %+v\n", targetLength)
- Trace("subsetLength = %+v\n", subsetLength)
+ if shouldTrace {
+ c.tc.trace("target = %+v", target)
+ c.tc.trace("targetLength = %+v", targetLength)
+ c.tc.trace("subsetLength = %+v", subsetLength)
}
// If we don't have at least 1 subset (i.e. the target is shorter than the
// source) just analyze what we have.
if len(hits) < subsetLength {
- if traceSearchset(origin) {
- Trace("trimmed search length from %d to %d\n", subsetLength, len(hits))
+ if shouldTrace {
+ c.tc.trace("trimmed search length from %d to %d", subsetLength, len(hits))
}
subsetLength = len(hits)
}
diff --git a/v2/searchset_test.go b/v2/searchset_test.go
index 19ea413..a231311 100644
--- a/v2/searchset_test.go
+++ b/v2/searchset_test.go
@@ -15,7 +15,9 @@
package classifier
import (
+ "fmt"
"reflect"
+ "strings"
"testing"
"github.com/davecgh/go-spew/spew"
@@ -86,10 +88,19 @@ func TestSearchSet_New(t *testing.T) {
}
for _, tt := range tests {
- corpus := NewCorpus(.8) // This value doesn't affect the test.
- corpus.AddContent("text", tt.text)
- if got := newSearchSet(corpus.docs["text"], tt.q); !reflect.DeepEqual(got, tt.want) {
+ var trace strings.Builder
+ c := NewClassifier(.8) // This value doesn't affect the test.
+ c.SetTraceConfiguration(&TraceConfiguration{
+ TraceLicenses: "*",
+ TracePhases: "*",
+ Tracer: func(f string, args ...interface{}) {
+ trace.WriteString(fmt.Sprintf(f, args...))
+ },
+ })
+ c.AddContent("text", tt.text)
+ if got := newSearchSet(c.docs["text"], tt.q); !reflect.DeepEqual(got, tt.want) {
t.Errorf("New(%q) = %+v, want %+v", tt.description, spew.Sdump(got), spew.Sdump(tt.want))
+ t.Errorf("Trace:\n%s", trace.String())
}
}
}
@@ -133,14 +144,23 @@ func TestFindPotentialMatches(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- c := NewCorpus(test.confidence)
+ var trace strings.Builder
+ c := NewClassifier(test.confidence)
+ c.SetTraceConfiguration(&TraceConfiguration{
+ TraceLicenses: "*",
+ TracePhases: "*",
+ Tracer: func(f string, args ...interface{}) {
+ trace.WriteString(fmt.Sprintf(f, args...))
+ },
+ })
c.AddContent("source", test.src)
doc := c.createTargetIndexedDocument(test.target)
doc.generateSearchSet(c.q)
- hits := findPotentialMatches(c.docs["source"].s, doc.s, test.confidence)
+ hits := c.findPotentialMatches(c.docs["source"].s, doc.s, test.confidence)
if actual := len(hits); actual != test.expectedHits {
t.Errorf("got %d hits, wanted %d", actual, test.expectedHits)
+ t.Errorf("Trace:\n%s", trace.String())
}
})
}
@@ -249,10 +269,20 @@ func TestFuseRanges(t *testing.T) {
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- runs := detectRuns(test.name, test.in, 100, 100, test.conf, 4)
- actual := fuseRanges(test.name, test.in, test.conf, test.size, runs, 100)
+ var trace strings.Builder
+ c := NewClassifier(.8)
+ c.SetTraceConfiguration(&TraceConfiguration{
+ TraceLicenses: "*",
+ TracePhases: "*",
+ Tracer: func(f string, args ...interface{}) {
+ trace.WriteString(fmt.Sprintf(f, args...))
+ },
+ })
+ runs := c.detectRuns(test.name, test.in, 100, 100, test.conf, 4)
+ actual := c.fuseRanges(test.name, test.in, test.conf, test.size, runs, 100)
if !cmp.Equal(actual, test.out) {
t.Errorf("%v: %v", test.name, cmp.Diff(actual, test.out))
+ t.Errorf("Trace:\n%s", trace.String())
}
})
}
@@ -338,8 +368,18 @@ func TestDetectRuns(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- if got := detectRuns(test.name, test.matched, test.targetLength, test.subsetLength, test.threshold, test.q); !cmp.Equal(got, test.expected) {
+ var trace strings.Builder
+ c := NewClassifier(.8)
+ c.SetTraceConfiguration(&TraceConfiguration{
+ TraceLicenses: "*",
+ TracePhases: "*",
+ Tracer: func(f string, args ...interface{}) {
+ trace.WriteString(fmt.Sprintf(f, args...))
+ },
+ })
+ if got := c.detectRuns(test.name, test.matched, test.targetLength, test.subsetLength, test.threshold, test.q); !cmp.Equal(got, test.expected) {
t.Errorf(cmp.Diff(got, test.expected))
+ t.Errorf("Trace:\n%s", trace.String())
}
})
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 6138112..beb7961 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -45,7 +45,14 @@ func cleanupToken(in string) string {
out.WriteRune(c)
}
}
- return out.String()
+
+ // Numbers should not end in a '.': a trailing period is not part of a
+ // version number; it usually marks the end of a sentence.
+ res := out.String()
+ for strings.HasSuffix(res, ".") {
+ res = res[0 : len(res)-1]
+ }
+ return res
}
}
diff --git a/v2/tokenizer_test.go b/v2/tokenizer_test.go
index 4a4639b..7ea66cc 100644
--- a/v2/tokenizer_test.go
+++ b/v2/tokenizer_test.go
@@ -202,7 +202,7 @@ func TestTokenizer(t *testing.T) {
{
name: "preserve version number (not a header, but header-looking) not at beginning of sentence",
input: "This is version 1.1.",
- output: "this is version 1.1.",
+ output: "this is version 1.1",
},
{
name: "copyright inside a comment",
diff --git a/v2/trace.go b/v2/trace.go
index 9108d3b..cdb03a8 100644
--- a/v2/trace.go
+++ b/v2/trace.go
@@ -15,33 +15,45 @@
package classifier
import (
- "flag"
"fmt"
"strings"
)
// This file contains routines for a simple trace execution mechanism.
-//
-// The constant map lookups do incur some overhead and could be optimized. One possible approach
-// would be to sample the values at the time Match() is called and then store the results in a cached
-// format. This would have to be done in a threadsafe manner.
-var traceLicensesFlag = flag.String("trace_licenses", "", "comma-separated list of licenses for tracing")
-var tracePhasesFlag = flag.String("trace_phases", "", "comma-separated list of licenses for tracing")
-
-func initTrace() {
- // Sample the command line flags and set the tracing variables
- traceLicenses = make(map[string]bool)
- tracePhases = make(map[string]bool)
-
- if len(*traceLicensesFlag) > 0 {
- for _, lic := range strings.Split(*traceLicensesFlag, ",") {
- traceLicenses[lic] = true
+
+// TraceConfiguration specifies the configuration for tracing execution of the
+// license classifier.
+type TraceConfiguration struct {
+ // Comma-separated list of phases to be traced. Can use * for all phases.
+ TracePhases string
+ // Comma-separated list of licenses to be traced. Can use * as a suffix to
+ // match prefixes, or by itself to match all licenses.
+ TraceLicenses string
+
+ // Tracer specifies a TraceFunc used to capture tracing information.
+ // If not supplied, output is emitted with fmt.Printf followed by a newline.
+ Tracer TraceFunc
+ tracePhases map[string]bool
+ traceLicenses map[string]bool
+}
+
+func (t *TraceConfiguration) init() {
+ if t == nil {
+ return
+ }
+ // Sample the config values to create the lookup maps
+ t.traceLicenses = make(map[string]bool)
+ t.tracePhases = make(map[string]bool)
+
+ if len(t.TraceLicenses) > 0 {
+ for _, lic := range strings.Split(t.TraceLicenses, ",") {
+ t.traceLicenses[lic] = true
}
}
- if len(*tracePhasesFlag) > 0 {
- for _, phase := range strings.Split(*tracePhasesFlag, ",") {
- tracePhases[phase] = true
+ if len(t.TracePhases) > 0 {
+ for _, phase := range strings.Split(t.TracePhases, ",") {
+ t.tracePhases[phase] = true
}
}
}
@@ -49,32 +61,61 @@ func initTrace() {
var traceLicenses map[string]bool
var tracePhases map[string]bool
-func shouldTrace(phase string) bool {
- return tracePhases[phase]
+func (t *TraceConfiguration) shouldTrace(phase string) bool {
+ if t == nil {
+ return false
+ }
+ if t.tracePhases["*"] {
+ return true
+ }
+ return t.tracePhases[phase]
}
-func isTraceLicense(lic string) bool {
- return traceLicenses[lic]
+func (t *TraceConfiguration) isTraceLicense(lic string) bool {
+ if t == nil {
+ return false
+ }
+ if t.traceLicenses[lic] {
+ return true
+ }
+
+ for e := range t.traceLicenses {
+ if idx := strings.Index(e, "*"); idx != -1 {
+ if strings.HasPrefix(lic, e[0:idx]) {
+ return true
+ }
+ }
+ }
+
+ return false
}
-func traceSearchset(lic string) bool {
- return traceLicenses[lic] && shouldTrace("searchset")
+func (t *TraceConfiguration) trace(f string, args ...interface{}) {
+ if t == nil || t.Tracer == nil {
+ fmt.Printf(f, args...)
+ fmt.Println()
+ return
+ }
+
+ t.Tracer(f, args...)
}
-func traceTokenize(lic string) bool {
- return traceLicenses[lic] && shouldTrace("tokenize")
+func (t *TraceConfiguration) traceSearchset(lic string) bool {
+ return t.isTraceLicense(lic) && t.shouldTrace("searchset")
}
-func traceScoring(lic string) bool {
- return traceLicenses[lic] && shouldTrace("score")
+func (t *TraceConfiguration) traceTokenize(lic string) bool {
+ return t.isTraceLicense(lic) && t.shouldTrace("tokenize")
}
-func traceFrequency(lic string) bool {
- return traceLicenses[lic] && shouldTrace("frequency")
+func (t *TraceConfiguration) traceScoring(lic string) bool {
+ return t.isTraceLicense(lic) && t.shouldTrace("score")
}
-type traceFunc func(string, ...interface{}) (int, error)
+func (t *TraceConfiguration) traceFrequency(lic string) bool {
+ return t.isTraceLicense(lic) && t.shouldTrace("frequency")
+}
-// Trace holds the function that should be called to emit data. This can be overridden as desired,
-// defaulting to output on stdout.
-var Trace traceFunc = fmt.Printf
+// TraceFunc works like fmt.Printf to emit tracing data for the
+// classifier.
+type TraceFunc func(string, ...interface{})
diff --git a/v2/trace_test.go b/v2/trace_test.go
index 6a95a72..ec30002 100644
--- a/v2/trace_test.go
+++ b/v2/trace_test.go
@@ -51,15 +51,122 @@ func TestInitTrace(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- *traceLicensesFlag = test.licFlag
- *tracePhasesFlag = test.phaseFlag
- initTrace()
- if !cmp.Equal(traceLicenses, test.expectedLics) {
+ tc := &TraceConfiguration{
+ TraceLicenses: test.licFlag,
+ TracePhases: test.phaseFlag,
+ }
+ tc.init()
+ if !cmp.Equal(tc.traceLicenses, test.expectedLics) {
t.Errorf("got %v want %v", traceLicenses, test.expectedLics)
}
- if !cmp.Equal(tracePhases, test.expectedPhases) {
+ if !cmp.Equal(tc.tracePhases, test.expectedPhases) {
t.Errorf("got %v want %v", traceLicenses, test.expectedPhases)
}
})
}
}
+
+func TestPhaseWildcardMatching(t *testing.T) {
+ tests := []struct {
+ name string
+ phases string
+ hits []string
+ misses []string
+ }{
+ {
+ name: "exact match",
+ phases: "scoring",
+ hits: []string{"scoring"},
+ misses: []string{"tokenize"},
+ },
+ {
+ name: "all match",
+ phases: "*",
+ hits: []string{"scoring", "tokenize"},
+ misses: nil,
+ },
+ }
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ tc := &TraceConfiguration{
+ TracePhases: test.phases,
+ }
+ tc.init()
+ for _, h := range test.hits {
+ if !tc.shouldTrace(h) {
+ t.Errorf("unexpected miss on phase %s", h)
+ }
+ }
+
+ for _, m := range test.misses {
+ if tc.shouldTrace(m) {
+ t.Errorf("unexpected hit on phase %s", m)
+ }
+ }
+ })
+ }
+}
+
+func TestLicenseWildcardMatching(t *testing.T) {
+ tests := []struct {
+ name string
+ licenses string
+ hits []string
+ misses []string
+ }{
+ {
+ name: "exact match",
+ hits: []string{"GPL-2.0"},
+ misses: []string{"Apache-2.0", "GPL-3.0"},
+ licenses: "GPL-2.0",
+ },
+ {
+ name: "prefix match",
+ hits: []string{"GPL-2.0", "GPL-3.0"},
+ misses: []string{"Apache-2.0"},
+ licenses: "GPL-*",
+ },
+ {
+ name: "all match",
+ hits: []string{"GPL-2.0", "GPL-3.0", "Apache-2.0"},
+ misses: nil,
+ licenses: "*",
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ tc := &TraceConfiguration{
+ TraceLicenses: test.licenses,
+ }
+ tc.init()
+ for _, h := range test.hits {
+ if !tc.isTraceLicense(h) {
+ t.Errorf("unexpected miss on license %s", h)
+ }
+ }
+
+ for _, m := range test.misses {
+ if tc.isTraceLicense(m) {
+ t.Errorf("unexpected hit on license %s", m)
+ }
+ }
+ })
+ }
+}
+
+// The TraceConfiguration is only initialized explicitly, and it is propagated
+// to a variety of helper structs. For convenience, its methods are safe to
+// call on a nil pointer. This test verifies that behavior, so users of the
+// TraceConfiguration don't need to initialize it explicitly.
+func TestNilSafety(t *testing.T) {
+ var tc *TraceConfiguration
+ tc.init()
+ if tc.isTraceLicense("GPL-2.0") {
+ t.Errorf("unexpected hit on license")
+ }
+
+ if tc.shouldTrace("scoring") {
+ t.Errorf("unexpected hit on phase")
+ }
+}