diff options
-rw-r--r-- | classifier.go | 46 | ||||
-rw-r--r-- | classifier_test.go | 38 | ||||
-rw-r--r-- | commentparser/comment_parser_test.go | 2 | ||||
-rw-r--r-- | go.mod | 9 | ||||
-rw-r--r-- | go.sum | 11 | ||||
-rw-r--r-- | license_type.go | 4 | ||||
-rw-r--r-- | serializer/serializer_test.go | 2 | ||||
-rw-r--r-- | tools/identify_license/bench_test.go | 96 |
8 files changed, 94 insertions, 114 deletions
diff --git a/classifier.go b/classifier.go index 8d39caf..0af9c1c 100644 --- a/classifier.go +++ b/classifier.go @@ -75,31 +75,49 @@ type License struct { // Threshold is the lowest confidence percentage acceptable for the // classifier. Threshold float64 + + // archive is the path to the license archive + archive string +} + +// OptionFunc set options on a License struct. +type OptionFunc func(l *License) error + +// Archive is an OptionFunc to specify the location of the license archive file. +func Archive(f string) OptionFunc { + return func(l *License) error { + l.archive = f + return nil + } } // New creates a license classifier and pre-loads it with known open source licenses. -func New(threshold float64) (*License, error) { +func New(threshold float64, options ...OptionFunc) (*License, error) { classifier := &License{ c: stringclassifier.New(threshold, Normalizers...), Threshold: threshold, + archive: LicenseArchive, } - if err := classifier.registerLicenses(LicenseArchive); err != nil { - return nil, fmt.Errorf("cannot register licenses: %v", err) + + for _, o := range options { + err := o(classifier) + if err != nil { + return nil, fmt.Errorf("error setting option %v: %v", o, err) + } + } + + if err := classifier.registerLicenses(); err != nil { + return nil, fmt.Errorf("cannot register licenses from %q: %v", classifier.archive, err) } return classifier, nil } // NewWithForbiddenLicenses creates a license classifier and pre-loads it with // known open source licenses which are forbidden. -func NewWithForbiddenLicenses(threshold float64) (*License, error) { - classifier := &License{ - c: stringclassifier.New(threshold, Normalizers...), - Threshold: threshold, - } - if err := classifier.registerLicenses(ForbiddenLicenseArchive); err != nil { - return nil, fmt.Errorf("cannot register licenses: %v", err) - } - return classifier, nil +func NewWithForbiddenLicenses(threshold float64, options ...OptionFunc) (*License, error) { + opts := []OptionFunc{Archive(ForbiddenLicenseArchive)} + opts = append(opts, options...) + return New(threshold, opts...) } // WithinConfidenceThreshold returns true if the confidence value is above or @@ -178,8 +196,8 @@ type archivedValue struct { // registerLicenses loads all known licenses and adds them to c as known values // for comparison. The allocated space after ingesting the 'licenses.db' // archive is ~167M. -func (c *License) registerLicenses(archive string) error { - contents, err := ReadLicenseFile(archive) +func (c *License) registerLicenses() error { + contents, err := ReadLicenseFile(c.archive) if err != nil { return err } diff --git a/classifier_test.go b/classifier_test.go index 7ba32e6..b997a35 100644 --- a/classifier_test.go +++ b/classifier_test.go @@ -797,3 +797,41 @@ func BenchmarkClassifier(b *testing.B) { classifier.NearestMatch(contents) } } + +func TestNew(t *testing.T) { + tests := []struct { + desc string + options []OptionFunc + wantArchive string + wantErr bool + }{ + { + desc: "no options, use default", + options: []OptionFunc{}, + wantArchive: LicenseArchive, + }, + { + desc: "specify ForbiddenLicenseArchive", + options: []OptionFunc{Archive(ForbiddenLicenseArchive)}, + wantArchive: ForbiddenLicenseArchive, + }, + { + desc: "file doesn't exist results in error", + options: []OptionFunc{Archive("doesnotexist")}, + wantArchive: "doesnotexist", + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.desc, func(t *testing.T) { + c, err := New(0.5, tt.options...) + if tt.wantErr != (err != nil) { + t.Fatalf("unexpected error: %v", err) + } + if err == nil && c.archive != tt.wantArchive { + t.Errorf("got archive %v, want %v", c.archive, tt.wantArchive) + } + }) + } + +} diff --git a/commentparser/comment_parser_test.go b/commentparser/comment_parser_test.go index d1e0d1a..6b5429a 100644 --- a/commentparser/comment_parser_test.go +++ b/commentparser/comment_parser_test.go @@ -18,7 +18,7 @@ import ( "reflect" "testing" - "github.com/google/go-cmp" + "github.com/google/go-cmp/cmp" "github.com/google/licenseclassifier/commentparser/language" ) @@ -0,0 +1,9 @@ +module github.com/google/licenseclassifier + +go 1.11 + +require ( + github.com/google/go-cmp v0.2.0 + github.com/sergi/go-diff v1.0.0 + github.com/stretchr/testify v1.3.0 // indirect +) @@ -0,0 +1,11 @@ +github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/google/go-cmp v0.2.0 h1:+dTQ8DZQJz0Mb/HjFlkptS1FeQ4cWSnN941F8aEG4SQ= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sergi/go-diff v1.0.0 h1:Kpca3qRNrduNnOQeazBd0ysaKrUJiIuISHxogkT9RPQ= +github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= diff --git a/license_type.go b/license_type.go index 43923be..25cb369 100644 --- a/license_type.go +++ b/license_type.go @@ -22,9 +22,9 @@ package licenseclassifier import "github.com/google/licenseclassifier/internal/sets" // Canonical names of the licenses. +// The names come from the https://spdx.org/licenses website, and are +// also the filenames of the licenses in licenseclassifier/licenses. const ( - // The names come from the https://spdx.org/licenses website, and are - // also the filenames of the licenses in licenseclassifier/licenses. AFL11 = "AFL-1.1" AFL12 = "AFL-1.2" AFL20 = "AFL-2.0" diff --git a/serializer/serializer_test.go b/serializer/serializer_test.go index 1bca727..755c601 100644 --- a/serializer/serializer_test.go +++ b/serializer/serializer_test.go @@ -213,7 +213,7 @@ func compareSearchSets(x, y *searchset.SearchSet) error { return fmt.Errorf("Hash keys differ = %d vs %d", xKeys[i], yKeys[i]) } if !reflect.DeepEqual(x.Hashes[xKeys[i]], y.Hashes[yKeys[i]]) { - return fmt.Errorf("Hash values differ = %d vs %d", x.Hashes[xKeys[i]], y.Hashes[yKeys[i]]) + return fmt.Errorf("Hash values differ = %v vs %v", x.Hashes[xKeys[i]], y.Hashes[yKeys[i]]) } } diff --git a/tools/identify_license/bench_test.go b/tools/identify_license/bench_test.go deleted file mode 100644 index 1320a87..0000000 --- a/tools/identify_license/bench_test.go +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright 2017 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// The identify_license program tries to identify the license type of an -// unknown license. The file containing the license text is specified on the -// command line. Multiple license files can be analyzed with a single command. -// The type of the license is returned along with the confidence level of the -// match. The confidence level is between 0.0 and 1.0, with 1.0 indicating an -// exact match and 0.0 indicating a complete mismatch. The results are sorted -// by confidence level. -// -// $ identifylicense LICENSE1 LICENSE2 -// LICENSE2: MIT (confidence: 0.987) -// LICENSE1: BSD-2-Clause (confidence: 0.833) - -package bench_test - -import ( - "context" - "fmt" - "log" - "os" - "path/filepath" - "sort" - "testing" - "time" - - "github.com/google/licenseclassifier" - "github.com/google/licenseclassifier/tools/identify_license/backend" - "google3/base/go/flag" -) - -var ( - headers = flag.Bool("headers", false, "match license headers") - forbiddenOnly = flag.Bool("forbidden", false, "identify using forbidden licenses archive") - threshold = flag.Float64("threshold", licenseclassifier.DefaultConfidenceThreshold, "confidence threshold") - timeout = flag.Duration("timeout", 24*time.Hour, "timeout before giving up on classifying a file.") -) - -func init() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, `Usage: %s <licensefile> ... - -Identify an unknown license. - -Options: -`, filepath.Base(os.Args[0])) - flag.PrintDefaults() - } -} - -func BenchmarkIdentifyLicense(b *testing.B) { - be, err := backend.New(*threshold, *forbiddenOnly) - if err != nil { - be.Close() - log.Fatalf("cannot create license classifier: %v", err) - } - - ctx, cancel := context.WithTimeout(context.Background(), *timeout) - defer func() { - b.StopTimer() - cancel() - }() - b.StartTimer() - if errs := be.ClassifyLicensesWithContext(ctx, flag.Args(), *headers); errs != nil { - be.Close() - for _, err := range errs { - log.Printf("classify license failed: %v", err) - } - log.Fatal("cannot classify licenses") - } - - results := be.GetResults() - if len(results) == 0 { - be.Close() - log.Fatal("Couldn't classify license(s)") - } - - sort.Sort(results) - for _, r := range results { - fmt.Printf("%s: %s (confidence: %v, offset: %v, extent: %v)\n", - r.Filename, r.Name, r.Confidence, r.Offset, r.Extent) - } - be.Close() -} |