diff options
-rw-r--r-- | classifier.go | 46 | ||||
-rw-r--r-- | classifier_test.go | 38 | ||||
-rw-r--r-- | commentparser/comment_parser_test.go | 2 | ||||
-rw-r--r-- | go.mod | 9 | ||||
-rw-r--r-- | go.sum | 11 | ||||
-rw-r--r-- | license_type.go | 4 | ||||
-rw-r--r-- | serializer/serializer_test.go | 2 | ||||
-rw-r--r-- | tools/identify_license/bench_test.go | 96 |
8 files changed, 114 insertions, 94 deletions
diff --git a/classifier.go b/classifier.go index 0af9c1c..8d39caf 100644 --- a/classifier.go +++ b/classifier.go @@ -75,49 +75,31 @@ type License struct { // Threshold is the lowest confidence percentage acceptable for the // classifier. Threshold float64 - - // archive is the path to the license archive - archive string -} - -// OptionFunc set options on a License struct. -type OptionFunc func(l *License) error - -// Archive is an OptionFunc to specify the location of the license archive file. -func Archive(f string) OptionFunc { - return func(l *License) error { - l.archive = f - return nil - } } // New creates a license classifier and pre-loads it with known open source licenses. -func New(threshold float64, options ...OptionFunc) (*License, error) { +func New(threshold float64) (*License, error) { classifier := &License{ c: stringclassifier.New(threshold, Normalizers...), Threshold: threshold, - archive: LicenseArchive, } - - for _, o := range options { - err := o(classifier) - if err != nil { - return nil, fmt.Errorf("error setting option %v: %v", o, err) - } - } - - if err := classifier.registerLicenses(); err != nil { - return nil, fmt.Errorf("cannot register licenses from %q: %v", classifier.archive, err) + if err := classifier.registerLicenses(LicenseArchive); err != nil { + return nil, fmt.Errorf("cannot register licenses: %v", err) } return classifier, nil } // NewWithForbiddenLicenses creates a license classifier and pre-loads it with // known open source licenses which are forbidden. -func NewWithForbiddenLicenses(threshold float64, options ...OptionFunc) (*License, error) { - opts := []OptionFunc{Archive(ForbiddenLicenseArchive)} - opts = append(opts, options...) - return New(threshold, opts...) +func NewWithForbiddenLicenses(threshold float64) (*License, error) { + classifier := &License{ + c: stringclassifier.New(threshold, Normalizers...), + Threshold: threshold, + } + if err := classifier.registerLicenses(ForbiddenLicenseArchive); err != nil { + return nil, fmt.Errorf("cannot register licenses: %v", err) + } + return classifier, nil } // WithinConfidenceThreshold returns true if the confidence value is above or @@ -196,8 +178,8 @@ type archivedValue struct { // registerLicenses loads all known licenses and adds them to c as known values // for comparison. The allocated space after ingesting the 'licenses.db' // archive is ~167M. -func (c *License) registerLicenses() error { - contents, err := ReadLicenseFile(c.archive) +func (c *License) registerLicenses(archive string) error { + contents, err := ReadLicenseFile(archive) if err != nil { return err } diff --git a/classifier_test.go b/classifier_test.go index b997a35..7ba32e6 100644 --- a/classifier_test.go +++ b/classifier_test.go @@ -797,41 +797,3 @@ func BenchmarkClassifier(b *testing.B) { classifier.NearestMatch(contents) } } - -func TestNew(t *testing.T) { - tests := []struct { - desc string - options []OptionFunc - wantArchive string - wantErr bool - }{ - { - desc: "no options, use default", - options: []OptionFunc{}, - wantArchive: LicenseArchive, - }, - { - desc: "specify ForbiddenLicenseArchive", - options: []OptionFunc{Archive(ForbiddenLicenseArchive)}, - wantArchive: ForbiddenLicenseArchive, - }, - { - desc: "file doesn't exist results in error", - options: []OptionFunc{Archive("doesnotexist")}, - wantArchive: "doesnotexist", - wantErr: true, - }, - } - for _, tt := range tests { - t.Run(tt.desc, func(t *testing.T) { - c, err := New(0.5, tt.options...) - if tt.wantErr != (err != nil) { - t.Fatalf("unexpected error: %v", err) - } - if err == nil && c.archive != tt.wantArchive { - t.Errorf("got archive %v, want %v", c.archive, tt.wantArchive) - } - }) - } - -} diff --git a/commentparser/comment_parser_test.go b/commentparser/comment_parser_test.go index 6b5429a..d1e0d1a 100644 --- a/commentparser/comment_parser_test.go +++ b/commentparser/comment_parser_test.go @@ -18,7 +18,7 @@ import ( "reflect" "testing" - "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp" "github.com/google/licenseclassifier/commentparser/language" ) @@ -1,9 +0,0 @@ -module github.com/google/licenseclassifier - -go 1.11 - -require ( - github.com/google/go-cmp v0.2.0 - github.com/sergi/go-diff v1.0.0 - github.com/stretchr/testify v1.3.0 // indirect -) @@ -1,11 +0,0 @@ -github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/google/go-cmp v0.2.0 h1:+dTQ8DZQJz0Mb/HjFlkptS1FeQ4cWSnN941F8aEG4SQ= -github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/sergi/go-diff v1.0.0 h1:Kpca3qRNrduNnOQeazBd0ysaKrUJiIuISHxogkT9RPQ= -github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= -github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= diff --git a/license_type.go b/license_type.go index 25cb369..43923be 100644 --- a/license_type.go +++ b/license_type.go @@ -22,9 +22,9 @@ package licenseclassifier import "github.com/google/licenseclassifier/internal/sets" // Canonical names of the licenses. -// The names come from the https://spdx.org/licenses website, and are -// also the filenames of the licenses in licenseclassifier/licenses. const ( + // The names come from the https://spdx.org/licenses website, and are + // also the filenames of the licenses in licenseclassifier/licenses. AFL11 = "AFL-1.1" AFL12 = "AFL-1.2" AFL20 = "AFL-2.0" diff --git a/serializer/serializer_test.go b/serializer/serializer_test.go index 755c601..1bca727 100644 --- a/serializer/serializer_test.go +++ b/serializer/serializer_test.go @@ -213,7 +213,7 @@ func compareSearchSets(x, y *searchset.SearchSet) error { return fmt.Errorf("Hash keys differ = %d vs %d", xKeys[i], yKeys[i]) } if !reflect.DeepEqual(x.Hashes[xKeys[i]], y.Hashes[yKeys[i]]) { - return fmt.Errorf("Hash values differ = %v vs %v", x.Hashes[xKeys[i]], y.Hashes[yKeys[i]]) + return fmt.Errorf("Hash values differ = %d vs %d", x.Hashes[xKeys[i]], y.Hashes[yKeys[i]]) } } diff --git a/tools/identify_license/bench_test.go b/tools/identify_license/bench_test.go new file mode 100644 index 0000000..1320a87 --- /dev/null +++ b/tools/identify_license/bench_test.go @@ -0,0 +1,96 @@ +// Copyright 2017 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The identify_license program tries to identify the license type of an +// unknown license. The file containing the license text is specified on the +// command line. Multiple license files can be analyzed with a single command. +// The type of the license is returned along with the confidence level of the +// match. The confidence level is between 0.0 and 1.0, with 1.0 indicating an +// exact match and 0.0 indicating a complete mismatch. The results are sorted +// by confidence level. +// +// $ identifylicense LICENSE1 LICENSE2 +// LICENSE2: MIT (confidence: 0.987) +// LICENSE1: BSD-2-Clause (confidence: 0.833) + +package bench_test + +import ( + "context" + "fmt" + "log" + "os" + "path/filepath" + "sort" + "testing" + "time" + + "github.com/google/licenseclassifier" + "github.com/google/licenseclassifier/tools/identify_license/backend" + "google3/base/go/flag" +) + +var ( + headers = flag.Bool("headers", false, "match license headers") + forbiddenOnly = flag.Bool("forbidden", false, "identify using forbidden licenses archive") + threshold = flag.Float64("threshold", licenseclassifier.DefaultConfidenceThreshold, "confidence threshold") + timeout = flag.Duration("timeout", 24*time.Hour, "timeout before giving up on classifying a file.") +) + +func init() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, `Usage: %s <licensefile> ... + +Identify an unknown license. + +Options: +`, filepath.Base(os.Args[0])) + flag.PrintDefaults() + } +} + +func BenchmarkIdentifyLicense(b *testing.B) { + be, err := backend.New(*threshold, *forbiddenOnly) + if err != nil { + be.Close() + log.Fatalf("cannot create license classifier: %v", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), *timeout) + defer func() { + b.StopTimer() + cancel() + }() + b.StartTimer() + if errs := be.ClassifyLicensesWithContext(ctx, flag.Args(), *headers); errs != nil { + be.Close() + for _, err := range errs { + log.Printf("classify license failed: %v", err) + } + log.Fatal("cannot classify licenses") + } + + results := be.GetResults() + if len(results) == 0 { + be.Close() + log.Fatal("Couldn't classify license(s)") + } + + sort.Sort(results) + for _, r := range results { + fmt.Printf("%s: %s (confidence: %v, offset: %v, extent: %v)\n", + r.Filename, r.Name, r.Confidence, r.Offset, r.Extent) + } + be.Close() +} |