diff options
-rw-r--r-- | classifier.go | 2 | ||||
-rw-r--r-- | classifier_test.go | 1 | ||||
-rw-r--r-- | forbidden.go | 1 | ||||
-rw-r--r-- | license_type.go | 1 | ||||
-rw-r--r-- | tools/identify_license/backend/backend.go | 143 | ||||
-rw-r--r-- | tools/identify_license/identify_license.go | 120 | ||||
-rw-r--r-- | tools/identify_license/results/results.go | 42 |
7 files changed, 206 insertions, 104 deletions
diff --git a/classifier.go b/classifier.go index e98f97c..d8a5480 100644 --- a/classifier.go +++ b/classifier.go @@ -35,7 +35,7 @@ import ( ) // DefaultConfidenceThreshold is the minimum confidence percentage we're willing to accept in order -// to say that a match is good. http://go/license-classifier-conf-threshold +// to say that a match is good. const DefaultConfidenceThreshold = 0.80 var ( diff --git a/classifier_test.go b/classifier_test.go index aac9c5f..5e5f635 100644 --- a/classifier_test.go +++ b/classifier_test.go @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + package licenseclassifier import ( diff --git a/forbidden.go b/forbidden.go index 7719bae..b26e8b6 100644 --- a/forbidden.go +++ b/forbidden.go @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + package licenseclassifier import "regexp" diff --git a/license_type.go b/license_type.go index a98e153..f0070c3 100644 --- a/license_type.go +++ b/license_type.go @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + package licenseclassifier // *** NOTE: Update this file when adding a new license. You need to: diff --git a/tools/identify_license/backend/backend.go b/tools/identify_license/backend/backend.go new file mode 100644 index 0000000..587bc16 --- /dev/null +++ b/tools/identify_license/backend/backend.go @@ -0,0 +1,143 @@ +// Copyright 2017 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package backend contains the necessary functions to classify a license.
+package backend
+
+import (
+	"fmt"
+	"io/ioutil"
+	"log"
+	"sync"
+	"time"
+
+	"github.com/google/licenseclassifier"
+	"github.com/google/licenseclassifier/internal/commentparser"
+	"github.com/google/licenseclassifier/internal/commentparser/language"
+	"github.com/google/licenseclassifier/tools/identify_license/results"
+)
+
+// ClassifierInterface is the interface each backend must implement.
+type ClassifierInterface interface {
+	Close()
+	ClassifyLicenses(filenames []string, headers bool) []error
+	GetResults() results.LicenseTypes
+}
+
+// ClassifierBackend is the local backend: it classifies files and accumulates the matches.
+type ClassifierBackend struct {
+	results    results.LicenseTypes       // matches appended by classifyLicense, returned by GetResults
+	mu         sync.Mutex                 // held around every append to results
+	classifier *licenseclassifier.License // set once in New; used for MultipleMatch
+}
+
+// New creates a backend whose classifier is built with threshold; forbiddenOnly selects NewWithForbiddenLicenses.
+func New(threshold float64, forbiddenOnly bool) (*ClassifierBackend, error) {
+	var lc *licenseclassifier.License
+	var err error
+	if forbiddenOnly {
+		lc, err = licenseclassifier.NewWithForbiddenLicenses(threshold)
+	} else {
+		lc, err = licenseclassifier.New(threshold)
+	}
+	if err != nil {
+		return nil, err
+	}
+	return &ClassifierBackend{classifier: lc}, nil
+}
+
+// Close does nothing here since there's nothing to close.
+func (b *ClassifierBackend) Close() {
+}
+
+// ClassifyLicenses runs the license classifier over the given files and returns the errors encountered.
+func (b *ClassifierBackend) ClassifyLicenses(filenames []string, headers bool) (errors []error) {
+	// task is a pool of numTasks tokens; a goroutine is started only after taking one, because the
+	// OS limits the number of files that can be open at any one time.
+	const numTasks = 1000
+	task := make(chan bool, numTasks)
+	for i := 0; i < numTasks; i++ {
+		task <- true
+	}
+
+	errs := make(chan error, len(filenames)) // one slot per file, so workers never block on send
+
+	var wg sync.WaitGroup
+	analyze := func(filename string) {
+		defer func() {
+			task <- true // hand the token back first; the buffer always has room for it
+			wg.Done()    // must come last: task is closed as soon as wg.Wait returns
+		}()
+		if err := b.classifyLicense(filename, headers); err != nil {
+			errs <- err
+		}
+	}
+
+	for _, filename := range filenames {
+		wg.Add(1)
+		<-task // blocks while numTasks classifications are already in flight
+		go analyze(filename)
+	}
+	go func() {
+		wg.Wait()
+		close(task)
+		close(errs)
+	}()
+
+	for err := range errs { // ends once the goroutine above closes errs
+		errors = append(errors, err)
+	}
+	return errors
+}
+
+// classifyLicense runs on its own goroutine (started by ClassifyLicenses) and records every
+// match found in filename; it returns an error only when the file cannot be read.
+func (b *ClassifierBackend) classifyLicense(filename string, headers bool) error {
+	contents, err := ioutil.ReadFile(filename)
+	if err != nil {
+		return fmt.Errorf("unable to read %q: %v", filename, err)
+	}
+
+	matchLoop := func(contents string) {
+		for _, m := range b.classifier.MultipleMatch(contents, headers) {
+			b.mu.Lock() // results is shared by all worker goroutines
+			b.results = append(b.results, &results.LicenseType{
+				Filename:   filename,
+				Name:       m.Name,
+				Confidence: m.Confidence,
+				Offset:     m.Offset,
+				Extent:     m.Extent,
+			})
+			b.mu.Unlock()
+		}
+	}
+
+	log.Printf("Classifying license(s): %s", filename)
+	start := time.Now()
+	if lang := language.ClassifyLanguage(filename); lang == language.Unknown {
+		matchLoop(string(contents)) // language.Unknown: match against the whole file
+	} else {
+		comments := commentparser.Parse(contents, lang)
+		for ch := range comments.ChunkIterator() { // otherwise match each parsed comment chunk
+			matchLoop(ch.String())
+		}
+	}
+	log.Printf("Finished Classifying License %q: %v", filename, time.Since(start))
+	return nil
+}
+
+// GetResults returns the results of the classifications.
+func (b *ClassifierBackend) GetResults() results.LicenseTypes { + return b.results +} diff --git a/tools/identify_license/identify_license.go b/tools/identify_license/identify_license.go index c00587a..bf6c3e8 100644 --- a/tools/identify_license/identify_license.go +++ b/tools/identify_license/identify_license.go @@ -28,48 +28,21 @@ package main import ( "flag" "fmt" - "io/ioutil" "log" "os" "path/filepath" "sort" - "sync" - "time" "github.com/google/licenseclassifier" - "github.com/google/licenseclassifier/internal/commentparser" - "github.com/google/licenseclassifier/internal/commentparser/language" + "github.com/google/licenseclassifier/tools/identify_license/backend" ) var ( + headers = flag.Bool("headers", false, "match license headers") forbiddenOnly = flag.Bool("forbidden", false, "identify using forbidden licenses archive") threshold = flag.Float64("threshold", licenseclassifier.DefaultConfidenceThreshold, "confidence threshold") - headers = flag.Bool("headers", false, "match license headers") ) -// licenseType is the assumed type of the unknown license. -type licenseType struct { - filename string - name string - confidence float64 - offset int - extent int -} - -type licenseTypes []*licenseType - -func (lt licenseTypes) Len() int { return len(lt) } -func (lt licenseTypes) Swap(i, j int) { lt[i], lt[j] = lt[j], lt[i] } -func (lt licenseTypes) Less(i, j int) bool { - if lt[i].confidence > lt[j].confidence { - return true - } - if lt[i].confidence < lt[j].confidence { - return false - } - return lt[i].filename < lt[j].filename -} - func init() { flag.Usage = func() { fmt.Fprintf(os.Stderr, `Usage: %s <licensefile> ... 
@@ -85,89 +58,30 @@ Options: func main() { flag.Parse() - var lc *licenseclassifier.License - var err error - if *forbiddenOnly { - lc, err = licenseclassifier.NewWithForbiddenLicenses(*threshold) - } else { - lc, err = licenseclassifier.New(*threshold) - } + be, err := backend.New(*threshold, *forbiddenOnly) if err != nil { + be.Close() log.Fatalf("cannot create license classifier: %v", err) } - var mu sync.Mutex - var matches licenseTypes - - // Create a pool from which tasks can later be started. We use a pool because the OS limits - // the number of files that can be open at one time. - const numTasks = 1000 - task := make(chan bool, numTasks) - for i := 0; i < numTasks; i++ { - task <- true - } - - var wg sync.WaitGroup - classifyLicense := func(filename string) { - defer func() { - wg.Done() - task <- true - }() - - contents, err := ioutil.ReadFile(filename) - if err != nil { - log.Fatalf("cannot read %q: %v", filename, err) - return + if errs := be.ClassifyLicenses(flag.Args(), *headers); errs != nil { + be.Close() + for _, err := range errs { + log.Printf("classify license failed: %v", err) } - - start := time.Now() - if lang := language.ClassifyLanguage(filename); lang == language.Unknown { - log.Printf("Classifying license(s): %s", filename) - for _, m := range lc.MultipleMatch(string(contents), *headers) { - mu.Lock() - matches = append(matches, &licenseType{ - filename: filename, - name: m.Name, - confidence: m.Confidence, - offset: m.Offset, - extent: m.Extent, - }) - mu.Unlock() - } - } else { - comments := commentparser.Parse(contents, lang) - for ch := range comments.ChunkIterator() { - for _, m := range lc.MultipleMatch(ch.String(), *headers) { - mu.Lock() - matches = append(matches, &licenseType{ - filename: filename, - name: m.Name, - confidence: m.Confidence, - offset: m.Offset, - extent: m.Extent, - }) - mu.Unlock() - } - } - } - - log.Printf("Finished Classifying License %q: %v", filename, time.Since(start)) - } - - for _, unknown := range 
flag.Args() { - wg.Add(1) - <-task - go classifyLicense(unknown) + log.Fatal("cannot classify licenses") } - wg.Wait() - if len(matches) == 0 { - log.Fatalf("Couldn't classify license(s)") + results := be.GetResults() + if len(results) == 0 { + be.Close() + log.Fatal("Couldn't classify license(s)") } - sort.Sort(matches) - for _, r := range matches { + sort.Sort(results) + for _, r := range results { fmt.Printf("%s: %s (confidence: %v, offset: %v, extent: %v)\n", - r.filename, r.name, r.confidence, r.offset, r.extent) + r.Filename, r.Name, r.Confidence, r.Offset, r.Extent) } + be.Close() } diff --git a/tools/identify_license/results/results.go b/tools/identify_license/results/results.go new file mode 100644 index 0000000..7e258ea --- /dev/null +++ b/tools/identify_license/results/results.go @@ -0,0 +1,42 @@ +// Copyright 2017 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package results contains the result type returned by the classifier backend. +// Placing the type into a separate module allows us to swap out backends and +// still use the same datatype. +package results + +// LicenseType is the assumed type of the unknown license. +type LicenseType struct { + Filename string + Name string + Confidence float64 + Offset int + Extent int +} + +// LicenseTypes is a list of LicenseType objects. 
+type LicenseTypes []*LicenseType
+
+func (lt LicenseTypes) Len() int { return len(lt) } // number of entries
+func (lt LicenseTypes) Swap(i, j int) { lt[i], lt[j] = lt[j], lt[i] } // exchanges entries i and j
+func (lt LicenseTypes) Less(i, j int) bool { // orders by descending Confidence, then ascending Filename
+	if lt[i].Confidence > lt[j].Confidence {
+		return true // higher confidence sorts first
+	}
+	if lt[i].Confidence < lt[j].Confidence {
+		return false
+	}
+	return lt[i].Filename < lt[j].Filename // equal confidence: fall back to the file name
+} |