about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- classifier.go 2
-rw-r--r-- classifier_test.go 1
-rw-r--r-- forbidden.go 1
-rw-r--r-- license_type.go 1
-rw-r--r-- tools/identify_license/backend/backend.go 143
-rw-r--r-- tools/identify_license/identify_license.go 120
-rw-r--r-- tools/identify_license/results/results.go 42
7 files changed, 206 insertions, 104 deletions
diff --git a/classifier.go b/classifier.go
index e98f97c..d8a5480 100644
--- a/classifier.go
+++ b/classifier.go
@@ -35,7 +35,7 @@ import (
)
// DefaultConfidenceThreshold is the minimum confidence percentage we're willing to accept in order
-// to say that a match is good. http://go/license-classifier-conf-threshold
+// to say that a match is good.
const DefaultConfidenceThreshold = 0.80
var (
diff --git a/classifier_test.go b/classifier_test.go
index aac9c5f..5e5f635 100644
--- a/classifier_test.go
+++ b/classifier_test.go
@@ -11,6 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+
package licenseclassifier
import (
diff --git a/forbidden.go b/forbidden.go
index 7719bae..b26e8b6 100644
--- a/forbidden.go
+++ b/forbidden.go
@@ -11,6 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+
package licenseclassifier
import "regexp"
diff --git a/license_type.go b/license_type.go
index a98e153..f0070c3 100644
--- a/license_type.go
+++ b/license_type.go
@@ -11,6 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+
package licenseclassifier
// *** NOTE: Update this file when adding a new license. You need to:
diff --git a/tools/identify_license/backend/backend.go b/tools/identify_license/backend/backend.go
new file mode 100644
index 0000000..587bc16
--- /dev/null
+++ b/tools/identify_license/backend/backend.go
@@ -0,0 +1,143 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package backend contains the necessary functions to classify a license.
+package backend
+
+import (
+ "fmt"
+ "io/ioutil"
+ "log"
+ "sync"
+ "time"
+
+ "github.com/google/licenseclassifier"
+ "github.com/google/licenseclassifier/internal/commentparser"
+ "github.com/google/licenseclassifier/internal/commentparser/language"
+ "github.com/google/licenseclassifier/tools/identify_license/results"
+)
+
+// ClassifierInterface is the interface each backend must implement.
+type ClassifierInterface interface {
+ // Close is called by the caller once it is finished with the backend.
+ Close()
+ // ClassifyLicenses classifies the named files and returns the errors
+ // encountered while doing so. headers selects license-header matching.
+ ClassifyLicenses(filenames []string, headers bool) []error
+ // GetResults returns the classifications collected by the backend.
+ GetResults() results.LicenseTypes
+}
+
+// ClassifierBackend is an object that handles classifying a license.
+// It contains a sync.Mutex, so it is used through a pointer (New returns one).
+type ClassifierBackend struct {
+ // results accumulates every match recorded by classifyLicense.
+ results results.LicenseTypes
+ // mu guards the appends to results made while files are classified
+ // concurrently.
+ mu sync.Mutex
+ // classifier is the license classifier created in New.
+ classifier *licenseclassifier.License
+}
+
+// New creates a ClassifierBackend that works on the local filesystem.
+//
+// threshold is handed to the licenseclassifier constructor. When
+// forbiddenOnly is true the classifier is built with
+// licenseclassifier.NewWithForbiddenLicenses, otherwise with
+// licenseclassifier.New. A constructor error is returned unchanged together
+// with a nil backend.
+func New(threshold float64, forbiddenOnly bool) (*ClassifierBackend, error) {
+	var (
+		classifier *licenseclassifier.License
+		err        error
+	)
+	switch {
+	case forbiddenOnly:
+		classifier, err = licenseclassifier.NewWithForbiddenLicenses(threshold)
+	default:
+		classifier, err = licenseclassifier.New(threshold)
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	backend := &ClassifierBackend{classifier: classifier}
+	return backend, nil
+}
+
+// Close does nothing here since there's nothing to close. It is present
+// because ClassifierInterface requires a Close method.
+func (b *ClassifierBackend) Close() {
+}
+
+// ClassifyLicenses runs the license classifier over the given files and
+// returns every error encountered, or nil when all files were processed.
+//
+// Files are classified concurrently with at most numTasks in flight, because
+// the OS limits the number of files that can be open at any one time. headers
+// is passed through to classifyLicense for each file. Matches are stored on
+// the backend and can be read with GetResults once this method has returned.
+func (b *ClassifierBackend) ClassifyLicenses(filenames []string, headers bool) (errors []error) {
+	const numTasks = 1000
+
+	// slots is a counting semaphore: a worker holds one slot while it runs.
+	// It is never closed, so a worker that releases its slot late can never
+	// hit a "send on closed channel" panic.
+	slots := make(chan struct{}, numTasks)
+
+	// Sized so that every worker can report an error without blocking.
+	failures := make(chan error, len(filenames))
+
+	var wg sync.WaitGroup
+	for _, filename := range filenames {
+		slots <- struct{}{} // Blocks while numTasks workers are running.
+		wg.Add(1)
+		go func(filename string) {
+			// Deferred calls run last-in first-out, so the slot is
+			// released before the WaitGroup is signalled.
+			defer wg.Done()
+			defer func() { <-slots }()
+			if err := b.classifyLicense(filename, headers); err != nil {
+				failures <- err
+			}
+		}(filename)
+	}
+
+	// All sends to failures have completed once Wait returns, so closing it
+	// here is safe and lets the range below terminate.
+	wg.Wait()
+	close(failures)
+
+	for err := range failures {
+		errors = append(errors, err)
+	}
+	return errors
+}
+
+// classifyLicense classifies a single file and appends every match to
+// b.results. ClassifyLicenses runs it in a separate goroutine per file.
+//
+// A file whose language is language.Unknown is matched as a whole; for any
+// other language only the comment chunks produced by commentparser are
+// matched. An error is returned only when the file cannot be read.
+func (b *ClassifierBackend) classifyLicense(filename string, headers bool) error {
+	data, err := ioutil.ReadFile(filename)
+	if err != nil {
+		return fmt.Errorf("unable to read %q: %v", filename, err)
+	}
+
+	// record runs the classifier over text and stores each match. The mutex
+	// is taken per append because other files are classified concurrently.
+	record := func(text string) {
+		matches := b.classifier.MultipleMatch(text, headers)
+		for _, m := range matches {
+			entry := &results.LicenseType{
+				Filename:   filename,
+				Name:       m.Name,
+				Confidence: m.Confidence,
+				Offset:     m.Offset,
+				Extent:     m.Extent,
+			}
+			b.mu.Lock()
+			b.results = append(b.results, entry)
+			b.mu.Unlock()
+		}
+	}
+
+	log.Printf("Classifying license(s): %s", filename)
+	began := time.Now()
+	lang := language.ClassifyLanguage(filename)
+	if lang != language.Unknown {
+		parsed := commentparser.Parse(data, lang)
+		for chunk := range parsed.ChunkIterator() {
+			record(chunk.String())
+		}
+	} else {
+		record(string(data))
+	}
+	log.Printf("Finished Classifying License %q: %v", filename, time.Since(began))
+	return nil
+}
+
+// GetResults returns the results of the classifications.
+// The backend's slice is returned directly: it is not copied and b.mu is not
+// taken here.
+// NOTE(review): presumably meant to be called only after ClassifyLicenses has
+// returned — confirm with callers.
+func (b *ClassifierBackend) GetResults() results.LicenseTypes {
+ return b.results
+}
diff --git a/tools/identify_license/identify_license.go b/tools/identify_license/identify_license.go
index c00587a..bf6c3e8 100644
--- a/tools/identify_license/identify_license.go
+++ b/tools/identify_license/identify_license.go
@@ -28,48 +28,21 @@ package main
import (
"flag"
"fmt"
- "io/ioutil"
"log"
"os"
"path/filepath"
"sort"
- "sync"
- "time"
"github.com/google/licenseclassifier"
- "github.com/google/licenseclassifier/internal/commentparser"
- "github.com/google/licenseclassifier/internal/commentparser/language"
+ "github.com/google/licenseclassifier/tools/identify_license/backend"
)
var (
+ headers = flag.Bool("headers", false, "match license headers")
forbiddenOnly = flag.Bool("forbidden", false, "identify using forbidden licenses archive")
threshold = flag.Float64("threshold", licenseclassifier.DefaultConfidenceThreshold, "confidence threshold")
- headers = flag.Bool("headers", false, "match license headers")
)
-// licenseType is the assumed type of the unknown license.
-type licenseType struct {
- filename string
- name string
- confidence float64
- offset int
- extent int
-}
-
-type licenseTypes []*licenseType
-
-func (lt licenseTypes) Len() int { return len(lt) }
-func (lt licenseTypes) Swap(i, j int) { lt[i], lt[j] = lt[j], lt[i] }
-func (lt licenseTypes) Less(i, j int) bool {
- if lt[i].confidence > lt[j].confidence {
- return true
- }
- if lt[i].confidence < lt[j].confidence {
- return false
- }
- return lt[i].filename < lt[j].filename
-}
-
func init() {
flag.Usage = func() {
fmt.Fprintf(os.Stderr, `Usage: %s <licensefile> ...
@@ -85,89 +58,30 @@ Options:
func main() {
flag.Parse()
- var lc *licenseclassifier.License
- var err error
- if *forbiddenOnly {
- lc, err = licenseclassifier.NewWithForbiddenLicenses(*threshold)
- } else {
- lc, err = licenseclassifier.New(*threshold)
- }
+ // Classification is delegated to the backend package.
+ be, err := backend.New(*threshold, *forbiddenOnly)
if err != nil {
+ // backend.New returns a nil backend on failure, so there is nothing to close.
log.Fatalf("cannot create license classifier: %v", err)
}
- var mu sync.Mutex
- var matches licenseTypes
-
- // Create a pool from which tasks can later be started. We use a pool because the OS limits
- // the number of files that can be open at one time.
- const numTasks = 1000
- task := make(chan bool, numTasks)
- for i := 0; i < numTasks; i++ {
- task <- true
- }
-
- var wg sync.WaitGroup
- classifyLicense := func(filename string) {
- defer func() {
- wg.Done()
- task <- true
- }()
-
- contents, err := ioutil.ReadFile(filename)
- if err != nil {
- log.Fatalf("cannot read %q: %v", filename, err)
- return
+ // log.Fatal exits without running deferred calls, so the backend is closed
+ // explicitly before every exit below.
+ if errs := be.ClassifyLicenses(flag.Args(), *headers); errs != nil {
+ be.Close()
+ for _, err := range errs {
+ log.Printf("classify license failed: %v", err)
}
-
- start := time.Now()
- if lang := language.ClassifyLanguage(filename); lang == language.Unknown {
- log.Printf("Classifying license(s): %s", filename)
- for _, m := range lc.MultipleMatch(string(contents), *headers) {
- mu.Lock()
- matches = append(matches, &licenseType{
- filename: filename,
- name: m.Name,
- confidence: m.Confidence,
- offset: m.Offset,
- extent: m.Extent,
- })
- mu.Unlock()
- }
- } else {
- comments := commentparser.Parse(contents, lang)
- for ch := range comments.ChunkIterator() {
- for _, m := range lc.MultipleMatch(ch.String(), *headers) {
- mu.Lock()
- matches = append(matches, &licenseType{
- filename: filename,
- name: m.Name,
- confidence: m.Confidence,
- offset: m.Offset,
- extent: m.Extent,
- })
- mu.Unlock()
- }
- }
- }
-
- log.Printf("Finished Classifying License %q: %v", filename, time.Since(start))
- }
-
- for _, unknown := range flag.Args() {
- wg.Add(1)
- <-task
- go classifyLicense(unknown)
+ log.Fatal("cannot classify licenses")
}
- wg.Wait()
- if len(matches) == 0 {
- log.Fatalf("Couldn't classify license(s)")
+ results := be.GetResults()
+ if len(results) == 0 {
+ be.Close()
+ log.Fatal("Couldn't classify license(s)")
}
- sort.Sort(matches)
- for _, r := range matches {
+ sort.Sort(results)
+ for _, r := range results {
fmt.Printf("%s: %s (confidence: %v, offset: %v, extent: %v)\n",
- r.filename, r.name, r.confidence, r.offset, r.extent)
+ r.Filename, r.Name, r.Confidence, r.Offset, r.Extent)
}
+ be.Close()
}
diff --git a/tools/identify_license/results/results.go b/tools/identify_license/results/results.go
new file mode 100644
index 0000000..7e258ea
--- /dev/null
+++ b/tools/identify_license/results/results.go
@@ -0,0 +1,42 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package results contains the result type returned by the classifier backend.
+// Placing the type into a separate package allows us to swap out backends and
+// still use the same datatype.
+package results
+
+// LicenseType is the assumed type of the unknown license: one classifier
+// match for one file.
+type LicenseType struct {
+	// Filename is the file the match was found in.
+	Filename string
+	// Name is the name of the matched license.
+	Name string
+	// Confidence is the classifier's confidence in the match.
+	Confidence float64
+	// Offset and Extent are copied from the classifier's match.
+	// NOTE(review): presumably the position and length of the matched
+	// text — confirm against the licenseclassifier package.
+	Offset int
+	Extent int
+}
+
+// LicenseTypes is a list of LicenseType objects. It provides Len, Swap and
+// Less so that it can be passed to sort.Sort.
+type LicenseTypes []*LicenseType
+
+// Len returns the number of entries in the list.
+func (lt LicenseTypes) Len() int {
+	return len(lt)
+}
+
+// Swap exchanges the entries at indices i and j.
+func (lt LicenseTypes) Swap(i, j int) {
+	lt[j], lt[i] = lt[i], lt[j]
+}
+
+// Less reports whether entry i sorts before entry j: higher confidence comes
+// first, and entries that compare neither higher nor lower are ordered by
+// ascending filename.
+func (lt LicenseTypes) Less(i, j int) bool {
+	left, right := lt[i], lt[j]
+	switch {
+	case left.Confidence > right.Confidence:
+		return true
+	case left.Confidence < right.Confidence:
+		return false
+	default:
+		return left.Filename < right.Filename
+	}
+}