diff options
author | Bill Neubauer <wcn@google.com> | 2022-03-21 12:12:34 -0700 |
---|---|---|
committer | Bill Neubauer <wcn@google.com> | 2022-03-21 12:12:34 -0700 |
commit | a6f4b9f7a6aafe35aefa57e97567f4de44972f46 (patch) | |
tree | 70bd0fd1421edf8576993ab4f6bcae9ee5883a4f | |
parent | 0f3a421c4dc74c9c20185eb6063d7cfa59715a10 (diff) | |
download | licenseclassifier-a6f4b9f7a6aafe35aefa57e97567f4de44972f46.tar.gz |
Add v2 versions of identify_license packages
This uses Go 1.16 embed features, so it raises the minimum supported Go
version for the licenseclassifier from 1.15 to 1.16.
-rw-r--r-- | go.mod | 6 | ||||
-rw-r--r-- | go.sum | 16 | ||||
-rw-r--r-- | tools/identify_license/backend/v2/backend.go | 166 | ||||
-rw-r--r-- | tools/identify_license/backend/v2/go.mod | 10 | ||||
-rw-r--r-- | tools/identify_license/backend/v2/go.sum | 17 | ||||
-rw-r--r-- | tools/identify_license/results/v2/go.mod | 3 | ||||
-rw-r--r-- | tools/identify_license/results/v2/results.go | 142 | ||||
-rw-r--r-- | tools/identify_license/v2/go.mod | 11 | ||||
-rw-r--r-- | tools/identify_license/v2/go.sum | 17 | ||||
-rwxr-xr-x | tools/identify_license/v2/identify_license | bin | 0 -> 5310636 bytes | |||
-rw-r--r-- | tools/identify_license/v2/identify_license.go | 204 | ||||
-rw-r--r-- | v2/assets/embed.go | 54 | ||||
-rw-r--r-- | v2/go.mod | 2 |
13 files changed, 644 insertions, 4 deletions
@@ -3,7 +3,7 @@ module github.com/google/licenseclassifier go 1.16 require ( - github.com/google/go-cmp v0.2.0 - github.com/sergi/go-diff v1.0.0 - github.com/stretchr/testify v1.3.0 // indirect + github.com/google/go-cmp v0.5.2 + github.com/google/licenseclassifier/v2 v2.0.0-alpha.1 // indirect + github.com/sergi/go-diff v1.1.0 ) @@ -1,11 +1,27 @@ github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/google/go-cmp v0.2.0 h1:+dTQ8DZQJz0Mb/HjFlkptS1FeQ4cWSnN941F8aEG4SQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/licenseclassifier/v2 v2.0.0-alpha.1 h1:E0HY5OuFS3CQoVFAr1dabMFm4PyjNMbIB1zYulfwnRI= +github.com/google/licenseclassifier/v2 v2.0.0-alpha.1/go.mod h1:YAgBGGTeNDMU+WfIgaFvjZe4rudym4f6nIn8ZH5X+VM= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/sergi/go-diff v1.0.0 h1:Kpca3qRNrduNnOQeazBd0ysaKrUJiIuISHxogkT9RPQ= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= +github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= +github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/stretchr/objx v0.1.0/go.mod 
h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/tools/identify_license/backend/v2/backend.go b/tools/identify_license/backend/v2/backend.go new file mode 100644 index 0000000..5e11d78 --- /dev/null +++ b/tools/identify_license/backend/v2/backend.go @@ -0,0 +1,166 @@ +// Copyright 2017 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package backend contains the necessary functions to classify a license. 
+package backend + +import ( + "context" + "fmt" + "io/ioutil" + "log" + "sync" + "time" + + //gc "google3/devtools/compliance/common/licenseclassifier/classifier" + + "github.com/google/licenseclassifier/tools/identify_license/results/v2" + classifier "github.com/google/licenseclassifier/v2" + "github.com/google/licenseclassifier/v2/assets" +) + +// ClassifierInterface is the interface each backend must implement. +type ClassifierInterface interface { + Close() + SetTraceConfiguration(tc *classifier.TraceConfiguration) + ClassifyLicenses(numTasks int, filenames []string, headers bool) []error + ClassifyLicensesWithContext(ctx context.Context, numTasks int, filenames []string, headers bool) []error + GetResults() results.LicenseTypes +} + +// ClassifierBackend is an object that handles classifying a license. +type ClassifierBackend struct { + results results.LicenseTypes + mu sync.Mutex + classifier *classifier.Classifier +} + +// New creates a new backend working on the local filesystem. +func New() (*ClassifierBackend, error) { + _, err := assets.ReadLicenseDir() + if err != nil { + return nil, err + } + lc, err := assets.DefaultClassifier() + if err != nil { + return nil, err + } + return &ClassifierBackend{classifier: lc}, nil +} + +// Close does nothing here since there's nothing to close. +func (b *ClassifierBackend) Close() { +} + +// SetTraceConfiguration injects the supplied trace configuration +func (b *ClassifierBackend) SetTraceConfiguration(tc *classifier.TraceConfiguration) { + //b.classifier.SetTraceConfiguration((*gc.TraceConfiguration)(tc)) +} + +// ClassifyLicenses runs the license classifier over the given file. +func (b *ClassifierBackend) ClassifyLicenses(numTasks int, filenames []string, headers bool) (errors []error) { + // Create a pool from which tasks can later be started. We use a pool because the OS limits + // the number of files that can be open at any one time. 
+ task := make(chan bool, numTasks) + for i := 0; i < numTasks; i++ { + task <- true + } + + errs := make(chan error, len(filenames)) + + var wg sync.WaitGroup + analyze := func(filename string) { + defer func() { + wg.Done() + task <- true + }() + if err := b.classifyLicense(filename, headers); err != nil { + errs <- err + } + } + + for _, filename := range filenames { + wg.Add(1) + <-task + go analyze(filename) + } + go func() { + wg.Wait() + close(task) + close(errs) + }() + + for err := range errs { + errors = append(errors, err) + } + return errors +} + +// ClassifyLicensesWithContext runs the license classifier over the given file; ensure that it will respect the timeout in the provided context. +func (b *ClassifierBackend) ClassifyLicensesWithContext(ctx context.Context, numTasks int, filenames []string, headers bool) (errors []error) { + done := make(chan bool) + go func() { + errors = b.ClassifyLicenses(numTasks, filenames, headers) + done <- true + }() + select { + case <-ctx.Done(): + err := ctx.Err() + errors = append(errors, err) + return errors + case <-done: + return errors + } +} + +// classifyLicense is called by a Go-function to perform the actual +// classification of a license. 
+func (b *ClassifierBackend) classifyLicense(filename string, headers bool) error { + contents, err := ioutil.ReadFile(filename) + if err != nil { + return fmt.Errorf("unable to read %q: %v", filename, err) + } + + matchLoop := func(contents []byte) { + for _, m := range b.classifier.Match(contents).Matches { + // If not looking for headers, skip them + if !headers && m.MatchType == "Header" { + continue + } + + b.mu.Lock() + b.results = append(b.results, &results.LicenseType{ + Filename: filename, + MatchType: m.MatchType, + Name: m.Name, + Variant: m.Variant, + Confidence: m.Confidence, + StartLine: m.StartLine, + EndLine: m.EndLine, + }) + b.mu.Unlock() + } + } + + log.Printf("Classifying license(s): %s", filename) + start := time.Now() + matchLoop(contents) + log.Printf("Finished Classifying License %q: %v", filename, time.Since(start)) + return nil +} + +// GetResults returns the results of the classifications. +func (b *ClassifierBackend) GetResults() results.LicenseTypes { + return b.results +} diff --git a/tools/identify_license/backend/v2/go.mod b/tools/identify_license/backend/v2/go.mod new file mode 100644 index 0000000..9b73b94 --- /dev/null +++ b/tools/identify_license/backend/v2/go.mod @@ -0,0 +1,10 @@ +module github.com/google/licenseclassifier/tools/identify_license/backend/v2 + +go 1.18 + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/google/licenseclassifier/tools/identify_license/results/v2 v2.0.0 // indirect + github.com/google/licenseclassifier/v2 v2.0.0 // indirect + github.com/sergi/go-diff v1.1.0 // indirect +) diff --git a/tools/identify_license/backend/v2/go.sum b/tools/identify_license/backend/v2/go.sum new file mode 100644 index 0000000..72fe660 --- /dev/null +++ b/tools/identify_license/backend/v2/go.sum @@ -0,0 +1,17 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= +github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/tools/identify_license/results/v2/go.mod b/tools/identify_license/results/v2/go.mod new file mode 100644 index 0000000..962b8e0 --- /dev/null +++ b/tools/identify_license/results/v2/go.mod @@ -0,0 +1,3 @@ +module github.com/google/licenseclassifier/tools/identify_license/results/v2 + +go 1.18 diff --git a/tools/identify_license/results/v2/results.go b/tools/identify_license/results/v2/results.go new file mode 100644 index 0000000..fd7c598 --- /dev/null +++ b/tools/identify_license/results/v2/results.go @@ -0,0 +1,142 @@ +// Copyright 2017 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package results contains the result type returned by the classifier backend. +// Placing the type into a separate module allows us to swap out backends and +// still use the same datatype. +package results + +import ( + "bufio" + "fmt" + "os" + "sort" +) + +// LicenseType is the assumed type of the unknown license. +type LicenseType struct { + Filename string + Name string + MatchType string + Variant string + Confidence float64 + StartLine int + EndLine int +} + +// LicenseTypes is a list of LicenseType objects. +type LicenseTypes []*LicenseType + +func (lt LicenseTypes) Len() int { return len(lt) } +func (lt LicenseTypes) Swap(i, j int) { lt[i], lt[j] = lt[j], lt[i] } +func (lt LicenseTypes) Less(i, j int) bool { + if lt[i].Confidence > lt[j].Confidence { + return true + } + if lt[i].Confidence < lt[j].Confidence { + return false + } + if lt[i].Filename < lt[j].Filename { + return true + } + if lt[i].Filename > lt[j].Filename { + return false + } + return lt[i].EndLine < lt[j].EndLine +} + +// Classification is the license classification for a segment of a file. +type Classification struct { + Name string + Confidence float64 + StartLine int + EndLine int + Text string `json:",omitempty"` +} + +// Classifications contains all license classifications for a file +type Classifications []*Classification + +// FileClassifications contains the license classifications for a particular file. 
+type FileClassifications struct { + Filepath string + Classifications Classifications +} + +//JSONResult is the format for the jr JSON file +type JSONResult []*FileClassifications + +func (jr JSONResult) Len() int { return len(jr) } +func (jr JSONResult) Swap(i, j int) { jr[i], jr[j] = jr[j], jr[i] } +func (jr JSONResult) Less(i, j int) bool { return jr[i].Filepath < jr[j].Filepath } + +// readFileLines will read a specified range of lines of a file +func readFileLines(filename string, startLine, endLine int) (string, error) { + f, err := os.Open(filename) + if err != nil { + return "", err + } + defer f.Close() + + scanner := bufio.NewScanner(f) + lines := "" + i := 0 + for scanner.Scan() { + i++ // lines are 1-indexed + if i < startLine { + continue + } else if i > endLine { + break + } + lines += scanner.Text() + "\n" + } + if i < endLine { + return "", fmt.Errorf( + "line %d was the last line read from file %s, but endLine was set to %d", i, filename, endLine) + } + return lines, nil +} + +// NewJSONResult creates a new JSONResult object from a LicenseTypes object. 
+func NewJSONResult(licenses LicenseTypes, includeText bool) (JSONResult, error) { + fMap := map[string]*FileClassifications{} + for _, l := range licenses { + currF, ok := fMap[l.Filename] + if !ok { + currF = &FileClassifications{Filepath: l.Filename} + fMap[l.Filename] = currF + } + c := &Classification{ + Name: l.Name, + Confidence: l.Confidence, + StartLine: l.StartLine, + EndLine: l.EndLine, + } + if includeText { + text, err := readFileLines(l.Filename, l.StartLine, l.EndLine) + if err != nil { + return nil, err + } + c.Text = text + } + currF.Classifications = append(currF.Classifications, c) + } + + jr := JSONResult{} + for _, fc := range fMap { + jr = append(jr, fc) + } + sort.Sort(jr) + return jr, nil +} diff --git a/tools/identify_license/v2/go.mod b/tools/identify_license/v2/go.mod new file mode 100644 index 0000000..a0b8e2e --- /dev/null +++ b/tools/identify_license/v2/go.mod @@ -0,0 +1,11 @@ +module github.com/google/licenseclassifier/tools/identify_license/v2 + +go 1.16 + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/google/licenseclassifier/tools/identify_license/backend/v2 v2.0.0 // indirect + github.com/google/licenseclassifier/tools/identify_license/results/v2 v2.0.0 // indirect + github.com/google/licenseclassifier/v2 v2.0.0 // indirect + github.com/sergi/go-diff v1.1.0 // indirect +) diff --git a/tools/identify_license/v2/go.sum b/tools/identify_license/v2/go.sum new file mode 100644 index 0000000..72fe660 --- /dev/null +++ b/tools/identify_license/v2/go.sum @@ -0,0 +1,17 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= 
+github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= +github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/tools/identify_license/v2/identify_license b/tools/identify_license/v2/identify_license Binary files differnew file mode 100755 index 0000000..c7ad3f7 --- /dev/null +++ b/tools/identify_license/v2/identify_license diff --git a/tools/identify_license/v2/identify_license.go b/tools/identify_license/v2/identify_license.go new file mode 100644 index 0000000..42d9b94 --- /dev/null +++ b/tools/identify_license/v2/identify_license.go @@ -0,0 +1,204 @@ +// Copyright 2017 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The identify_license program tries to identify the license type of an +// unknown license. The file containing the license text is specified on the +// command line. Multiple license files can be analyzed with a single command. +// The type of the license is returned along with the confidence level of the +// match. The confidence level is between 0.0 and 1.0, with 1.0 indicating an +// exact match and 0.0 indicating a complete mismatch. The results are sorted +// by confidence level. +// +// $ identifylicense <LICENSE_OR_DIRECTORY> <LICENSE_OR_DIRECTORY> ... +// LICENSE2: MIT (confidence: 0.987) +// LICENSE1: BSD-2-Clause (confidence: 0.833) +package main + +import ( + "context" + "encoding/json" + "flag" + "fmt" + "strings" + + //"google3/file/base/go/contrib/walk/walk" + //"google3/file/base/go/file" + "io/fs" + "io/ioutil" + "log" + "os" + "path/filepath" + "regexp" + "sort" + "time" + + "github.com/google/licenseclassifier/tools/identify_license/backend/v2" + "github.com/google/licenseclassifier/tools/identify_license/results/v2" + classifier "github.com/google/licenseclassifier/v2" +) + +var ( + headers = flag.Bool("headers", false, "match license headers") + jsonFname = flag.String("json", "", "filename to write JSON output to.") + includeText = flag.Bool("include_text", false, "include the license text in the JSON output") + numTasks = flag.Int("tasks", 1000, "the number of license scanning tasks running concurrently") + timeout = flag.Duration("timeout", 24*time.Hour, "timeout before giving up on classifying a file.") + tracePhases = 
flag.String("trace_phases", "", "comma-separated list of phases of the license classifier to trace") + traceLicenses = flag.String("trace_licenses", "", "comma-separated list of licenses for the license classifier to trace") + ignorePaths = flag.String("ignore_paths_re", "", "comma-separated list of regular expressions that match file paths to ignore") +) + +// expandFiles recursively returns a list of files stored in a list of +// directories. If an input is not a directory, it is added to the output list. +func expandFiles(ctx context.Context, paths []string) ([]string, error) { + var finalPaths []string + + ip, err := parseIgnorePaths() + if err != nil { + return nil, fmt.Errorf("could not parse ignore paths: %v", err) + } + + handleFile := func(path string) { + if shouldIgnore(ip, path) { + return + } + finalPaths = append(finalPaths, path) + } + + for _, p := range paths { + p, err := filepath.Abs(p) + if err != nil { + return nil, err + } + + err = filepath.Walk(p, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if info.IsDir() { + if shouldIgnore(ip, info.Name()) { + return fs.SkipDir + } + return nil // walk the directory + } + handleFile(path) + return nil + }) + if err != nil { + return nil, err + } + } + return finalPaths, nil +} + +func shouldIgnore(ignorePaths []*regexp.Regexp, path string) bool { + for _, r := range ignorePaths { + if exactRegexMatch(r, path) { + return true + } + } + return false +} + +func exactRegexMatch(r *regexp.Regexp, s string) bool { + m := r.FindStringIndex(s) + if m == nil { + return false + } + return (m[0] == 0) && (m[1] == len(s)) +} + +func parseIgnorePaths() (out []*regexp.Regexp, err error) { + for _, p := range strings.Split(*ignorePaths, ",") { + r, err := regexp.Compile(p) + if err != nil { + return nil, err + } + out = append(out, r) + } + return out, nil +} + +// outputJSON writes the output formatted as JSON to a file. 
+func outputJSON(filename *string, res results.LicenseTypes, includeText bool) error { + d, err := results.NewJSONResult(res, includeText) + if err != nil { + return err + } + fc, err := json.MarshalIndent(d, "", " ") + if err != nil { + return err + } + return ioutil.WriteFile(*filename, fc, 0644) +} + +func init() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, `Usage: %s <licensefile> ... + +Identify an unknown license. + +Options: +`, filepath.Base(os.Args[0])) + flag.PrintDefaults() + } +} + +func main() { + flag.Parse() + + be, err := backend.New() + if err != nil { + log.Fatalf("cannot create license classifier: %v", err) + } + + paths, err := expandFiles(context.Background(), flag.Args()) + defer be.Close() + be.SetTraceConfiguration( + &classifier.TraceConfiguration{ + TracePhases: *tracePhases, + TraceLicenses: *traceLicenses, + }) + + ctx, cancel := context.WithTimeout(context.Background(), *timeout) + defer cancel() + if errs := be.ClassifyLicensesWithContext(ctx, *numTasks, paths, *headers); errs != nil { + be.Close() + for _, err := range errs { + log.Printf("classify license failed: %v", err) + } + log.Fatal("cannot classify licenses") + } + + results := be.GetResults() + if len(results) == 0 { + log.Fatal("Couldn't classify license(s)") + } + + sort.Sort(results) + for _, r := range results { + name := r.Name + if r.MatchType != "License" && r.MatchType != "Header" { + name = fmt.Sprintf("%s:%s", r.MatchType, r.Name) + } + fmt.Printf("%s %s (variant: %v, confidence: %v, start: %v, end: %v)\n", + r.Filename, name, r.Variant, r.Confidence, r.StartLine, r.EndLine) + } + if len(*jsonFname) > 0 { + err = outputJSON(jsonFname, results, *includeText) + if err != nil { + log.Fatalf("Couldn't write JSON output to file %s: %v", *jsonFname, err) + } + } +} diff --git a/v2/assets/embed.go b/v2/assets/embed.go new file mode 100644 index 0000000..7a93a39 --- /dev/null +++ b/v2/assets/embed.go @@ -0,0 +1,54 @@ +package assets + +import ( + "embed" + "fmt" + 
"io/fs" + "os" + "strings" + + classifier "github.com/google/licenseclassifier/v2" +) + +//go:embed */*/* +var licenseFS embed.FS + +// DefaultClassifier returns a classifier loaded with the contents of the +// assets directory. +func DefaultClassifier() (*classifier.Classifier, error) { + c := classifier.NewClassifier(.8) + + err := fs.WalkDir(licenseFS, ".", func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + if d.IsDir() { + return nil + } + + b, err := licenseFS.ReadFile(path) + if err != nil { + return err + } + splits := strings.Split(path, fmt.Sprintf("%c", os.PathSeparator)) + category, name, variant := splits[0], splits[1], splits[2] + c.AddContent(category, name, variant, b) + return nil + }) + + if err != nil { + return nil, err + } + return c, nil + +} + +// ReadLicenseFile locates and reads the license archive file. Absolute paths are used unmodified. Relative paths are expected to be in the licenses directory of the licenseclassifier package. +func ReadLicenseFile(filename string) ([]byte, error) { + return licenseFS.ReadFile(filename) +} + +// ReadLicenseDir reads directory containing the license files. +func ReadLicenseDir() ([]fs.DirEntry, error) { + return licenseFS.ReadDir(".") +} @@ -1,6 +1,6 @@ module github.com/google/licenseclassifier/v2 -go 1.15 +go 1.16 require ( github.com/davecgh/go-spew v1.1.1 |