aboutsummaryrefslogtreecommitdiff
path: root/tools/identify_license/backend/backend.go
blob: a9e46ba6d27b1f72727f5a0ef4ecaf33555a39e7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package backend contains the necessary functions to classify a license.
package backend

import (
	"context"
	"fmt"
	"io/ioutil"
	"log"
	"sync"
	"time"

	"github.com/google/licenseclassifier"
	"github.com/google/licenseclassifier/commentparser"
	"github.com/google/licenseclassifier/commentparser/language"
	"github.com/google/licenseclassifier/tools/identify_license/results"
)

// ClassifierInterface is the interface each backend must implement.
// ClassifierBackend in this file is the implementation that reads files from
// the local filesystem.
type ClassifierInterface interface {
	// Close releases whatever the backend holds open, if anything.
	Close()
	// ClassifyLicenses classifies the named files and returns the errors
	// encountered while doing so. In ClassifierBackend the headers flag is
	// handed unchanged to the classifier's MultipleMatch call.
	ClassifyLicenses(filenames []string, headers bool) []error
	// ClassifyLicensesWithContext is ClassifyLicenses with a caller-supplied
	// context. ClassifierBackend checks the context before starting each file
	// and reports files it skipped because the context was done.
	ClassifyLicensesWithContext(ctx context.Context, filenames []string, headers bool) []error
	// GetResults returns the classifications the backend has recorded.
	GetResults() results.LicenseTypes
}

// ClassifierBackend is an object that handles classifying a license.
// Create one with New; its methods use pointer receivers, and the struct
// embeds a mutex, so it should be shared by pointer rather than copied.
type ClassifierBackend struct {
	// results accumulates one entry per match found by classifyLicense.
	results    results.LicenseTypes
	// mu is held by classifyLicense while it appends to results.
	mu         sync.Mutex
	// classifier is the license classifier constructed in New.
	classifier *licenseclassifier.License
}

// New builds a ClassifierBackend that classifies files on the local
// filesystem. threshold is passed straight to the underlying license
// classifier constructor. When forbiddenOnly is true the classifier is built
// with licenseclassifier.NewWithForbiddenLicenses, otherwise with
// licenseclassifier.New. Any constructor error is returned as-is together
// with a nil backend.
func New(threshold float64, forbiddenOnly bool) (*ClassifierBackend, error) {
	var (
		classifier *licenseclassifier.License
		buildErr   error
	)

	switch {
	case forbiddenOnly:
		classifier, buildErr = licenseclassifier.NewWithForbiddenLicenses(threshold)
	default:
		classifier, buildErr = licenseclassifier.New(threshold)
	}

	if buildErr != nil {
		return nil, buildErr
	}

	backend := &ClassifierBackend{classifier: classifier}
	return backend, nil
}

// Close is a no-op: this backend holds nothing that needs to be released.
// It exists so that ClassifierBackend satisfies ClassifierInterface.
func (b *ClassifierBackend) Close() {}

// ClassifyLicenses runs the license classifier over the given files and
// returns the errors encountered. It is ClassifyLicensesWithContext called
// with a background context, i.e. without any deadline or cancelation.
func (b *ClassifierBackend) ClassifyLicenses(filenames []string, headers bool) (errors []error) {
	background := context.Background()
	errors = b.ClassifyLicensesWithContext(background, filenames, headers)
	return errors
}

// ClassifyLicensesWithContext runs the license classifier over the given
// files using a bounded pool of worker goroutines and returns every error
// that was encountered (nil if there were none).
//
// The context is checked before each file is started: a file dequeued after
// ctx is done is not classified and is reported in the returned errors
// instead. A classification that is already running is not interrupted.
//
// The order of the returned errors is unspecified because files are processed
// concurrently.
func (b *ClassifierBackend) ClassifyLicensesWithContext(ctx context.Context, filenames []string, headers bool) (errors []error) {
	// Queue every filename up front and close the channel so that the workers
	// terminate once it has been drained.
	files := make(chan string, len(filenames))
	for _, f := range filenames {
		files <- f
	}
	close(files)

	// Each file produces at most one error, so with this capacity a worker
	// never blocks on send.
	errs := make(chan error, len(filenames))

	// We use a pool because the OS limits the number of files that can be
	// open at any one time. There is no point in starting more workers than
	// there are files to process.
	const maxTasks = 1000
	numTasks := len(filenames)
	if numTasks > maxTasks {
		numTasks = maxTasks
	}

	var wg sync.WaitGroup
	wg.Add(numTasks)

	for i := 0; i < numTasks; i++ {
		go func() {
			// Ensure that however this function terminates, the wait group
			// is unblocked.
			defer wg.Done()

			// Range ends only when the channel is closed and empty. An empty
			// filename is therefore treated like any other entry (and will be
			// reported as unreadable) rather than being mistaken for the end
			// of input and silently stopping this worker.
			for filename := range files {
				// If the context is done, record that the file was not
				// classified due to the context's termination.
				if err := ctx.Err(); err != nil {
					errs <- fmt.Errorf("file %s not classified due to context completion: %v", filename, err)
					continue
				}

				if err := b.classifyLicense(filename, headers); err != nil {
					errs <- err
				}
			}
		}()
	}

	wg.Wait()
	close(errs)

	for err := range errs {
		errors = append(errors, err)
	}
	return errors
}

// classifyLicense reads one file and records every license match found in it.
// It is run from the worker goroutines started by ClassifyLicensesWithContext.
//
// If the file's language is not recognised, the whole file content is matched
// as a single text. Otherwise the file is run through the comment parser and
// each chunk it yields is matched separately. The headers flag is forwarded to
// the classifier's MultipleMatch call.
//
// The only error returned is a failure to read the file.
func (b *ClassifierBackend) classifyLicense(filename string, headers bool) error {
	data, readErr := ioutil.ReadFile(filename)
	if readErr != nil {
		return fmt.Errorf("unable to read %q: %v", filename, readErr)
	}

	// record classifies one piece of text and appends each match to the
	// shared results, taking the lock per match because several workers
	// append concurrently.
	record := func(text string) {
		for _, match := range b.classifier.MultipleMatch(text, headers) {
			entry := &results.LicenseType{
				Filename:   filename,
				Name:       match.Name,
				Confidence: match.Confidence,
				Offset:     match.Offset,
				Extent:     match.Extent,
			}
			b.mu.Lock()
			b.results = append(b.results, entry)
			b.mu.Unlock()
		}
	}

	log.Printf("Classifying license(s): %s", filename)
	began := time.Now()

	lang := language.ClassifyLanguage(filename)
	if lang != language.Unknown {
		log.Printf("detected language: %v", lang)
		parsed := commentparser.Parse(data, lang)
		for chunk := range parsed.ChunkIterator() {
			record(chunk.String())
		}
	} else {
		record(string(data))
	}

	log.Printf("Finished Classifying License %q: %v", filename, time.Since(began))
	return nil
}

// GetResults returns the results of the classifications recorded so far.
//
// The field is read while holding b.mu, the same lock classifyLicense holds
// when appending, so calling GetResults while a classification is still
// running does not race with those appends. The returned slice is not a copy.
func (b *ClassifierBackend) GetResults() results.LicenseTypes {
	b.mu.Lock()
	defer b.mu.Unlock()
	return b.results
}