aboutsummaryrefslogtreecommitdiff
path: root/v2/scoring.go
diff options
context:
space:
mode:
Diffstat (limited to 'v2/scoring.go')
-rw-r--r--v2/scoring.go228
1 files changed, 228 insertions, 0 deletions
diff --git a/v2/scoring.go b/v2/scoring.go
new file mode 100644
index 0000000..616ea78
--- /dev/null
+++ b/v2/scoring.go
@@ -0,0 +1,228 @@
+// Copyright 2020 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package classifier
+
+import (
+ "strings"
+ "unicode"
+
+ "github.com/davecgh/go-spew/spew"
+ "github.com/sergi/go-diff/diffmatchpatch"
+)
+
+// return values for the distance function that explain why a diff
+// is not an acceptable match for the source document.
+const (
+ versionChange = -1
+ introducedPhraseChange = -2
+ lesserGPLChange = -3
+)
+
+// score computes a metric of similarity between the known and unknown
+// document, including the offsets into the unknown that yield the content
+// generating the computed similarity.
+func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd int) (float64, int, int) {
+ if c.tc.traceScoring(known.s.origin) {
+ c.tc.trace("Scoring %s: [%d-%d]", known.s.origin, unknownStart, unknownEnd)
+ }
+
+ knownLength := known.size()
+ diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength)
+
+ start, end := diffRange(known.Norm, diffs)
+ distance := scoreDiffs(id, diffs[start:end])
+
+ if c.tc.traceScoring(known.s.origin) {
+ c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end]))
+ }
+
+ if distance < 0 {
+ // If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match.
+ if c.tc.traceScoring(known.s.origin) {
+ c.tc.trace("Distance result %v, rejected match", distance)
+ }
+ return 0.0, 0, 0
+ }
+
+ // Applying the diffRange-generated offsets provides the run of text from the
+ // target most closely correlated to the source. This is the process for
+ // compensating for errors from the searchset. With the reduced text, we
+ // compute the final confidence score and exact text locations for the
+ // matched content.
+ // The diff slice consists of three regions: an optional deletion diff for
+ // target text before the source, n elements that represent the diff between
+ // the source and target, and an optional deletion diff for text after the
+ // target. The helper functions identify the portions of the slice
+ // corresponding to those regions. This results in a more accurate
+ // confidence score and better position detection of the source in the
+ // target.
+ conf, so, eo := confidencePercentage(knownLength, distance), textLength(diffs[:start]), textLength(diffs[end:])
+
+ if c.tc.traceScoring(known.s.origin) {
+ c.tc.trace("Score result: %v [%d-%d]", conf, so, eo)
+ }
+ return conf, so, eo
+}
+
+// confidencePercentage computes a confidence match score for the lengths,
+// handling the cases where source and target lengths differ.
+func confidencePercentage(klen, distance int) float64 {
+ // No text is matched at 100% confidence (avoid divide by zero).
+ if klen == 0 {
+ return 1.0
+ }
+
+ // Return a computed fractional match against the known text.
+ return 1.0 - float64(distance)/float64(klen)
+}
+
+// diffLevenshteinWord computes word-based Levenshtein count.
+func diffLevenshteinWord(diffs []diffmatchpatch.Diff) int {
+ levenshtein := 0
+ insertions := 0
+ deletions := 0
+
+ for _, aDiff := range diffs {
+ switch aDiff.Type {
+ case diffmatchpatch.DiffInsert:
+ insertions += wordLen(aDiff.Text)
+ case diffmatchpatch.DiffDelete:
+ deletions += wordLen(aDiff.Text)
+ case diffmatchpatch.DiffEqual:
+ // A deletion and an insertion is one substitution.
+ levenshtein += max(insertions, deletions)
+ insertions = 0
+ deletions = 0
+ }
+ }
+
+ levenshtein += max(insertions, deletions)
+ return levenshtein
+}
+
+func isVersionNumber(in string) bool {
+ for _, r := range in {
+ if !unicode.IsDigit(r) && r != '.' {
+ return false
+ }
+ }
+ return true
+}
+
+// scoreDiffs returns a score rating the acceptability of these diffs. A
+// negative value means that the changes represented by the diff are not an
+// acceptable transformation since it would change the underlying license. A
+// positive value indicates the Levenshtein word distance.
+func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int {
+ // We make a pass looking for unacceptable substitutions
+ // Delete diffs are always ordered before insert diffs. This is leveraged to
+ // analyze a change by checking an insert against the delete text that was
+ // previously cached.
+ prevText := ""
+ prevDelete := ""
+ for i, diff := range diffs {
+ text := diff.Text
+ switch diff.Type {
+ case diffmatchpatch.DiffInsert:
+ num := text
+ if i := strings.Index(num, " "); i != -1 {
+ num = num[0:i]
+ }
+ if isVersionNumber(num) && strings.HasSuffix(prevText, "version") {
+ if !strings.HasSuffix(prevText, "the standard version") && !strings.HasSuffix(prevText, "the contributor version") {
+ return versionChange
+ }
+ }
+ // There are certain phrases that can't be introduced to make a license
+ // hit. TODO: would like to generate this programmatically. Most of
+ // these are words or phrases that appear in a single/small number of
+ // licenses. Can we leverage frequency analysis to identify these
+ // interesting words/phrases and auto-extract them?
+
+ inducedPhrases := map[string][]string{
+ "AGPL": {"affero"},
+ "Atmel": {"atmel"},
+ "Apache": {"apache"},
+ "BSD": {"bsd"},
+ "BSD-3-Clause-Attribution": {"acknowledgment"},
+ "bzip2": {"seward"},
+ "GPL-2.0-with-GCC-exception": {"gcc linking exception"},
+ "GPL-2.0-with-autoconf-exception": {"autoconf exception"},
+ "GPL-2.0-with-bison-exception": {"bison exception"},
+ "GPL-2.0-with-classpath-exception": {"class path exception"},
+ "GPL-2.0-with-font-exception": {"font exception"},
+ "LGPL-2.0": {"library"},
+ "ImageMagick": {"imagemagick"},
+ "PHP": {"php"},
+ "SISSL": {"sun standards"},
+ "SGI-B": {"silicon graphics"},
+ "SunPro": {"sunpro"},
+ "X11": {"x consortium"},
+ }
+
+ for k, ps := range inducedPhrases {
+ if strings.HasPrefix(LicenseName(id), k) {
+ for _, p := range ps {
+ if strings.Index(text, p) != -1 {
+ // Check to make sure there isn't a corresponding diff for this
+ // insert that also contains the text. This prevents against diff
+ // blocks that are too big and force a false hit on this check,
+ // which usually happens with URLs since they are stored in one
+ // token but can happen in other cases as well. We don't look just
+ // for delete diffs because the subsequent text may reference the
+ // content in case a URL was truncated.
+ if i+1 < len(diffs) && strings.Index(diffs[i+1].Text, p) != -1 {
+ continue
+ }
+ return introducedPhraseChange
+ }
+ }
+ }
+ }
+
+ // Ignore changes between "library" and "lesser" in a GNU context as they
+ // changed the terms, but look for introductions of Lesser that would
+ // otherwise disqualify a match.
+ if text == "lesser" && strings.HasSuffix(prevText, "gnu") && prevDelete != "library" {
+ // The LGPL 3.0 doesn't have a standard header, so people tend to craft
+ // their own. As a result, sometimes the warranty clause refers to the
+ // GPL instead of the LGPL. This is fine from a licensing perspective,
+ // but we need to tweak matching to ignore that particular case. In
+ // other circumstances, inserting or removing the word Lesser in the
+ // GPL context is not an acceptable change. There is also a reference to
+ // it when suggesting to use the LGPL.
+ if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
+ return lesserGPLChange
+ }
+ }
+ case diffmatchpatch.DiffEqual:
+ prevText = text
+ prevDelete = ""
+
+ case diffmatchpatch.DiffDelete:
+ // Avoid substitution in most cases. The two exceptions are with usage
+ // statements that are talking about *another* license, and don't affect
+ // the detection of the current license.
+ if (text == "lesser" || text == "library") && strings.HasSuffix(prevText, "gnu") {
+ // Same as above to avoid matching GPL instead of LGPL here.
+ if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
+ return lesserGPLChange
+ }
+ }
+ prevDelete = text
+ }
+ }
+ return diffLevenshteinWord(diffs)
+}