1 files changed, 228 insertions, 0 deletions
diff --git a/v2/scoring.go b/v2/scoring.go
new file mode 100644
index 0000000..616ea78
--- /dev/null
+++ b/v2/scoring.go
@@ -0,0 +1,228 @@
+// Copyright 2020 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package classifier
+
+import (
+	"strings"
+	"unicode"
+
+	"github.com/davecgh/go-spew/spew"
+	"github.com/sergi/go-diff/diffmatchpatch"
+)
+
+// return values for the distance function that explain why a diff
+// is not an acceptable match for the source document.
+const (
+	versionChange          = -1
+	introducedPhraseChange = -2
+	lesserGPLChange        = -3
+)
+
+// score computes a metric of similarity between the known and unknown
+// document, including the offsets into the unknown that yield the content
+// generating the computed similarity.
+func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd int) (float64, int, int) {
+	if c.tc.traceScoring(known.s.origin) {
+		c.tc.trace("Scoring %s: [%d-%d]", known.s.origin, unknownStart, unknownEnd)
+	}
+
+	knownLength := known.size()
+	diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength)
+
+	start, end := diffRange(known.Norm, diffs)
+	distance := scoreDiffs(id, diffs[start:end])
+
+	if c.tc.traceScoring(known.s.origin) {
+		c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end]))
+	}
+
+	if distance < 0 {
+		// If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match.
+		if c.tc.traceScoring(known.s.origin) {
+			c.tc.trace("Distance result %v, rejected match", distance)
+		}
+		return 0.0, 0, 0
+	}
+
+	// Applying the diffRange-generated offsets provides the run of text from the
+	// target most closely correlated to the source.  This is the process for
+	// compensating for errors from the searchset. With the reduced text, we
+	// compute the final confidence score and exact text locations for the
+	// matched content.
+	// The diff slice consists of three regions: an optional deletion diff for
+	// target text before the source, n elements that represent the diff between
+	// the source and target, and an optional deletion diff for text after the
+	// target.  The helper functions identify the portions of the slice
+	// corresponding to those regions.  This results in a more accurate
+	// confidence score and better position detection of the source in the
+	// target.
+	conf, so, eo := confidencePercentage(knownLength, distance), textLength(diffs[:start]), textLength(diffs[end:])
+
+	if c.tc.traceScoring(known.s.origin) {
+		c.tc.trace("Score result: %v [%d-%d]", conf, so, eo)
+	}
+	return conf, so, eo
+}
+
+// confidencePercentage computes a confidence match score for the lengths,
+// handling the cases where source and target lengths differ.
+func confidencePercentage(klen, distance int) float64 {
+	// No text is matched at 100% confidence (avoid divide by zero).
+	if klen == 0 {
+		return 1.0
+	}
+
+	// Return a computed fractional match against the known text.
+	return 1.0 - float64(distance)/float64(klen)
+}
+
+// diffLevenshteinWord computes word-based Levenshtein count.
+func diffLevenshteinWord(diffs []diffmatchpatch.Diff) int {
+	levenshtein := 0
+	insertions := 0
+	deletions := 0
+
+	for _, aDiff := range diffs {
+		switch aDiff.Type {
+		case diffmatchpatch.DiffInsert:
+			insertions += wordLen(aDiff.Text)
+		case diffmatchpatch.DiffDelete:
+			deletions += wordLen(aDiff.Text)
+		case diffmatchpatch.DiffEqual:
+			// A deletion and an insertion is one substitution.
+			levenshtein += max(insertions, deletions)
+			insertions = 0
+			deletions = 0
+		}
+	}
+
+	levenshtein += max(insertions, deletions)
+	return levenshtein
+}
+
+func isVersionNumber(in string) bool {
+	for _, r := range in {
+		if !unicode.IsDigit(r) && r != '.' {
+			return false
+		}
+	}
+	return true
+}
+
+// scoreDiffs returns a score rating the acceptability of these diffs.  A
+// negative value means that the changes represented by the diff are not an
+// acceptable transformation since it would change the underlying license.  A
+// positive value indicates the Levenshtein word distance.
+func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int {
+	// We make a pass looking for unacceptable substitutions
+	// Delete diffs are always ordered before insert diffs. This is leveraged to
+	// analyze a change by checking an insert against the delete text that was
+	// previously cached.
+	prevText := ""
+	prevDelete := ""
+	for i, diff := range diffs {
+		text := diff.Text
+		switch diff.Type {
+		case diffmatchpatch.DiffInsert:
+			num := text
+			if i := strings.Index(num, " "); i != -1 {
+				num = num[0:i]
+			}
+			if isVersionNumber(num) && strings.HasSuffix(prevText, "version") {
+				if !strings.HasSuffix(prevText, "the standard version") && !strings.HasSuffix(prevText, "the contributor version") {
+					return versionChange
+				}
+			}
+			// There are certain phrases that can't be introduced to make a license
+			// hit.  TODO: would like to generate this programmatically. Most of
+			// these are words or phrases that appear in a single/small number of
+			// licenses. Can we leverage frequency analysis to identify these
+			// interesting words/phrases and auto-extract them?
+
+			inducedPhrases := map[string][]string{
+				"AGPL":                             {"affero"},
+				"Atmel":                            {"atmel"},
+				"Apache":                           {"apache"},
+				"BSD":                              {"bsd"},
+				"BSD-3-Clause-Attribution":         {"acknowledgment"},
+				"bzip2":                            {"seward"},
+				"GPL-2.0-with-GCC-exception":       {"gcc linking exception"},
+				"GPL-2.0-with-autoconf-exception":  {"autoconf exception"},
+				"GPL-2.0-with-bison-exception":     {"bison exception"},
+				"GPL-2.0-with-classpath-exception": {"class path exception"},
+				"GPL-2.0-with-font-exception":      {"font exception"},
+				"LGPL-2.0":                         {"library"},
+				"ImageMagick":                      {"imagemagick"},
+				"PHP":                              {"php"},
+				"SISSL":                            {"sun standards"},
+				"SGI-B":                            {"silicon graphics"},
+				"SunPro":                           {"sunpro"},
+				"X11":                              {"x consortium"},
+			}
+
+			for k, ps := range inducedPhrases {
+				if strings.HasPrefix(LicenseName(id), k) {
+					for _, p := range ps {
+						if strings.Index(text, p) != -1 {
+							// Check to make sure there isn't a corresponding diff for this
+							// insert that also contains the text. This prevents against diff
+							// blocks that are too big and force a false hit on this check,
+							// which usually happens with URLs since they are stored in one
+							// token but can happen in other cases as well. We don't look just
+							// for delete diffs because the subsequent text may reference the
+							// content in case a URL was truncated.
+							if i+1 < len(diffs) && strings.Index(diffs[i+1].Text, p) != -1 {
+								continue
+							}
+							return introducedPhraseChange
+						}
+					}
+				}
+			}
+
+			// Ignore changes between "library" and "lesser" in a GNU context as they
+			// changed the terms, but look for introductions of Lesser that would
+			// otherwise disqualify a match.
+			if text == "lesser" && strings.HasSuffix(prevText, "gnu") && prevDelete != "library" {
+				// The LGPL 3.0 doesn't have a standard header, so people tend to craft
+				// their own. As a result, sometimes the warranty clause refers to the
+				// GPL instead of the LGPL. This is fine from a licensing perspective,
+				// but we need to tweak matching to ignore that particular case. In
+				// other circumstances, inserting or removing the word Lesser in the
+				// GPL context is not an acceptable change. There is also a reference to
+				// it when suggesting to use the LGPL.
+				if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
+					return lesserGPLChange
+				}
+			}
+		case diffmatchpatch.DiffEqual:
+			prevText = text
+			prevDelete = ""
+
+		case diffmatchpatch.DiffDelete:
+			// Avoid substitution in most cases. The two exceptions are with usage
+			// statements that are talking about *another* license, and don't affect
+			// the detection of the current license.
+			if (text == "lesser" || text == "library") && strings.HasSuffix(prevText, "gnu") {
+				// Same as above to avoid matching GPL instead of LGPL here.
+				if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
+					return lesserGPLChange
+				}
+			}
+			prevDelete = text
+		}
+	}
+	return diffLevenshteinWord(diffs)
+}