diff options
Diffstat (limited to 'v2/scoring.go')
-rw-r--r-- | v2/scoring.go | 228 |
1 files changed, 228 insertions, 0 deletions
diff --git a/v2/scoring.go b/v2/scoring.go new file mode 100644 index 0000000..616ea78 --- /dev/null +++ b/v2/scoring.go @@ -0,0 +1,228 @@ +// Copyright 2020 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package classifier + +import ( + "strings" + "unicode" + + "github.com/davecgh/go-spew/spew" + "github.com/sergi/go-diff/diffmatchpatch" +) + +// return values for the distance function that explain why a diff +// is not an acceptable match for the source document. +const ( + versionChange = -1 + introducedPhraseChange = -2 + lesserGPLChange = -3 +) + +// score computes a metric of similarity between the known and unknown +// document, including the offsets into the unknown that yield the content +// generating the computed similarity. +func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd int) (float64, int, int) { + if c.tc.traceScoring(known.s.origin) { + c.tc.trace("Scoring %s: [%d-%d]", known.s.origin, unknownStart, unknownEnd) + } + + knownLength := known.size() + diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength) + + start, end := diffRange(known.Norm, diffs) + distance := scoreDiffs(id, diffs[start:end]) + + if c.tc.traceScoring(known.s.origin) { + c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end])) + } + + if distance < 0 { + // If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match. + if c.tc.traceScoring(known.s.origin) { + c.tc.trace("Distance result %v, rejected match", distance) + } + return 0.0, 0, 0 + } + + // Applying the diffRange-generated offsets provides the run of text from the + // target most closely correlated to the source. This is the process for + // compensating for errors from the searchset. With the reduced text, we + // compute the final confidence score and exact text locations for the + // matched content. + // The diff slice consists of three regions: an optional deletion diff for + // target text before the source, n elements that represent the diff between + // the source and target, and an optional deletion diff for text after the + // target. The helper functions identify the portions of the slice + // corresponding to those regions. This results in a more accurate + // confidence score and better position detection of the source in the + // target. + conf, so, eo := confidencePercentage(knownLength, distance), textLength(diffs[:start]), textLength(diffs[end:]) + + if c.tc.traceScoring(known.s.origin) { + c.tc.trace("Score result: %v [%d-%d]", conf, so, eo) + } + return conf, so, eo +} + +// confidencePercentage computes a confidence match score for the lengths, +// handling the cases where source and target lengths differ. +func confidencePercentage(klen, distance int) float64 { + // No text is matched at 100% confidence (avoid divide by zero). + if klen == 0 { + return 1.0 + } + + // Return a computed fractional match against the known text. + return 1.0 - float64(distance)/float64(klen) +} + +// diffLevenshteinWord computes word-based Levenshtein count. +func diffLevenshteinWord(diffs []diffmatchpatch.Diff) int { + levenshtein := 0 + insertions := 0 + deletions := 0 + + for _, aDiff := range diffs { + switch aDiff.Type { + case diffmatchpatch.DiffInsert: + insertions += wordLen(aDiff.Text) + case diffmatchpatch.DiffDelete: + deletions += wordLen(aDiff.Text) + case diffmatchpatch.DiffEqual: + // A deletion and an insertion is one substitution. + levenshtein += max(insertions, deletions) + insertions = 0 + deletions = 0 + } + } + + levenshtein += max(insertions, deletions) + return levenshtein +} + +func isVersionNumber(in string) bool { + for _, r := range in { + if !unicode.IsDigit(r) && r != '.' { + return false + } + } + return true +} + +// scoreDiffs returns a score rating the acceptability of these diffs. A +// negative value means that the changes represented by the diff are not an +// acceptable transformation since it would change the underlying license. A +// positive value indicates the Levenshtein word distance. +func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int { + // We make a pass looking for unacceptable substitutions + // Delete diffs are always ordered before insert diffs. This is leveraged to + // analyze a change by checking an insert against the delete text that was + // previously cached. + prevText := "" + prevDelete := "" + for i, diff := range diffs { + text := diff.Text + switch diff.Type { + case diffmatchpatch.DiffInsert: + num := text + if i := strings.Index(num, " "); i != -1 { + num = num[0:i] + } + if isVersionNumber(num) && strings.HasSuffix(prevText, "version") { + if !strings.HasSuffix(prevText, "the standard version") && !strings.HasSuffix(prevText, "the contributor version") { + return versionChange + } + } + // There are certain phrases that can't be introduced to make a license + // hit. TODO: would like to generate this programmatically. Most of + // these are words or phrases that appear in a single/small number of + // licenses. Can we leverage frequency analysis to identify these + // interesting words/phrases and auto-extract them? + + inducedPhrases := map[string][]string{ + "AGPL": {"affero"}, + "Atmel": {"atmel"}, + "Apache": {"apache"}, + "BSD": {"bsd"}, + "BSD-3-Clause-Attribution": {"acknowledgment"}, + "bzip2": {"seward"}, + "GPL-2.0-with-GCC-exception": {"gcc linking exception"}, + "GPL-2.0-with-autoconf-exception": {"autoconf exception"}, + "GPL-2.0-with-bison-exception": {"bison exception"}, + "GPL-2.0-with-classpath-exception": {"class path exception"}, + "GPL-2.0-with-font-exception": {"font exception"}, + "LGPL-2.0": {"library"}, + "ImageMagick": {"imagemagick"}, + "PHP": {"php"}, + "SISSL": {"sun standards"}, + "SGI-B": {"silicon graphics"}, + "SunPro": {"sunpro"}, + "X11": {"x consortium"}, + } + + for k, ps := range inducedPhrases { + if strings.HasPrefix(LicenseName(id), k) { + for _, p := range ps { + if strings.Index(text, p) != -1 { + // Check to make sure there isn't a corresponding diff for this + // insert that also contains the text. This prevents against diff + // blocks that are too big and force a false hit on this check, + // which usually happens with URLs since they are stored in one + // token but can happen in other cases as well. We don't look just + // for delete diffs because the subsequent text may reference the + // content in case a URL was truncated. + if i+1 < len(diffs) && strings.Index(diffs[i+1].Text, p) != -1 { + continue + } + return introducedPhraseChange + } + } + } + } + + // Ignore changes between "library" and "lesser" in a GNU context as they + // changed the terms, but look for introductions of Lesser that would + // otherwise disqualify a match. + if text == "lesser" && strings.HasSuffix(prevText, "gnu") && prevDelete != "library" { + // The LGPL 3.0 doesn't have a standard header, so people tend to craft + // their own. As a result, sometimes the warranty clause refers to the + // GPL instead of the LGPL. This is fine from a licensing perspective, + // but we need to tweak matching to ignore that particular case. In + // other circumstances, inserting or removing the word Lesser in the + // GPL context is not an acceptable change. There is also a reference to + // it when suggesting to use the LGPL. + if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") { + return lesserGPLChange + } + } + case diffmatchpatch.DiffEqual: + prevText = text + prevDelete = "" + + case diffmatchpatch.DiffDelete: + // Avoid substitution in most cases. The two exceptions are with usage + // statements that are talking about *another* license, and don't affect + // the detection of the current license. + if (text == "lesser" || text == "library") && strings.HasSuffix(prevText, "gnu") { + // Same as above to avoid matching GPL instead of LGPL here. + if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") { + return lesserGPLChange + } + } + prevDelete = text + } + } + return diffLevenshteinWord(diffs) +} |