Fix the induced phrases search to not trigger on modified URLs.

URLs are stored as a single token by the classifier, meaning they can only introduce one error in exact matching, which is good for approximate matching, since a lengthy URL that is changed would otherwise introduce additional errors. However, this doesn't work well with the induced phrases check, since a modified URL would get stored as an insert/delete pair that might contain the triggering word (e.g. "apache"). This meant that a license that had a different Apache URL from our pristine copies would get rejected because it "introduced" the word "apache". This fixes the logic to not trigger an induced phrase condition if the insert is paired with a delete that also contains the induced phrase, since this means it did exist in the document after all. Diagnostic logging for the diffing phase now includes output to help triage these conditions. This proved very useful in identifying older Apache licenses that were incorrectly rejected and sometimes barely matching non-applicable licenses. PiperOrigin-RevId: 407908906
author: Bill Neubauer <wcn@google.com> 2021-11-05 14:20:00 -0700
committer: Bill Neubauer <wcn@google.com> 2022-03-16 15:34:52 -0700
commit: c65735a948f193d619a05871e068a4d4c451b705 (patch)
tree: b87cbb1550a112353e59d493ef1c373c038a41db /v2
parent: 96b685b82f5466e36faa53de2f3cd35196376e60 (diff)
download: licenseclassifier-c65735a948f193d619a05871e068a4d4c451b705.tar.gz
1 files changed, 17 insertions, 1 deletions
diff --git a/v2/scoring.go b/v2/scoring.go
index 4ce671c..541fc7b 100644
--- a/v2/scoring.go
+++ b/v2/scoring.go
@@ -18,6 +18,7 @@ import (
 	"strings"
 	"unicode"
 
+	"github.com/davecgh/go-spew/spew"
 	"github.com/sergi/go-diff/diffmatchpatch"
 )
 
@@ -42,6 +43,11 @@ func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownSt
 
 	start, end := diffRange(known.norm, diffs)
 	distance := scoreDiffs(id, diffs[start:end])
+
+	if c.tc.traceScoring(known.s.origin) {
+		c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end]))
+	}
+
 	if distance < 0 {
 		// If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match.
 		if c.tc.traceScoring(known.s.origin) {
@@ -126,7 +132,7 @@ func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int {
 	// previously cached.
 	prevText := ""
 	prevDelete := ""
-	for _, diff := range diffs {
+	for i, diff := range diffs {
 		text := diff.Text
 		switch diff.Type {
 		case diffmatchpatch.DiffInsert:
@@ -169,6 +175,16 @@ func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int {
 				if strings.HasPrefix(id, k) {
 					for _, p := range ps {
 						if strings.Index(text, p) != -1 {
+							// Check to make sure there isn't a corresponding diff for this
+							// insert that also contains the text. This prevents against diff
+							// blocks that are too big and force a false hit on this check,
+							// which usually happens with URLs since they are stored in one
+							// token but can happen in other cases as well. We don't look just
+							// for delete diffs because the subsequent text may reference the
+							// content in case a URL was truncated.
+							if i+1 < len(diffs) && strings.Index(diffs[i+1].Text, p) != -1 {
+								continue
+							}
 							return introducedPhraseChange
 						}
 					}
author	Bill Neubauer <wcn@google.com>	2021-11-05 14:20:00 -0700
committer	Bill Neubauer <wcn@google.com>	2022-03-16 15:34:52 -0700
commit	c65735a948f193d619a05871e068a4d4c451b705 (patch)
tree	b87cbb1550a112353e59d493ef1c373c038a41db /v2
parent	96b685b82f5466e36faa53de2f3cd35196376e60 (diff)
download	licenseclassifier-c65735a948f193d619a05871e068a4d4c451b705.tar.gz