aboutsummaryrefslogtreecommitdiff
path: root/v2
diff options
context:
space:
mode:
authorBill Neubauer <wcn@google.com>2021-11-05 14:20:00 -0700
committerBill Neubauer <wcn@google.com>2022-03-16 15:34:52 -0700
commitc65735a948f193d619a05871e068a4d4c451b705 (patch)
treeb87cbb1550a112353e59d493ef1c373c038a41db /v2
parent96b685b82f5466e36faa53de2f3cd35196376e60 (diff)
downloadlicenseclassifier-c65735a948f193d619a05871e068a4d4c451b705.tar.gz
Fix the induced phrases search to not trigger on modified URLs.
URLs are stored as a single token by the classifier, meaning they can only introduce 1 error in exact matching, which is good for approximate matching, since a lengthy URL that is changed would otherwise introduce additional errors. However, this doesn't work well with the induced phrases check since a modified URL would get stored as an insert/delete pair that might contain the triggering word (e.g. "apache"). This meant that a license that had a different Apache URL from our pristine copies would get rejected because it "introduced" the word "apache". This fixes the logic to not trigger an induced phrase condition if the insert is paired with a delete that also contains the induced phrase, since this means it did exist in the document after all. Diagnostic logging for the diffing phase now includes output to help triage these conditions. This proved very useful in identifying older Apache licenses that were incorrectly rejected and sometimes barely matching non-applicable licenses. PiperOrigin-RevId: 407908906
Diffstat (limited to 'v2')
-rw-r--r--v2/scoring.go18
1 files changed, 17 insertions, 1 deletions
diff --git a/v2/scoring.go b/v2/scoring.go
index 4ce671c..541fc7b 100644
--- a/v2/scoring.go
+++ b/v2/scoring.go
@@ -18,6 +18,7 @@ import (
"strings"
"unicode"
+ "github.com/davecgh/go-spew/spew"
"github.com/sergi/go-diff/diffmatchpatch"
)
@@ -42,6 +43,11 @@ func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownSt
start, end := diffRange(known.norm, diffs)
distance := scoreDiffs(id, diffs[start:end])
+
+ if c.tc.traceScoring(known.s.origin) {
+ c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end]))
+ }
+
if distance < 0 {
// If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match.
if c.tc.traceScoring(known.s.origin) {
@@ -126,7 +132,7 @@ func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int {
// previously cached.
prevText := ""
prevDelete := ""
- for _, diff := range diffs {
+ for i, diff := range diffs {
text := diff.Text
switch diff.Type {
case diffmatchpatch.DiffInsert:
@@ -169,6 +175,16 @@ func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int {
if strings.HasPrefix(id, k) {
for _, p := range ps {
if strings.Index(text, p) != -1 {
+ // Check to make sure there isn't a corresponding diff for this
+ // insert that also contains the text. This prevents against diff
+ // blocks that are too big and force a false hit on this check,
+ // which usually happens with URLs since they are stored in one
+ // token but can happen in other cases as well. We don't look just
+ // for delete diffs because the subsequent text may reference the
+ // content in case a URL was truncated.
+ if i+1 < len(diffs) && strings.Index(diffs[i+1].Text, p) != -1 {
+ continue
+ }
return introducedPhraseChange
}
}