diff options
author | Bill Neubauer <wcn@google.com> | 2021-11-05 14:20:00 -0700 |
---|---|---|
committer | Bill Neubauer <wcn@google.com> | 2022-03-16 15:34:52 -0700 |
commit | c65735a948f193d619a05871e068a4d4c451b705 (patch) | |
tree | b87cbb1550a112353e59d493ef1c373c038a41db /v2 | |
parent | 96b685b82f5466e36faa53de2f3cd35196376e60 (diff) | |
download | licenseclassifier-c65735a948f193d619a05871e068a4d4c451b705.tar.gz |
Fix the induced phrases search to not trigger on modified URLs.
URLs are stored as a single token by the classifier, meaning they can only
introduce 1 error in exact matching, which is good for approximate matching,
since a lengthy URL that is changed would otherwise introduce additional
errors.
However, this doesn't work well with the induced phrases check since a modified
URL would get stored as an insert/delete pair that might contain the triggering
word (e.g. "apache"). This meant that a license that had a different Apache URL
from our pristine copies would get rejected because it "introduced" the word
"apache".
This fixes the logic to not trigger an induced phrase condition if the insert
is paired with a delete that also contains the induced phrase, since this means
it did exist in the document after all. Diagnostic logging for the diffing
phase now includes output to help triage these conditions.
This proved very useful in identifying older Apache licenses that were
incorrectly rejected and that sometimes barely matched non-applicable licenses.
PiperOrigin-RevId: 407908906
Diffstat (limited to 'v2')
-rw-r--r-- | v2/scoring.go | 18 |
1 files changed, 17 insertions, 1 deletions
diff --git a/v2/scoring.go b/v2/scoring.go index 4ce671c..541fc7b 100644 --- a/v2/scoring.go +++ b/v2/scoring.go @@ -18,6 +18,7 @@ import ( "strings" "unicode" + "github.com/davecgh/go-spew/spew" "github.com/sergi/go-diff/diffmatchpatch" ) @@ -42,6 +43,11 @@ func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownSt start, end := diffRange(known.norm, diffs) distance := scoreDiffs(id, diffs[start:end]) + + if c.tc.traceScoring(known.s.origin) { + c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end])) + } + if distance < 0 { // If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match. if c.tc.traceScoring(known.s.origin) { @@ -126,7 +132,7 @@ func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int { // previously cached. prevText := "" prevDelete := "" - for _, diff := range diffs { + for i, diff := range diffs { text := diff.Text switch diff.Type { case diffmatchpatch.DiffInsert: @@ -169,6 +175,16 @@ func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int { if strings.HasPrefix(id, k) { for _, p := range ps { if strings.Index(text, p) != -1 { + // Check to make sure there isn't a corresponding diff for this + // insert that also contains the text. This prevents against diff + // blocks that are too big and force a false hit on this check, + // which usually happens with URLs since they are stored in one + // token but can happen in other cases as well. We don't look just + // for delete diffs because the subsequent text may reference the + // content in case a URL was truncated. + if i+1 < len(diffs) && strings.Index(diffs[i+1].Text, p) != -1 { + continue + } return introducedPhraseChange } } |