about summary refs log tree commit diff
path: root/stringclassifier
diff options
context:
space:
mode:
author    Google Open Source <noreply+opensource@google.com>  2018-10-08 19:29:53 -0400
committer Christopher Schmidt <crschmidt@google.com>          2018-10-10 14:56:59 -0400
commit   c642cb3db2d241043a145c4c8ec498ab1422d892 (patch)
tree     6026db41fe30c48620da25090f0ae1df4cf6fbd0 /stringclassifier
parent   6bb0d9bc566e140485490f77ee95fe3c5f4d13ea (diff)
download licenseclassifier-c642cb3db2d241043a145c4c8ec498ab1422d892.tar.gz
Change targetMatchedRanges to use algorithm not needing O(n^2) "in" predicate.
Tested: TAP: http://test/OCL:215850556:BASE:216265822:1539039513028:26bcce40 PiperOrigin-RevId: 216271196
Diffstat (limited to 'stringclassifier')
-rw-r--r--  stringclassifier/searchset/searchset.go            132
-rw-r--r--  stringclassifier/searchset/tokenizer/tokenizer.go    5
2 files changed, 106 insertions, 31 deletions
diff --git a/stringclassifier/searchset/searchset.go b/stringclassifier/searchset/searchset.go
index bdbca30..2f443e5 100644
--- a/stringclassifier/searchset/searchset.go
+++ b/stringclassifier/searchset/searchset.go
@@ -143,6 +143,10 @@ func (m *MatchRange) in(start, end int) bool {
return start >= m.TargetStart && end <= m.TargetEnd
}
+func (m *MatchRange) String() string {
+ return fmt.Sprintf("[%v, %v)->[%v, %v)", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd)
+}
+
// MatchRanges is a list of "MatchRange"s. The ranges are monotonically
// increasing in value and indicate a single potential occurrence of the source
// text in the target text.
@@ -164,16 +168,6 @@ func (m MatchRanges) TargetRange(target *SearchSet) (start, end int) {
return start, end
}
-// in returns true if the start and end are enclosed in one of the match ranges.
-func (m MatchRanges) in(start, end int) bool {
- for _, val := range m {
- if val.in(start, end) {
- return true
- }
- }
- return false
-}
-
// Size is the number of source tokens that were matched.
func (m MatchRanges) Size() int {
sum := 0
@@ -212,40 +206,116 @@ func getMatchedRanges(src, target *SearchSet) []MatchRanges {
return mergeConsecutiveRanges(matchedRanges)
}
-// targetMatchedRanges goes through the source and finds all matches in the target.
+func extendsAny(tr tokenizer.TokenRanges, mr []MatchRanges) bool {
+ if len(mr) == 0 {
+ return false
+ }
+ for _, tv := range tr {
+ for _, mv := range mr {
+ if tv.Start >= mv[0].TargetStart && tv.Start <= mv[len(mv)-1].TargetEnd {
+ return true
+ }
+ }
+ }
+ return false
+}
+
+// targetMatchedRanges finds matching sequences in target and src ordered by target position
func targetMatchedRanges(src, target *SearchSet) MatchRanges {
if src.nodes == nil {
return nil
}
var matched MatchRanges
- for _, srcNode := range src.nodes {
- tr, ok := target.Hashes[srcNode.checksum]
+ var previous *node
+ var possible []MatchRanges
+ for _, tgtNode := range target.nodes {
+ sr, ok := src.Hashes[tgtNode.checksum]
+ if !ok || (previous != nil && tgtNode.tokens.Start > previous.tokens.End) || !extendsAny(sr, possible) {
+ for _, r := range possible {
+ matched = append(matched, r...)
+ }
+ possible = possible[:0]
+ previous = nil
+ }
if !ok {
- // There isn't a match in the target.
+ // There isn't a match in the source.
continue
}
- // Go over the set of target ranges that are potential matches.
- for _, tv := range tr {
- if matched.in(tv.Start, tv.End) {
- // Matched within a larger range. Ignore.
+ // Maps index within `possible` to the slice of ranges extended by a new range
+ extended := make(map[int]*MatchRanges)
+ // Go over the set of source ranges growing lists of `possible` match ranges.
+ tv := tgtNode.tokens
+ for _, sv := range sr {
+ r := &MatchRange{
+ SrcStart: sv.Start,
+ SrcEnd: sv.End,
+ TargetStart: tv.Start,
+ TargetEnd: tv.End,
+ }
+ found := false
+ // Grow or extend each abutting `possible` match range.
+ for i, p := range possible {
+ last := p[len(p)-1]
+ if sv.Start >= last.SrcStart && sv.Start <= last.SrcEnd && tv.Start >= last.TargetStart && tv.Start <= last.TargetEnd {
+ found = true
+ possible[i] = append(possible[i], r)
+ extended[i] = &possible[i]
+ }
+ }
+ if !found {
+ // Did not abut any existing ranges, start a new `possible` match range.
+ mrs := make(MatchRanges, 0, 2)
+ mrs = append(mrs, r)
+ possible = append(possible, mrs)
+ extended[len(possible)-1] = &possible[len(possible)-1]
+ }
+ }
+ if len(extended) < len(possible) {
+ // Ranges not extended--add to `matched` if not included in other range.
+ for i := 0; i < len(possible); {
+ _, updated := extended[i]
+ if updated {
+ i++ // Keep in `possible` and advance to next index.
+ continue
+ }
+ p1 := possible[i]
+ found := false // whether found as subrange of another `possible` match.
+ for _, p2 := range extended {
+ if p1[0].SrcStart >= (*p2)[0].SrcStart && p1[0].TargetStart >= (*p2)[0].TargetStart {
+ found = true
+ break
+ }
+ }
+ if !found {
+ matched = append(matched, p1...)
+ } // else included in other match.
+ // Finished -- delete from `possible` and continue from same index.
+ possible = append(possible[:i], possible[i+1:]...)
+ }
+ }
+ previous = tgtNode
+ }
+ // At end of file, terminate all `possible` match ranges.
+ for i := 0; i < len(possible); i++ {
+ p1 := possible[i]
+ found := false // whether found as subrange of another `possible` match.
+ for j := i + 1; j < len(possible); {
+ p2 := possible[j]
+ if p1[0].SrcStart <= p2[0].SrcStart && p1[0].TargetStart <= p2[0].TargetStart {
+ // Delete later sub-ranges included in this range.
+ possible = append(possible[:j], possible[j+1:]...)
continue
}
-
- // The matched sections from the source (S) should be
- // in the same order as in the target (T). Thus if
- // chunks A and C from S match in T at positions X and
- // Z, then chunk B, which is between A and C in S,
- // should be in a position between X and Z.
- for _, sv := range src.Hashes[srcNode.checksum] {
- matched = append(matched, &MatchRange{
- SrcStart: sv.Start,
- SrcEnd: sv.End,
- TargetStart: tv.Start,
- TargetEnd: tv.End,
- })
+ // Skip if subrange of a later range
+ if p1[0].SrcStart >= p2[0].SrcStart && p1[0].TargetStart >= p2[0].TargetStart {
+ found = true
}
+ j++
+ }
+ if !found {
+ matched = append(matched, p1...)
}
}
return matched
diff --git a/stringclassifier/searchset/tokenizer/tokenizer.go b/stringclassifier/searchset/tokenizer/tokenizer.go
index e207e14..0f842d8 100644
--- a/stringclassifier/searchset/tokenizer/tokenizer.go
+++ b/stringclassifier/searchset/tokenizer/tokenizer.go
@@ -17,6 +17,7 @@ package tokenizer
import (
"bytes"
+ "fmt"
"hash/crc32"
"sort"
"unicode"
@@ -115,6 +116,10 @@ type TokenRange struct {
End int
}
+func (t *TokenRange) String() string {
+ return fmt.Sprintf("[%v, %v)", t.Start, t.End)
+}
+
// TokenRanges is a list of TokenRange objects. The chance that two different
// strings map to the same checksum is very small, but unfortunately isn't
// zero, so we use this instead of making the assumption that they will all be