about summary refs log tree commit diff
path: root/stringclassifier
diff options
context:
space:
mode:
author    Google Open Source <noreply+opensource@google.com>  2018-10-08 19:29:53 -0400
committer Christopher Schmidt <crschmidt@google.com>          2018-10-10 14:56:59 -0400
commit   c642cb3db2d241043a145c4c8ec498ab1422d892 (patch)
tree     6026db41fe30c48620da25090f0ae1df4cf6fbd0 /stringclassifier
parent   6bb0d9bc566e140485490f77ee95fe3c5f4d13ea (diff)
download licenseclassifier-c642cb3db2d241043a145c4c8ec498ab1422d892.tar.gz
Change targetMatchedRanges to use algorithm not needing O(n^2) "in" predicate.
Tested: TAP: http://test/OCL:215850556:BASE:216265822:1539039513028:26bcce40 PiperOrigin-RevId: 216271196
Diffstat (limited to 'stringclassifier')
-rw-r--r--  stringclassifier/searchset/searchset.go            132
-rw-r--r--  stringclassifier/searchset/tokenizer/tokenizer.go    5
2 files changed, 106 insertions, 31 deletions
diff --git a/stringclassifier/searchset/searchset.go b/stringclassifier/searchset/searchset.go
index bdbca30..2f443e5 100644
--- a/stringclassifier/searchset/searchset.go
+++ b/stringclassifier/searchset/searchset.go
@@ -143,6 +143,10 @@ func (m *MatchRange) in(start, end int) bool {
return start >= m.TargetStart && end <= m.TargetEnd
}
+func (m *MatchRange) String() string {
+ return fmt.Sprintf("[%v, %v)->[%v, %v)", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd)
+}
+
// MatchRanges is a list of "MatchRange"s. The ranges are monotonically
// increasing in value and indicate a single potential occurrence of the source
// text in the target text.
@@ -164,16 +168,6 @@ func (m MatchRanges) TargetRange(target *SearchSet) (start, end int) {
return start, end
}
-// in returns true if the start and end are enclosed in one of the match ranges.
-func (m MatchRanges) in(start, end int) bool {
- for _, val := range m {
- if val.in(start, end) {
- return true
- }
- }
- return false
-}
-
// Size is the number of source tokens that were matched.
func (m MatchRanges) Size() int {
sum := 0
@@ -212,40 +206,116 @@ func getMatchedRanges(src, target *SearchSet) []MatchRanges {
return mergeConsecutiveRanges(matchedRanges)
}
-// targetMatchedRanges goes through the source and finds all matches in the target.
+func extendsAny(tr tokenizer.TokenRanges, mr []MatchRanges) bool {
+ if len(mr) == 0 {
+ return false
+ }
+ for _, tv := range tr {
+ for _, mv := range mr {
+ if tv.Start >= mv[0].TargetStart && tv.Start <= mv[len(mv)-1].TargetEnd {
+ return true
+ }
+ }
+ }
+ return false
+}
+
+// targetMatchedRanges finds matching sequences in target and src ordered by target position
func targetMatchedRanges(src, target *SearchSet) MatchRanges {
if src.nodes == nil {
return nil
}
var matched MatchRanges
- for _, srcNode := range src.nodes {
- tr, ok := target.Hashes[srcNode.checksum]
+ var previous *node
+ var possible []MatchRanges
+ for _, tgtNode := range target.nodes {
+ sr, ok := src.Hashes[tgtNode.checksum]
+ if !ok || (previous != nil && tgtNode.tokens.Start > previous.tokens.End) || !extendsAny(sr, possible) {
+ for _, r := range possible {
+ matched = append(matched, r...)
+ }
+ possible = possible[:0]
+ previous = nil
+ }
if !ok {
- // There isn't a match in the target.
+ // There isn't a match in the source.
continue
}
- // Go over the set of target ranges that are potential matches.
- for _, tv := range tr {
- if matched.in(tv.Start, tv.End) {
- // Matched within a larger range. Ignore.
+ // Maps index within `possible` to the slice of ranges extended by a new range
+ extended := make(map[int]*MatchRanges)
+ // Go over the set of source ranges growing lists of `possible` match ranges.
+ tv := tgtNode.tokens
+ for _, sv := range sr {
+ r := &MatchRange{
+ SrcStart: sv.Start,
+ SrcEnd: sv.End,
+ TargetStart: tv.Start,
+ TargetEnd: tv.End,
+ }
+ found := false
+ // Grow or extend each abutting `possible` match range.
+ for i, p := range possible {
+ last := p[len(p)-1]
+ if sv.Start >= last.SrcStart && sv.Start <= last.SrcEnd && tv.Start >= last.TargetStart && tv.Start <= last.TargetEnd {
+ found = true
+ possible[i] = append(possible[i], r)
+ extended[i] = &possible[i]
+ }
+ }
+ if !found {
+ // Did not abut any existing ranges, start a new `possible` match range.
+ mrs := make(MatchRanges, 0, 2)
+ mrs = append(mrs, r)
+ possible = append(possible, mrs)
+ extended[len(possible)-1] = &possible[len(possible)-1]
+ }
+ }
+ if len(extended) < len(possible) {
+ // Ranges not extended--add to `matched` if not included in other range.
+ for i := 0; i < len(possible); {
+ _, updated := extended[i]
+ if updated {
+ i++ // Keep in `possible` and advance to next index.
+ continue
+ }
+ p1 := possible[i]
+ found := false // whether found as subrange of another `possible` match.
+ for _, p2 := range extended {
+ if p1[0].SrcStart >= (*p2)[0].SrcStart && p1[0].TargetStart >= (*p2)[0].TargetStart {
+ found = true
+ break
+ }
+ }
+ if !found {
+ matched = append(matched, p1...)
+ } // else included in other match.
+ // Finished -- delete from `possible` and continue from same index.
+ possible = append(possible[:i], possible[i+1:]...)
+ }
+ }
+ previous = tgtNode
+ }
+ // At end of file, terminate all `possible` match ranges.
+ for i := 0; i < len(possible); i++ {
+ p1 := possible[i]
+ found := false // whether found as subrange of another `possible` match.
+ for j := i + 1; j < len(possible); {
+ p2 := possible[j]
+ if p1[0].SrcStart <= p2[0].SrcStart && p1[0].TargetStart <= p2[0].TargetStart {
+ // Delete later sub-ranges included in this range.
+ possible = append(possible[:j], possible[j+1:]...)
continue
}
-
- // The matched sections from the source (S) should be
- // in the same order as in the target (T). Thus if
- // chunks A and C from S match in T at positions X and
- // Z, then chunk B, which is between A and C in S,
- // should be in a position between X and Z.
- for _, sv := range src.Hashes[srcNode.checksum] {
- matched = append(matched, &MatchRange{
- SrcStart: sv.Start,
- SrcEnd: sv.End,
- TargetStart: tv.Start,
- TargetEnd: tv.End,
- })
+ // Skip if subrange of a later range
+ if p1[0].SrcStart >= p2[0].SrcStart && p1[0].TargetStart >= p2[0].TargetStart {
+ found = true
}
+ j++
+ }
+ if !found {
+ matched = append(matched, p1...)
}
}
return matched
diff --git a/stringclassifier/searchset/tokenizer/tokenizer.go b/stringclassifier/searchset/tokenizer/tokenizer.go
index e207e14..0f842d8 100644
--- a/stringclassifier/searchset/tokenizer/tokenizer.go
+++ b/stringclassifier/searchset/tokenizer/tokenizer.go
@@ -17,6 +17,7 @@ package tokenizer
import (
"bytes"
+ "fmt"
"hash/crc32"
"sort"
"unicode"
@@ -115,6 +116,10 @@ type TokenRange struct {
End int
}
+func (t *TokenRange) String() string {
+ return fmt.Sprintf("[%v, %v)", t.Start, t.End)
+}
+
// TokenRanges is a list of TokenRange objects. The chance that two different
// strings map to the same checksum is very small, but unfortunately isn't
// zero, so we use this instead of making the assumption that they will all be