diff options
author | Google Open Source <noreply+opensource@google.com> | 2018-10-08 19:29:53 -0400 |
---|---|---|
committer | Christopher Schmidt <crschmidt@google.com> | 2018-10-10 14:56:59 -0400 |
commit | c642cb3db2d241043a145c4c8ec498ab1422d892 (patch) | |
tree | 6026db41fe30c48620da25090f0ae1df4cf6fbd0 /stringclassifier | |
parent | 6bb0d9bc566e140485490f77ee95fe3c5f4d13ea (diff) | |
download | licenseclassifier-c642cb3db2d241043a145c4c8ec498ab1422d892.tar.gz |
Change targetMatchedRanges to use algorithm not needing O(n^2) "in" predicate.
Tested:
TAP: http://test/OCL:215850556:BASE:216265822:1539039513028:26bcce40
PiperOrigin-RevId: 216271196
Diffstat (limited to 'stringclassifier')
-rw-r--r-- | stringclassifier/searchset/searchset.go | 132 | ||||
-rw-r--r-- | stringclassifier/searchset/tokenizer/tokenizer.go | 5 |
2 files changed, 106 insertions, 31 deletions
diff --git a/stringclassifier/searchset/searchset.go b/stringclassifier/searchset/searchset.go index bdbca30..2f443e5 100644 --- a/stringclassifier/searchset/searchset.go +++ b/stringclassifier/searchset/searchset.go @@ -143,6 +143,10 @@ func (m *MatchRange) in(start, end int) bool { return start >= m.TargetStart && end <= m.TargetEnd } +func (m *MatchRange) String() string { + return fmt.Sprintf("[%v, %v)->[%v, %v)", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd) +} + // MatchRanges is a list of "MatchRange"s. The ranges are monotonically // increasing in value and indicate a single potential occurrence of the source // text in the target text. @@ -164,16 +168,6 @@ func (m MatchRanges) TargetRange(target *SearchSet) (start, end int) { return start, end } -// in returns true if the start and end are enclosed in one of the match ranges. -func (m MatchRanges) in(start, end int) bool { - for _, val := range m { - if val.in(start, end) { - return true - } - } - return false -} - // Size is the number of source tokens that were matched. func (m MatchRanges) Size() int { sum := 0 @@ -212,40 +206,116 @@ func getMatchedRanges(src, target *SearchSet) []MatchRanges { return mergeConsecutiveRanges(matchedRanges) } -// targetMatchedRanges goes through the source and finds all matches in the target. +func extendsAny(tr tokenizer.TokenRanges, mr []MatchRanges) bool { + if len(mr) == 0 { + return false + } + for _, tv := range tr { + for _, mv := range mr { + if tv.Start >= mv[0].TargetStart && tv.Start <= mv[len(mv)-1].TargetEnd { + return true + } + } + } + return false +} + +// targetMatchedRanges finds matching sequences in target and src ordered by target position func targetMatchedRanges(src, target *SearchSet) MatchRanges { if src.nodes == nil { return nil } var matched MatchRanges - for _, srcNode := range src.nodes { - tr, ok := target.Hashes[srcNode.checksum] + var previous *node + var possible []MatchRanges + for _, tgtNode := range target.nodes { + sr, ok := src.Hashes[tgtNode.checksum] + if !ok || (previous != nil && tgtNode.tokens.Start > previous.tokens.End) || !extendsAny(sr, possible) { + for _, r := range possible { + matched = append(matched, r...) + } + possible = possible[:0] + previous = nil + } if !ok { - // There isn't a match in the target. + // There isn't a match in the source. continue } - // Go over the set of target ranges that are potential matches. - for _, tv := range tr { - if matched.in(tv.Start, tv.End) { - // Matched within a larger range. Ignore. + // Maps index within `possible` to the slice of ranges extended by a new range + extended := make(map[int]*MatchRanges) + // Go over the set of source ranges growing lists of `possible` match ranges. + tv := tgtNode.tokens + for _, sv := range sr { + r := &MatchRange{ + SrcStart: sv.Start, + SrcEnd: sv.End, + TargetStart: tv.Start, + TargetEnd: tv.End, + } + found := false + // Grow or extend each abutting `possible` match range. + for i, p := range possible { + last := p[len(p)-1] + if sv.Start >= last.SrcStart && sv.Start <= last.SrcEnd && tv.Start >= last.TargetStart && tv.Start <= last.TargetEnd { + found = true + possible[i] = append(possible[i], r) + extended[i] = &possible[i] + } + } + if !found { + // Did not abut any existing ranges, start a new `possible` match range. + mrs := make(MatchRanges, 0, 2) + mrs = append(mrs, r) + possible = append(possible, mrs) + extended[len(possible)-1] = &possible[len(possible)-1] + } + } + if len(extended) < len(possible) { + // Ranges not extended--add to `matched` if not included in other range. + for i := 0; i < len(possible); { + _, updated := extended[i] + if updated { + i++ // Keep in `possible` and advance to next index. + continue + } + p1 := possible[i] + found := false // whether found as subrange of another `possible` match. + for _, p2 := range extended { + if p1[0].SrcStart >= (*p2)[0].SrcStart && p1[0].TargetStart >= (*p2)[0].TargetStart { + found = true + break + } + } + if !found { + matched = append(matched, p1...) + } // else included in other match. + // Finished -- delete from `possible` and continue from same index. + possible = append(possible[:i], possible[i+1:]...) + } + } + previous = tgtNode + } + // At end of file, terminate all `possible` match ranges. + for i := 0; i < len(possible); i++ { + p1 := possible[i] + found := false // whether found as subrange of another `possible` match. + for j := i + 1; j < len(possible); { + p2 := possible[j] + if p1[0].SrcStart <= p2[0].SrcStart && p1[0].TargetStart <= p2[0].TargetStart { + // Delete later sub-ranges included in this range. + possible = append(possible[:j], possible[j+1:]...) continue } - - // The matched sections from the source (S) should be - // in the same order as in the target (T). Thus if - // chunks A and C from S match in T at positions X and - // Z, then chunk B, which is between A and C in S, - // should be in a position between X and Z. - for _, sv := range src.Hashes[srcNode.checksum] { - matched = append(matched, &MatchRange{ - SrcStart: sv.Start, - SrcEnd: sv.End, - TargetStart: tv.Start, - TargetEnd: tv.End, - }) + // Skip if subrange of a later range + if p1[0].SrcStart >= p2[0].SrcStart && p1[0].TargetStart >= p2[0].TargetStart { + found = true } + j++ + } + if !found { + matched = append(matched, p1...) } } return matched diff --git a/stringclassifier/searchset/tokenizer/tokenizer.go b/stringclassifier/searchset/tokenizer/tokenizer.go index e207e14..0f842d8 100644 --- a/stringclassifier/searchset/tokenizer/tokenizer.go +++ b/stringclassifier/searchset/tokenizer/tokenizer.go @@ -17,6 +17,7 @@ package tokenizer import ( "bytes" + "fmt" "hash/crc32" "sort" "unicode" @@ -115,6 +116,10 @@ type TokenRange struct { End int } +func (t *TokenRange) String() string { + return fmt.Sprintf("[%v, %v)", t.Start, t.End) +} + // TokenRanges is a list of TokenRange objects. The chance that two different // strings map to the same checksum is very small, but unfortunately isn't // zero, so we use this instead of making the assumption that they will all be |