// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"fmt"
	"hash/crc32"
	"math"
	"sort"

	"github.com/davecgh/go-spew/spew"
)

// searchSet is a set of q-grams that have hashes associated with them,
// making it fast to search for potential matches.
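//
// For example, with q = 3 the token stream "permission is hereby granted"
// produces the q-grams "permission is hereby" and "is hereby granted", each
// indexed by its CRC-32 checksum (the tokens here are illustrative only).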
type searchSet struct {
	// Tokens is a tokenized list of the original input string.
	Tokens []indexedToken
	// Hashes is a map of checksums to a range of tokens.
	Hashes hash
	// Checksums is the list of q-gram checksums, in the order the q-grams
	// appear in the tokenized text.
	Checksums []uint32
	// ChecksumRanges are the token ranges for the above checksums.
	ChecksumRanges tokenRanges
	origin         string // A debugging identifier to label what this searchset is associated with

	nodes []*node
	q     int // The length of q-grams in this searchset.
}

// node consists of a range of tokens along with the checksum for those tokens.
type node struct {
	checksum uint32
	tokens   *tokenRange
}

func (n *node) String() string {
	return fmt.Sprintf("[%d:%d]", n.tokens.Start, n.tokens.End)
}

// newSearchSet creates a new searchSet object. A searchset generates all
// possible q-grams of tokens. These q-grams of tokens can be correlated to
// determine where a section of text from one source may appear in another
// source.
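//
// A minimal usage sketch (srcDoc and tgtDoc stand for indexedDocuments built
// elsewhere in this package, and q = 4 is purely illustrative):
//
//	srcSet := newSearchSet(srcDoc, 4)
//	tgtSet := newSearchSet(tgtDoc, 4)
//	ranges := c.findPotentialMatches(srcSet, tgtSet, 0.8) // c is a *Classifier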
func newSearchSet(s *indexedDocument, q int) *searchSet {
	// Start generating hash values for all q-grams within the text.
	h := make(hash)
	if len(s.Tokens) < q {
		// q can't be larger than the number of tokens.
		q = len(s.Tokens)
	}
	checksums, tokenRanges := generateHashes(h, q, s.Tokens, s.dict)
	sset := &searchSet{
		Tokens:         s.Tokens,
		Hashes:         h,
		Checksums:      checksums,
		ChecksumRanges: tokenRanges,
		q:              q,
	}
	sset.generateNodeList()
	return sset
}

// tokenRange indicates the range of tokens that map to a particular checksum.
type tokenRange struct {
	Start int
	End   int
}

func (t *tokenRange) String() string {
	return fmt.Sprintf("[%v, %v)", t.Start, t.End)
}

// tokenRanges is a sortable slice of tokenRange pointers.
type tokenRanges []*tokenRange

// generateHashes computes a hash using CRC-32 for each q-gram encountered in the provided tokens.
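//
// Each q-gram is hashed by writing its words to the CRC-32, every word
// followed by a single space; e.g. the 2-gram over the tokens "apache" and
// "license" hashes the bytes "apache license " (tokens illustrative only).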
func generateHashes(h hash, q int, toks []indexedToken, dict *dictionary) ([]uint32, tokenRanges) {
	if q == 0 {
		return nil, nil
	}
	var css []uint32
	var tr tokenRanges
	crc := crc32.NewIEEE()
	for offset := 0; offset+q <= len(toks); offset++ {
		crc.Reset()
		for i := 0; i < q; i++ {
			crc.Write([]byte(dict.getWord(toks[offset+i].ID)))
			crc.Write([]byte{' '})
		}
		cs := crc.Sum32()
		css = append(css, cs)
		tr = append(tr, &tokenRange{offset, offset + q})
		h.add(cs, offset, offset+q)
	}

	return css, tr
}

// generateNodeList creates a node list out of the search set.
func (s *searchSet) generateNodeList() {
	if len(s.Tokens) == 0 {
		return
	}

	for i := 0; i < len(s.Checksums); i++ {
		s.nodes = append(s.nodes, &node{
			checksum: s.Checksums[i],
			tokens:   s.ChecksumRanges[i],
		})
	}
}

// matchRange is the range within the source text that is a match to the range
// in the target text.
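//
// For example, SrcStart=10 and TargetStart=110 indicate that the matched
// source tokens starting at index 10 line up with the target tokens starting
// at index 110, i.e. the occurrence is shifted by an offset of +100.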
type matchRange struct {
	// Offsets into the source tokens.
	SrcStart, SrcEnd int
	// Offsets into the target tokens.
	TargetStart, TargetEnd int
	// TokensClaimed tracks the number of positively matched tokens in this
	// range.  For initially created matchRanges, this is equal to the extent of
	// the range.  However, as matchRanges get merged together and error is
	// potentially introduced, this tracks the precise number of tokens that
	// exist in the range.
	TokensClaimed int
}

// in reports whether m's target range is entirely enclosed within other's
// target range.
func (m *matchRange) in(other *matchRange) bool {
	return m.TargetStart >= other.TargetStart && m.TargetEnd <= other.TargetEnd
}

func (m *matchRange) String() string {
	return fmt.Sprintf("S: [%v, %v)-> T: [%v, %v) => %v [%v]", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd, m.TargetStart-m.SrcStart, m.TokensClaimed)
}

// matchRanges is a list of "matchRange"s. The ranges are monotonically
// increasing in value and indicate a single potential occurrence of the source
// text in the target text. They are sorted to support greedy matching with the
// longest runs of q-grams appearing first in the sort.
type matchRanges []*matchRange

func (m matchRanges) Len() int      { return len(m) }
func (m matchRanges) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
func (m matchRanges) Less(i, j int) bool {
	if m[i].TokensClaimed != m[j].TokensClaimed {
		return m[i].TokensClaimed > m[j].TokensClaimed
	}

	if m[i].TargetStart != m[j].TargetStart {
		return m[i].TargetStart < m[j].TargetStart
	}
	return m[i].SrcStart < m[j].SrcStart
}

// findPotentialMatches returns the ranges in the target (unknown) text that
// are best potential matches to the source (known) text.
func (c *Classifier) findPotentialMatches(src, target *searchSet, confidence float64) matchRanges {
	matchedRanges := c.getMatchedRanges(src, target, confidence, src.q)
	if c.tc.traceSearchset(src.origin) {
		c.tc.trace("matchedRanges = %s", spew.Sdump(matchedRanges))
	}
	if len(matchedRanges) == 0 {
		return nil
	}

	// After computing all potential matches, we only output ranges that contain
	// enough tokens to clear the confidence threshold. As noted, this number can
	// be too high, yielding false positives, but cannot yield false negatives.
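	// For example, a 500-token source at confidence 0.8 needs a range claiming
	// at least 400 tokens; since matchedRanges is sorted by TokensClaimed, we
	// can trim the slice at the first entry that falls below the threshold.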
	threshold := int(confidence * float64(len(src.Tokens)))

	for i, m := range matchedRanges {
		if m.TokensClaimed < threshold {
			matchedRanges = matchedRanges[:i]
			break
		}
	}

	if c.tc.traceSearchset(src.origin) {
		c.tc.trace("finalized matchedRanges for %s: %d = %s", src.origin, len(src.Tokens), spew.Sdump(matchedRanges))
	}
	return matchedRanges
}

// fuseRanges analyzes the source matches, attempting to combine hits without
// errors into larger hits with tolerable amounts of error to produce matches
// that contain enough tokens to be considered for exact matching against a
// target document. This routine intentionally does not accurately track error
// contributions from merging runs, trading false positives (but not false
// negatives) for faster performance.
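//
// For example, a 400-token source at confidence 0.9 yields an errorMargin of
// round(400 * 0.1) = 40 tokens of tolerated offset drift between merged hits.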
func (c *Classifier) fuseRanges(origin string, matched matchRanges, confidence float64, size int, runs []matchRange, targetSize int) matchRanges {
	var claimed matchRanges
	errorMargin := int(math.Round(float64(size) * (1.0 - confidence)))

	filter := make([]bool, targetSize)
	for _, m := range runs {
		for i := m.SrcStart; i < m.SrcEnd; i++ {
			// Only consider these offsets if they fit into the target document, since
			// the target may be smaller than the source being analyzed
			if i < targetSize {
				filter[i] = true
			}
		}
	}

	filterDrops := 0
	filterPasses := 0

	// For each hit detected, compare it against all previous hits to see if it can be part of a match
	// or represents a group that is eligible for matching and having other hits contribute to it.
	for _, m := range matched {
		off := m.TargetStart - m.SrcStart

		// If the offset is negative, but within error margins, we associate it
		// with the first index to see if it could contribute to a run. If the
		// absolute offset is larger than the error margin, it can't possibly
		// contribute and will be dropped. This puts more potential error into the zero
		// index, but that just slightly increases the rate of false positives. In
		// practice, this would only be an issue if there are major substrings of a
		// source in a target that aren't part of a real hit. We see many small
		// references (the name of a license) but not large chunks of the license.
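		// For example, a hit with SrcStart=30 and TargetStart=5 has off = -25:
		// with errorMargin >= 25 it is folded into offset 0, otherwise dropped.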
		if off < 0 {
			if -off <= errorMargin {
				off = 0
			} else {
				continue
			}
		}

		// If the filter is false, there was not sufficient token density in that
		// part of the target document for a viable match, so this match is a
		// spurious hit and can be discarded.
		if !filter[off] {
			filterDrops++
			continue
		}

		filterPasses++
		unclaimed := true

		for _, c := range claimed {
			moff := m.TargetStart - m.SrcStart
			coff := c.TargetStart - c.SrcStart
			sampleError := int(math.Round(math.Abs(float64(moff - coff))))
			withinError := sampleError < errorMargin

			// The contribution needs to add more value than accumulated error. This prevents
			// against spurious matches of a reference to a license incorrectly overextending the
			// match range.
			if withinError && m.TokensClaimed > int(sampleError) {
				if m.in(c) {
					// This can cause too many tokens to be claimed, but that's fine since we want to avoid
					// undercounting and missing content.
					c.TokensClaimed += m.TokensClaimed
					unclaimed = false
				} else {
					// See if the claims can be merged. If the error tolerances allow for it,
					// we merge the claims and the ranges. This doesn't accumulate error
					// accurately, so repeated merges may introduce too much error to
					// actually make a match, but the approach won't produce false
					// negatives. Even when the tolerances allow a merge, we only merge if
					// it increases the range of tokens being covered; if this match is
					// already handled by an existing (stronger by definition) claim, we
					// don't merge it, but treat it as a new claim. This allows a highly
					// fragmented text to be matched by a long series of low-score matches.
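					// For example, if c covers target [100, 200) and m covers [80, 120) at a
					// compatible offset, m extends c leftwards to cover [80, 200).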
					if m.TargetStart < c.TargetStart && m.SrcStart < c.SrcStart {
						c.TargetStart = m.TargetStart
						c.SrcStart = m.SrcStart
						c.TokensClaimed += m.TokensClaimed
						unclaimed = false
					} else if m.TargetEnd > c.TargetEnd && m.SrcEnd > c.SrcEnd {
						c.TargetEnd = m.TargetEnd
						c.SrcEnd = m.SrcEnd
						c.TokensClaimed += m.TokensClaimed
						unclaimed = false
					}
					// This claim does not extend any existing block, and it may be claimed in its own
					// right.
				}
			}
			if !unclaimed {
				break
			}
		}
		// Only create a claim if this claim is likely relevant. If we had some higher quality hits,
		// it's likely this is spurious noise. If we haven't had any significantly better hits, we'll keep
		// this around.
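		// For example, if the best hit claimed 600 tokens, a new standalone claim
		// must itself claim more than 60 tokens to be kept.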
		if unclaimed && m.TokensClaimed*10 > matched[0].TokensClaimed {
			claimed = append(claimed, m)
		}
	}
	sort.Sort(claimed)
	if c.tc.traceSearchset(origin) {
		c.tc.trace("filterPasses = %+v", filterPasses)
		c.tc.trace("filterDrops = %+v", filterDrops)
		c.tc.trace("claimed = %s", spew.Sdump(claimed))
	}
	return claimed
}

// getMatchedRanges finds the ranges in the target text that match the source
// text. The ranges returned are ordered from the entries with the most matched
// tokens to the least.
func (c *Classifier) getMatchedRanges(src, target *searchSet, confidence float64, q int) matchRanges {
	shouldTrace := c.tc.traceSearchset(src.origin)

	if shouldTrace {
		c.tc.trace("src.origin = %+v", src.origin)
	}
	// Assemble a list of all the matched q-grams without any consideration to
	// error tolerances.
	matched := targetMatchedRanges(src, target)
	if shouldTrace {
		c.tc.trace("matched = %s", spew.Sdump(matched))
	}
	if len(matched) == 0 {
		return nil
	}

	// Perform neighborhood matching to figure out which clusters of q-grams have
	// sufficient likelihood to be a potential match to the source. For a
	// confidence threshold of X (X <= 1.0), a sequence of N target tokens must
	// contain at least N*X ordered source tokens to be a viable match.
	//
	// It's much easier to compute potential ranges in the target if we disregard
	// proper ordering of source tokens initially, and see which source q-grams
	// appear in sufficient quantities to be a potential match. We can then
	// disregard any q-gram that falls outside of that range. This helps
	// significantly since processing token matches is an N^2 (or worse)
	// operation, so reducing N is a big win.
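	// For example, at confidence 0.9 and a 200-token source, a window of 200
	// consecutive target positions needs q-gram hits in at least 180 of them
	// (regardless of order) before matches in that neighborhood are considered
	// further.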

	runs := c.detectRuns(src.origin, matched, len(target.Tokens), len(src.Tokens), confidence, q)

	if shouldTrace {
		c.tc.trace("runs = %d: %s", len(runs), spew.Sdump(runs))
	}

	// If there are no target runs of source tokens, we're done.
	if len(runs) == 0 {
		return nil
	}

	// Using the runs as a filter to ignore irrelevant matches, fuse the source
	// match ranges into larger matches (with possible errors) to see if we can
	// produce large enough runs that pass the confidence threshold.

	fr := c.fuseRanges(src.origin, matched, confidence, len(src.Tokens), runs, len(target.Tokens))
	if shouldTrace {
		c.tc.trace("fr = %s", spew.Sdump(fr))
	}
	return fr
}

func (c *Classifier) detectRuns(origin string, matched matchRanges, targetLength, subsetLength int, threshold float64, q int) []matchRange {
	shouldTrace := c.tc.traceSearchset(origin)
	hits := make([]bool, targetLength)
	for _, m := range matched {
		for idx := m.TargetStart; idx < m.TargetEnd; idx++ {
			hits[idx] = true
		}
	}

	if len(hits) == 0 {
		return nil
	}
	var out []int

	total := 0
	target := int(float64(subsetLength) * threshold)
	if shouldTrace {
		c.tc.trace("target = %+v", target)
		c.tc.trace("targetLength = %+v", targetLength)
		c.tc.trace("subsetLength = %+v", subsetLength)
	}

	// If we don't have at least 1 subset (i.e. the target is shorter than the
	// source) just analyze what we have.
	if len(hits) < subsetLength {
		if shouldTrace {
			c.tc.trace("trimmed search length from %d to %d", subsetLength, len(hits))
		}
		subsetLength = len(hits)
	}
	// Initialize our sliding window value.
	for i := 0; i < subsetLength; i++ {
		if hits[i] {
			total++
		}
	}

	if total >= target {
		out = append(out, 0)
	}

	// Now move through the window adjusting the total by subtracting out the
	// last bit and adding in the new bit.
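	// For example, with subsetLength = 4, at i = 1 the window covers hits[1:5):
	// hits[0] leaves the window and hits[4] (if in range) enters it.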
	for i := 1; i < len(hits); i++ {
		if hits[i-1] {
			total--
		}
		end := i + subsetLength - 1
		if end < len(hits) && hits[end] {
			total++
		}
		if total >= target {
			out = append(out, i)
		}
	}
	if len(out) == 0 {
		return nil
	}

	final := []matchRange{
		{
			SrcStart: out[0],
			SrcEnd:   out[0] + q,
		},
	}
	for i := 1; i < len(out); i++ {
		if out[i] != 1+out[i-1] {
			final = append(final, matchRange{
				SrcStart: out[i],
				SrcEnd:   out[i] + q,
			})
		} else {
			final[len(final)-1].SrcEnd = out[i] + q
		}
	}

	return final
}

// targetMatchedRanges returns all locations where source q-grams occur in the
// target, merging runs of consecutive q-grams that share the same
// source-to-target offset into single match ranges.
func targetMatchedRanges(src, target *searchSet) matchRanges {
	offsetMappings := make(map[int][]*matchRange)

	var matched matchRanges
	for _, tgtNode := range target.nodes {
		sr, ok := src.Hashes[tgtNode.checksum]
		if !ok {
			continue
		}

		tv := tgtNode.tokens
		for _, sv := range sr {
			offset := tv.Start - sv.Start
			if om, ok := offsetMappings[offset]; ok {
				// See if this extends the most recent existing mapping
				lastIdx := len(om) - 1
				if om[lastIdx].TargetEnd == tv.End-1 {
					// This new value extends. Update the value in place
					om[lastIdx].SrcEnd = sv.End
					om[lastIdx].TargetEnd = tv.End
					continue
				}
			}
			offsetMappings[offset] = append(offsetMappings[offset], &matchRange{
				SrcStart:    sv.Start,
				SrcEnd:      sv.End,
				TargetStart: tv.Start,
				TargetEnd:   tv.End,
			})
		}
	}

	// Compute the number of tokens claimed in each run and flatten into a single slice.
	for _, mr := range offsetMappings {
		for _, m := range mr {
			m.TokensClaimed = m.TargetEnd - m.TargetStart
		}
		matched = append(matched, mr...)
	}
	sort.Sort(matched)
	return matched
}

// hash maps the checksum of a q-gram to the token ranges that produced it.
type hash map[uint32]tokenRanges

// add records that the token range [start, end) produced the given checksum.
func (h hash) add(checksum uint32, start, end int) {
	h[checksum] = append(h[checksum], &tokenRange{Start: start, End: end})
}