// Copyright 2017 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package stringclassifier finds the nearest match between a string and a set of known values. It // uses the Levenshtein Distance (LD) algorithm to determine this. A match with a large LD is less // likely to be correct than one with a small LD. A confidence percentage is returned, which // indicates how confident the algorithm is that the match is correct. The higher the percentage, // the greater the confidence that the match is correct. // // Example Usage: // // type Text struct { // Name string // Text string // } // // func NewClassifier(knownTexts []Text) (*stringclassifier.Classifier, error) { // sc := stringclassifier.New(stringclassifier.FlattenWhitespace) // for _, known := range knownTexts { // if err := sc.AddValue(known.Name, known.Text); err != nil { // return nil, err // } // } // return sc, nil // } // // func IdentifyTexts(sc *stringclassifier.Classifier, unknownTexts []*Text) { // for _, unknown := range unknownTexts { // m := sc.NearestMatch(unknown.Text) // log.Printf("The nearest match to %q is %q (confidence: %v)", // unknown.Name, m.Name, m.Confidence) // } // } package stringclassifier import ( "fmt" "log" "math" "regexp" "sort" "sync" "github.com/google/licenseclassifier/stringclassifier/internal/pq" "github.com/google/licenseclassifier/stringclassifier/searchset" "github.com/sergi/go-diff/diffmatchpatch" ) // The diff/match/patch algorithm. 
// dmp is the shared diff/match/patch instance used to compute diffs and
// Levenshtein distances between known and unknown texts.
var dmp = diffmatchpatch.New()

const (
	// DefaultConfidenceThreshold is the minimum ratio threshold between
	// the matching range and the full source range that we're willing to
	// accept in order to say that the matching range will produce a
	// sufficiently good edit distance. I.e., if the matching range is
	// below this threshold we won't run the Levenshtein Distance algorithm
	// on it.
	DefaultConfidenceThreshold float64 = 0.80

	// defaultMinDiffRatio is the default value for Classifier.MinDiffRatio.
	defaultMinDiffRatio float64 = 0.75
)

// A Classifier matches a string to a set of known values.
type Classifier struct {
	muValues    sync.RWMutex           // guards "values"
	values      map[string]*knownValue // known values keyed by their registration key
	normalizers []NormalizeFunc        // applied in order to every string before comparison
	threshold   float64                // confidence threshold used by multipleMatch

	// MinDiffRatio defines the minimum ratio of the length difference
	// allowed to consider a known value a possible match. This is used as
	// a performance optimization to eliminate values that are unlikely to
	// be a match.
	//
	// For example, a value of 0.75 means that the shorter string must be
	// at least 75% the length of the longer string to consider it a
	// possible match.
	//
	// Setting this to 1.0 will require that strings are identical length.
	// Setting this to 0 will consider all known values as possible
	// matches.
	MinDiffRatio float64
}

// NormalizeFunc is a function that is used to normalize a string prior to comparison.
type NormalizeFunc func(string) string

// New creates a new Classifier with the provided NormalizeFuncs. Each
// NormalizeFunc is applied in order to a string before comparison.
func New(threshold float64, funcs ...NormalizeFunc) *Classifier {
	return &Classifier{
		values: make(map[string]*knownValue),
		// Copy the caller's slice so later mutation by the caller
		// cannot change the classifier's normalizer chain.
		normalizers:  append([]NormalizeFunc(nil), funcs...),
		threshold:    threshold,
		MinDiffRatio: defaultMinDiffRatio,
	}
}

// knownValue identifies a value in the corpus to match against.
type knownValue struct {
	key             string               // registration key (e.g. a license name)
	normalizedValue string               // the value after all NormalizeFuncs were applied
	reValue         *regexp.Regexp       // regexp used to locate exact occurrences of the value
	set             *searchset.SearchSet // token search set; may be nil until lazily built
}

// AddValue adds a known value to be matched against. If a value already exists
// for key, an error is returned.
func (c *Classifier) AddValue(key, value string) error { c.muValues.Lock() defer c.muValues.Unlock() if _, ok := c.values[key]; ok { return fmt.Errorf("value already registered with key %q", key) } norm := c.normalize(value) c.values[key] = &knownValue{ key: key, normalizedValue: norm, reValue: regexp.MustCompile(norm), } return nil } // AddPrecomputedValue adds a known value to be matched against. The value has // already been normalized and the SearchSet object deserialized, so no // processing is necessary. func (c *Classifier) AddPrecomputedValue(key, value string, set *searchset.SearchSet) error { c.muValues.Lock() defer c.muValues.Unlock() if _, ok := c.values[key]; ok { return fmt.Errorf("value already registered with key %q", key) } set.GenerateNodeList() c.values[key] = &knownValue{ key: key, normalizedValue: value, reValue: regexp.MustCompile(value), set: set, } return nil } // normalize a string by applying each of the registered NormalizeFuncs. func (c *Classifier) normalize(s string) string { for _, fn := range c.normalizers { s = fn(s) } return s } // Match identifies the result of matching a string against a knownValue. type Match struct { Name string // Name of knownValue that was matched Confidence float64 // Confidence percentage Offset int // The offset into the unknown string the match was made Extent int // The length from the offset into the unknown string } // Matches is a list of Match-es. This is here mainly so that the list can be // sorted. 
type Matches []*Match func (m Matches) Len() int { return len(m) } func (m Matches) Swap(i, j int) { m[i], m[j] = m[j], m[i] } func (m Matches) Less(i, j int) bool { if math.Abs(m[j].Confidence-m[i].Confidence) < math.SmallestNonzeroFloat64 { if m[i].Name == m[j].Name { if m[i].Offset > m[j].Offset { return false } if m[i].Offset == m[j].Offset { return m[i].Extent > m[j].Extent } return true } return m[i].Name < m[j].Name } return m[i].Confidence > m[j].Confidence } // Names returns an unsorted slice of the names of the matched licenses. func (m Matches) Names() []string { var names []string for _, n := range m { names = append(names, n.Name) } return names } // uniquify goes through the matches and removes any that are contained within // one with a higher confidence. This assumes that Matches is sorted. func (m Matches) uniquify() Matches { type matchedRange struct { offset, extent int } var matched []matchedRange var matches Matches OUTER: for _, match := range m { for _, mr := range matched { if match.Offset >= mr.offset && match.Offset <= mr.offset+mr.extent { continue OUTER } } matched = append(matched, matchedRange{match.Offset, match.Extent}) matches = append(matches, match) } return matches } // NearestMatch returns the name of the known value that most closely matches // the unknown string and a confidence percentage is returned indicating how // confident the classifier is in the result. A percentage of "1.0" indicates // an exact match, while a percentage of "0.0" indicates a complete mismatch. // // If the string is equidistant from multiple known values, it is undefined // which will be returned. func (c *Classifier) NearestMatch(s string) *Match { pq := c.nearestMatch(s) if pq.Len() == 0 { return &Match{} } return pq.Pop().(*Match) } // MultipleMatch tries to determine which known strings are found within an // unknown string. This differs from "NearestMatch" in that it looks only at // those areas within the unknown string that are likely to match. 
// A list of potential matches is returned; it's up to the caller to determine
// which ones are acceptable.
func (c *Classifier) MultipleMatch(s string) (matches Matches) {
	pq := c.multipleMatch(s)
	if pq == nil {
		return matches
	}

	// A map to remove duplicate entries.
	m := make(map[Match]bool)

	// Drain the priority queue (highest confidence first), skipping
	// matches that are exact duplicates of one already seen.
	for pq.Len() != 0 {
		v := pq.Pop().(*Match)
		if _, ok := m[*v]; !ok {
			m[*v] = true
			matches = append(matches, v)
		}
	}

	sort.Sort(matches)
	return matches.uniquify()
}

// possibleMatch identifies a known value and its diffRatio to a given string.
type possibleMatch struct {
	value     *knownValue
	diffRatio float64
}

// likelyMatches is a slice of possibleMatches that can be sorted by their
// diffRatio to a given string, such that the most likely matches (based on
// length) are at the beginning.
type likelyMatches []possibleMatch

func (m likelyMatches) Len() int           { return len(m) }
func (m likelyMatches) Less(i, j int) bool { return m[i].diffRatio > m[j].diffRatio }
func (m likelyMatches) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }

// nearestMatch returns a Queue of values that the unknown string may be. The
// values are compared via their Levenshtein Distance and ranked with the
// nearest match at the beginning.
func (c *Classifier) nearestMatch(unknown string) *pq.Queue {
	var mu sync.Mutex // Protect the priority queue.
	pq := pq.NewQueue(func(x, y interface{}) bool {
		return x.(*Match).Confidence > y.(*Match).Confidence
	}, nil)

	unknown = c.normalize(unknown)
	if len(unknown) == 0 {
		return pq
	}

	c.muValues.RLock()
	var likely likelyMatches
	for _, v := range c.values {
		dr := diffRatio(unknown, v.normalizedValue)
		if dr < c.MinDiffRatio {
			// The lengths are too different for this known value to
			// be a plausible match; skip it.
			continue
		}
		if unknown == v.normalizedValue {
			// We found an exact match. Nothing can beat a
			// confidence of 1.0, so return immediately.
			pq.Push(&Match{Name: v.key, Confidence: 1.0, Offset: 0, Extent: len(unknown)})
			c.muValues.RUnlock()
			return pq
		}
		likely = append(likely, possibleMatch{value: v, diffRatio: dr})
	}
	c.muValues.RUnlock()
	sort.Sort(likely)

	var wg sync.WaitGroup
	// classifyString computes the edit distance between the unknown text
	// and one known value, pushing a Match when the confidence is nonzero.
	classifyString := func(name, unknown, known string) {
		defer wg.Done()

		diffs := dmp.DiffMain(unknown, known, true)
		distance := dmp.DiffLevenshtein(diffs)
		confidence := confidencePercentage(len(unknown), len(known), distance)
		if confidence > 0.0 {
			mu.Lock()
			pq.Push(&Match{Name: name, Confidence: confidence, Offset: 0, Extent: len(unknown)})
			mu.Unlock()
		}
	}

	// Compare against each likely candidate concurrently; wg tracks one
	// goroutine per candidate.
	wg.Add(len(likely))
	for _, known := range likely {
		go classifyString(known.value.key, unknown, known.value.normalizedValue)
	}
	wg.Wait()

	return pq
}

// matcher finds all potential matches of "known" in "unknown". The results are
// placed in "queue".
type matcher struct {
	unknown     *searchset.SearchSet // token search set built from the unknown text
	normUnknown string               // the normalized unknown text itself
	threshold   float64              // minimum match-size ratio worth scoring

	mu    sync.Mutex // guards "queue"
	queue *pq.Queue  // results, highest confidence first
}

// newMatcher creates a "matcher" object.
func newMatcher(unknown string, threshold float64) *matcher {
	return &matcher{
		unknown:     searchset.New(unknown, searchset.DefaultGranularity),
		normUnknown: unknown,
		threshold:   threshold,
		queue: pq.NewQueue(func(x, y interface{}) bool {
			return x.(*Match).Confidence > y.(*Match).Confidence
		}, nil),
	}
}

// findMatches takes a known text and finds all potential instances of it in
// the unknown text. The resulting matches can then filtered to determine which
// are the best matches.
func (m *matcher) findMatches(known *knownValue) {
	var mrs []searchset.MatchRanges
	if all := known.reValue.FindAllStringIndex(m.normUnknown, -1); all != nil {
		// We found exact matches. Just use those!
		for _, a := range all {
			// Translate the byte range [a[0], a[1]) into token
			// indices on the unknown search set.
			var start, end int
			for i, tok := range m.unknown.Tokens {
				if tok.Offset == a[0] {
					start = i
				} else if tok.Offset >= a[len(a)-1]-len(tok.Text) {
					end = i
					break
				}
			}
			mrs = append(mrs, searchset.MatchRanges{{
				SrcStart:    0,
				SrcEnd:      len(known.set.Tokens),
				TargetStart: start,
				TargetEnd:   end + 1,
			}})
		}
	} else {
		// No exact match. Perform a more thorough match.
		mrs = searchset.FindPotentialMatches(known.set, m.unknown)
	}

	var wg sync.WaitGroup
	for _, mr := range mrs {
		if !m.withinConfidenceThreshold(known.set, mr) {
			// Too little of the known text matched to be worth a
			// (comparatively expensive) Levenshtein run.
			continue
		}

		wg.Add(1)
		// Score each candidate range concurrently.
		go func(mr searchset.MatchRanges) {
			start, end := mr.TargetRange(m.unknown)
			conf := levDist(m.normUnknown[start:end], known.normalizedValue)
			if conf > 0.0 {
				m.mu.Lock()
				m.queue.Push(&Match{Name: known.key, Confidence: conf, Offset: start, Extent: end - start})
				m.mu.Unlock()
			}
			wg.Done()
		}(mr)
	}
	wg.Wait()
}

// withinConfidenceThreshold returns the Confidence we have in the potential
// match. It does this by calculating the ratio of what's matching to the
// original known text.
func (m *matcher) withinConfidenceThreshold(known *searchset.SearchSet, mr searchset.MatchRanges) bool {
	return float64(mr.Size())/float64(len(known.Tokens)) >= m.threshold
}

// multipleMatch returns a Queue of values that might be within the unknown
// string. The values are compared via their Levenshtein Distance and ranked
// with the nearest match at the beginning.
func (c *Classifier) multipleMatch(unknown string) *pq.Queue {
	normUnknown := c.normalize(unknown)
	if normUnknown == "" {
		return nil
	}

	m := newMatcher(normUnknown, c.threshold)

	// Snapshot the known values under the read lock so the matching work
	// below can run without holding it.
	c.muValues.RLock()
	var kvals []*knownValue
	for _, known := range c.values {
		kvals = append(kvals, known)
	}
	c.muValues.RUnlock()

	var wg sync.WaitGroup
	wg.Add(len(kvals))
	for _, known := range kvals {
		go func(known *knownValue) {
			if known.set == nil {
				// Lazily build and cache the search set for
				// values added via AddValue (which doesn't
				// precompute one).
				k := searchset.New(known.normalizedValue, searchset.DefaultGranularity)
				c.muValues.Lock()
				c.values[known.key].set = k
				c.muValues.Unlock()
			}
			m.findMatches(known)
			wg.Done()
		}(known)
	}
	wg.Wait()

	return m.queue
}

// levDist runs the Levenshtein Distance algorithm on the known and unknown
// texts to measure how well they match, returning a confidence percentage in
// [0.0, 1.0].
func levDist(unknown, known string) float64 {
	if len(known) == 0 || len(unknown) == 0 {
		log.Printf("Zero-sized texts in Levenshtein Distance algorithm: known==%d, unknown==%d", len(known), len(unknown))
		return 0.0
	}

	// Calculate the differences between the potentially matching known
	// text and the unknown text.
	diffs := dmp.DiffMain(unknown, known, false)
	end := diffRangeEnd(known, diffs)

	// Now execute the Levenshtein Distance algorithm to see how much it
	// does match. Only the diffs up to "end" are relevant; beyond that the
	// known text has been fully reconstructed.
	distance := dmp.DiffLevenshtein(diffs[:end])
	return confidencePercentage(unknownTextLength(unknown, diffs), len(known), distance)
}

// unknownTextLength returns the length of the unknown text based on the diff range.
// Note: the "unknown" parameter is currently unused; the length is derived
// entirely from the diff list.
func unknownTextLength(unknown string, diffs []diffmatchpatch.Diff) int {
	// Find the last "equal" diff; anything after it is pure mismatch tail
	// that shouldn't count toward the unknown text's effective length.
	last := len(diffs) - 1
	for ; last >= 0; last-- {
		if diffs[last].Type == diffmatchpatch.DiffEqual {
			break
		}
	}
	// Equal and Delete diffs are the pieces that came from the unknown
	// (first) text; Insert diffs exist only in the known text.
	ulen := 0
	for i := 0; i < last+1; i++ {
		switch diffs[i].Type {
		case diffmatchpatch.DiffEqual, diffmatchpatch.DiffDelete:
			ulen += len(diffs[i].Text)
		}
	}
	return ulen
}

// diffRangeEnd returns the end index for the "Diff" objects that constructs
// (or nearly constructs) the "known" value.
func diffRangeEnd(known string, diffs []diffmatchpatch.Diff) (end int) { var seen string for end = 0; end < len(diffs); end++ { if seen == known { // Once we've constructed the "known" value, then we've // reached the point in the diff list where more // "Diff"s would just make the Levenshtein Distance // less valid. There shouldn't be further "DiffEqual" // nodes, because there's nothing further to match in // the "known" text. break } switch diffs[end].Type { case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert: seen += diffs[end].Text } } return end } // confidencePercentage calculates how confident we are in the result of the // match. A percentage of "1.0" means an identical match. A confidence of "0.0" // means a complete mismatch. func confidencePercentage(ulen, klen, distance int) float64 { if ulen == 0 && klen == 0 { return 1.0 } if ulen == 0 || klen == 0 || (distance > ulen && distance > klen) { return 0.0 } return 1.0 - float64(distance)/float64(max(ulen, klen)) } // diffRatio calculates the ratio of the length of s1 and s2, returned as a // percentage of the length of the longer string. E.g., diffLength("abcd", "e") // would return 0.25 because "e" is 25% of the size of "abcd". Comparing // strings of equal length will return 1. func diffRatio(s1, s2 string) float64 { x, y := len(s1), len(s2) if x == 0 && y == 0 { // Both strings are zero length return 1.0 } if x < y { return float64(x) / float64(y) } return float64(y) / float64(x) } func max(a, b int) int { if a > b { return a } return b } func min(a, b int) int { if a < b { return a } return b } // wsRegexp is a regexp used to identify blocks of whitespace. var wsRegexp = regexp.MustCompile(`\s+`) // FlattenWhitespace will flatten contiguous blocks of whitespace down to a single space. var FlattenWhitespace NormalizeFunc = func(s string) string { return wsRegexp.ReplaceAllString(s, " ") }