diff options
author | Bill Neubauer <wcn@google.com> | 2020-04-29 09:45:59 -0700 |
---|---|---|
committer | Bill Neubauer <bill.neubauer@gmail.com> | 2020-11-13 09:54:34 -0800 |
commit | ccb1043e6dac21c65b5e9f5c692fa9c8484d5aa0 (patch) | |
tree | 3788b893debce46db40a5607f57d1a84baecb0c5 /v2/frequencies.go | |
parent | ccfb119676d741e33aa90616c8f1e236d8716e7a (diff) | |
download | licenseclassifier-ccb1043e6dac21c65b5e9f5c692fa9c8484d5aa0.tar.gz |
Add frequency table code to the v2 classifier.
Diffstat (limited to 'v2/frequencies.go')
-rw-r--r-- | v2/frequencies.go | 59 |
1 files changed, 59 insertions, 0 deletions
diff --git a/v2/frequencies.go b/v2/frequencies.go new file mode 100644 index 0000000..c29898e --- /dev/null +++ b/v2/frequencies.go @@ -0,0 +1,59 @@ +// Copyright 2020 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package classifier + +type frequencyTable struct { + counts map[tokenID]int // key: token ID, value: number of instances of that token +} + +func newFrequencyTable() *frequencyTable { + return &frequencyTable{ + counts: make(map[tokenID]int), + } +} + +func (f *frequencyTable) update(d *indexedDocument) { + for _, tok := range d.Tokens { + f.counts[tok.ID]++ + } +} + +func (d *indexedDocument) generateFrequencies() { + d.f = newFrequencyTable() + d.f.update(d) +} + +// TokenSimilarity returns a confidence score of how well d contains +// the tokens of o. This is used as a fast similarity metric to +// avoid running more expensive classifiers. +func (d *indexedDocument) tokenSimilarity(o *indexedDocument) float64 { + hits := 0 + // For each token in the source document, see if the target has "enough" instances + // of that token to possibly be a match to the target. + // We count up all the matches, and divide by the total number of unique source + // tokens to get a similarity metric. 1.0 means that all the tokens in the target + // are present in the source in appropriate quantities. If the value here is lower + // than the desired matching threshold, the target can't possibly match the source. + // Profiling indicates a significant amount of time is spent here. + // Avoiding checking (or storing) "uninteresting" tokens (common English words) + // could help. + for t, c := range o.f.counts { + if d.f.counts[t] >= c { + hits++ + } + } + + return float64(hits) / float64(len(o.f.counts)) +} |