// Copyright 2020 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package classifier import ( "strings" "github.com/sergi/go-diff/diffmatchpatch" ) // This file contains word-diffing routines that build on top of the go-diff package. // The algorithm implemented here is from the suggested word diffing technique in // https://github.com/google/diff-match-patch/wiki/Line-or-Word-Diffs // diffRange returns the indices of the beginning and end locations of the diff // that reconstruct (as best possible) the source value. func diffRange(known string, diffs []diffmatchpatch.Diff) (start, end int) { var foundStart bool var seen string for end = 0; end < len(diffs); end++ { if len(seen) > 1 && seen[:len(seen)-1] == known { break } switch diffs[end].Type { case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert: if !foundStart { start = end foundStart = true } seen += diffs[end].Text + " " } } return start, end } func docDiff(id string, doc1 *indexedDocument, doc1Start, doc1End int, doc2 *indexedDocument, doc2Start, doc2End int) []diffmatchpatch.Diff { chars1 := doc1.runes[doc1Start:doc1End] chars2 := doc2.runes[doc2Start:doc2End] dmp := diffmatchpatch.New() diffs := dmp.DiffMainRunes(chars1, chars2, false) // Recover the words from the previous rune encoding and return the textual diffs. diffs = diffRunesToWords(diffs, doc1.dict) return diffs } func diffWordsToRunes(doc *indexedDocument, start, end int) []rune { // Creates a slice of runes using the indexed values as a basis for runes. // The go-diff code basically does exactly this using ephemeral dictionaries // for each input string. We leverage the fact we have a persistent dictionary // to make this operation cheaper. // TODO: perhaps we should cache these in the corpus? runes := make([]rune, 0, end-start) for _, t := range doc.Tokens[start:end] { runes = append(runes, rune(t.ID)) } return runes } // diffRunesToWords rehydrates the text in a diff from a string of word hashes to real words of text. func diffRunesToWords(diffs []diffmatchpatch.Diff, dict *dictionary) []diffmatchpatch.Diff { hydrated := make([]diffmatchpatch.Diff, 0, len(diffs)) for _, aDiff := range diffs { chars := []rune(aDiff.Text) var sb strings.Builder for i, r := range chars { sb.WriteString(dict.getWord(tokenID(r))) if (i + 1) < len(chars) { sb.WriteByte(' ') } } aDiff.Text = sb.String() hydrated = append(hydrated, aDiff) } return hydrated } // Returns the number of words in the input string. Used by scoring and distance functions. // This function depends on the behavior of the tokenizer such that strings are separated // by exactly one space and don't start or end with whitespace. func wordLen(text string) int { if text == "" { return 0 } return strings.Count(text, " ") + 1 } // textLength returns the number of tokens in the diff. This value is used to // adjust the offset for detection, since this is the number of tokens // discarded while matching a diff. By virtue of how it's called, there won't // be "change" diffs (a paired insert/delete) so we can simplify the scan to // just count up everything. func textLength(diffs []diffmatchpatch.Diff) int { l := 0 for _, d := range diffs { l += wordLen(d.Text) } return l }