diff options
Diffstat (limited to 'internal/diff/lcs/old.go')
-rw-r--r-- | internal/diff/lcs/old.go | 480 |
1 files changed, 480 insertions, 0 deletions
diff --git a/internal/diff/lcs/old.go b/internal/diff/lcs/old.go new file mode 100644 index 000000000..7af11fc89 --- /dev/null +++ b/internal/diff/lcs/old.go @@ -0,0 +1,480 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package lcs + +// TODO(adonovan): remove unclear references to "old" in this package. + +import ( + "fmt" +) + +// A Diff is a replacement of a portion of A by a portion of B. +type Diff struct { + Start, End int // offsets of portion to delete in A + ReplStart, ReplEnd int // offset of replacement text in B +} + +// DiffStrings returns the differences between two strings. +// It does not respect rune boundaries. +func DiffStrings(a, b string) []Diff { return diff(stringSeqs{a, b}) } + +// DiffBytes returns the differences between two byte sequences. +// It does not respect rune boundaries. +func DiffBytes(a, b []byte) []Diff { return diff(bytesSeqs{a, b}) } + +// DiffRunes returns the differences between two rune sequences. +func DiffRunes(a, b []rune) []Diff { return diff(runesSeqs{a, b}) } + +func diff(seqs sequences) []Diff { + // A limit on how deeply the LCS algorithm should search. The value is just a guess. + const maxDiffs = 30 + diff, _ := compute(seqs, twosided, maxDiffs/2) + return diff +} + +// compute computes the list of differences between two sequences, +// along with the LCS. It is exercised directly by tests. +// The algorithm is one of {forward, backward, twosided}. +func compute(seqs sequences, algo func(*editGraph) lcs, limit int) ([]Diff, lcs) { + if limit <= 0 { + limit = 1 << 25 // effectively infinity + } + alen, blen := seqs.lengths() + g := &editGraph{ + seqs: seqs, + vf: newtriang(limit), + vb: newtriang(limit), + limit: limit, + ux: alen, + uy: blen, + delta: alen - blen, + } + lcs := algo(g) + diffs := lcs.toDiffs(alen, blen) + return diffs, lcs +} + +// editGraph carries the information for computing the lcs of two sequences. +type editGraph struct { + seqs sequences + vf, vb label // forward and backward labels + + limit int // maximal value of D + // the bounding rectangle of the current edit graph + lx, ly, ux, uy int + delta int // common subexpression: (ux-lx)-(uy-ly) +} + +// toDiffs converts an LCS to a list of edits. +func (lcs lcs) toDiffs(alen, blen int) []Diff { + var diffs []Diff + var pa, pb int // offsets in a, b + for _, l := range lcs { + if pa < l.X || pb < l.Y { + diffs = append(diffs, Diff{pa, l.X, pb, l.Y}) + } + pa = l.X + l.Len + pb = l.Y + l.Len + } + if pa < alen || pb < blen { + diffs = append(diffs, Diff{pa, alen, pb, blen}) + } + return diffs +} + +// --- FORWARD --- + +// fdone decides if the forwward path has reached the upper right +// corner of the rectangle. If so, it also returns the computed lcs. +func (e *editGraph) fdone(D, k int) (bool, lcs) { + // x, y, k are relative to the rectangle + x := e.vf.get(D, k) + y := x - k + if x == e.ux && y == e.uy { + return true, e.forwardlcs(D, k) + } + return false, nil +} + +// run the forward algorithm, until success or up to the limit on D. +func forward(e *editGraph) lcs { + e.setForward(0, 0, e.lx) + if ok, ans := e.fdone(0, 0); ok { + return ans + } + // from D to D+1 + for D := 0; D < e.limit; D++ { + e.setForward(D+1, -(D + 1), e.getForward(D, -D)) + if ok, ans := e.fdone(D+1, -(D + 1)); ok { + return ans + } + e.setForward(D+1, D+1, e.getForward(D, D)+1) + if ok, ans := e.fdone(D+1, D+1); ok { + return ans + } + for k := -D + 1; k <= D-1; k += 2 { + // these are tricky and easy to get backwards + lookv := e.lookForward(k, e.getForward(D, k-1)+1) + lookh := e.lookForward(k, e.getForward(D, k+1)) + if lookv > lookh { + e.setForward(D+1, k, lookv) + } else { + e.setForward(D+1, k, lookh) + } + if ok, ans := e.fdone(D+1, k); ok { + return ans + } + } + } + // D is too large + // find the D path with maximal x+y inside the rectangle and + // use that to compute the found part of the lcs + kmax := -e.limit - 1 + diagmax := -1 + for k := -e.limit; k <= e.limit; k += 2 { + x := e.getForward(e.limit, k) + y := x - k + if x+y > diagmax && x <= e.ux && y <= e.uy { + diagmax, kmax = x+y, k + } + } + return e.forwardlcs(e.limit, kmax) +} + +// recover the lcs by backtracking from the farthest point reached +func (e *editGraph) forwardlcs(D, k int) lcs { + var ans lcs + for x := e.getForward(D, k); x != 0 || x-k != 0; { + if ok(D-1, k-1) && x-1 == e.getForward(D-1, k-1) { + // if (x-1,y) is labelled D-1, x--,D--,k--,continue + D, k, x = D-1, k-1, x-1 + continue + } else if ok(D-1, k+1) && x == e.getForward(D-1, k+1) { + // if (x,y-1) is labelled D-1, x, D--,k++, continue + D, k = D-1, k+1 + continue + } + // if (x-1,y-1)--(x,y) is a diagonal, prepend,x--,y--, continue + y := x - k + ans = ans.prepend(x+e.lx-1, y+e.ly-1) + x-- + } + return ans +} + +// start at (x,y), go up the diagonal as far as possible, +// and label the result with d +func (e *editGraph) lookForward(k, relx int) int { + rely := relx - k + x, y := relx+e.lx, rely+e.ly + if x < e.ux && y < e.uy { + x += e.seqs.commonPrefixLen(x, e.ux, y, e.uy) + } + return x +} + +func (e *editGraph) setForward(d, k, relx int) { + x := e.lookForward(k, relx) + e.vf.set(d, k, x-e.lx) +} + +func (e *editGraph) getForward(d, k int) int { + x := e.vf.get(d, k) + return x +} + +// --- BACKWARD --- + +// bdone decides if the backward path has reached the lower left corner +func (e *editGraph) bdone(D, k int) (bool, lcs) { + // x, y, k are relative to the rectangle + x := e.vb.get(D, k) + y := x - (k + e.delta) + if x == 0 && y == 0 { + return true, e.backwardlcs(D, k) + } + return false, nil +} + +// run the backward algorithm, until success or up to the limit on D. +func backward(e *editGraph) lcs { + e.setBackward(0, 0, e.ux) + if ok, ans := e.bdone(0, 0); ok { + return ans + } + // from D to D+1 + for D := 0; D < e.limit; D++ { + e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1) + if ok, ans := e.bdone(D+1, -(D + 1)); ok { + return ans + } + e.setBackward(D+1, D+1, e.getBackward(D, D)) + if ok, ans := e.bdone(D+1, D+1); ok { + return ans + } + for k := -D + 1; k <= D-1; k += 2 { + // these are tricky and easy to get wrong + lookv := e.lookBackward(k, e.getBackward(D, k-1)) + lookh := e.lookBackward(k, e.getBackward(D, k+1)-1) + if lookv < lookh { + e.setBackward(D+1, k, lookv) + } else { + e.setBackward(D+1, k, lookh) + } + if ok, ans := e.bdone(D+1, k); ok { + return ans + } + } + } + + // D is too large + // find the D path with minimal x+y inside the rectangle and + // use that to compute the part of the lcs found + kmax := -e.limit - 1 + diagmin := 1 << 25 + for k := -e.limit; k <= e.limit; k += 2 { + x := e.getBackward(e.limit, k) + y := x - (k + e.delta) + if x+y < diagmin && x >= 0 && y >= 0 { + diagmin, kmax = x+y, k + } + } + if kmax < -e.limit { + panic(fmt.Sprintf("no paths when limit=%d?", e.limit)) + } + return e.backwardlcs(e.limit, kmax) +} + +// recover the lcs by backtracking +func (e *editGraph) backwardlcs(D, k int) lcs { + var ans lcs + for x := e.getBackward(D, k); x != e.ux || x-(k+e.delta) != e.uy; { + if ok(D-1, k-1) && x == e.getBackward(D-1, k-1) { + // D--, k--, x unchanged + D, k = D-1, k-1 + continue + } else if ok(D-1, k+1) && x+1 == e.getBackward(D-1, k+1) { + // D--, k++, x++ + D, k, x = D-1, k+1, x+1 + continue + } + y := x - (k + e.delta) + ans = ans.append(x+e.lx, y+e.ly) + x++ + } + return ans +} + +// start at (x,y), go down the diagonal as far as possible, +func (e *editGraph) lookBackward(k, relx int) int { + rely := relx - (k + e.delta) // forward k = k + e.delta + x, y := relx+e.lx, rely+e.ly + if x > 0 && y > 0 { + x -= e.seqs.commonSuffixLen(0, x, 0, y) + } + return x +} + +// convert to rectangle, and label the result with d +func (e *editGraph) setBackward(d, k, relx int) { + x := e.lookBackward(k, relx) + e.vb.set(d, k, x-e.lx) +} + +func (e *editGraph) getBackward(d, k int) int { + x := e.vb.get(d, k) + return x +} + +// -- TWOSIDED --- + +func twosided(e *editGraph) lcs { + // The termination condition could be improved, as either the forward + // or backward pass could succeed before Myers' Lemma applies. + // Aside from questions of efficiency (is the extra testing cost-effective) + // this is more likely to matter when e.limit is reached. + e.setForward(0, 0, e.lx) + e.setBackward(0, 0, e.ux) + + // from D to D+1 + for D := 0; D < e.limit; D++ { + // just finished a backwards pass, so check + if got, ok := e.twoDone(D, D); ok { + return e.twolcs(D, D, got) + } + // do a forwards pass (D to D+1) + e.setForward(D+1, -(D + 1), e.getForward(D, -D)) + e.setForward(D+1, D+1, e.getForward(D, D)+1) + for k := -D + 1; k <= D-1; k += 2 { + // these are tricky and easy to get backwards + lookv := e.lookForward(k, e.getForward(D, k-1)+1) + lookh := e.lookForward(k, e.getForward(D, k+1)) + if lookv > lookh { + e.setForward(D+1, k, lookv) + } else { + e.setForward(D+1, k, lookh) + } + } + // just did a forward pass, so check + if got, ok := e.twoDone(D+1, D); ok { + return e.twolcs(D+1, D, got) + } + // do a backward pass, D to D+1 + e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1) + e.setBackward(D+1, D+1, e.getBackward(D, D)) + for k := -D + 1; k <= D-1; k += 2 { + // these are tricky and easy to get wrong + lookv := e.lookBackward(k, e.getBackward(D, k-1)) + lookh := e.lookBackward(k, e.getBackward(D, k+1)-1) + if lookv < lookh { + e.setBackward(D+1, k, lookv) + } else { + e.setBackward(D+1, k, lookh) + } + } + } + + // D too large. combine a forward and backward partial lcs + // first, a forward one + kmax := -e.limit - 1 + diagmax := -1 + for k := -e.limit; k <= e.limit; k += 2 { + x := e.getForward(e.limit, k) + y := x - k + if x+y > diagmax && x <= e.ux && y <= e.uy { + diagmax, kmax = x+y, k + } + } + if kmax < -e.limit { + panic(fmt.Sprintf("no forward paths when limit=%d?", e.limit)) + } + lcs := e.forwardlcs(e.limit, kmax) + // now a backward one + // find the D path with minimal x+y inside the rectangle and + // use that to compute the lcs + diagmin := 1 << 25 // infinity + for k := -e.limit; k <= e.limit; k += 2 { + x := e.getBackward(e.limit, k) + y := x - (k + e.delta) + if x+y < diagmin && x >= 0 && y >= 0 { + diagmin, kmax = x+y, k + } + } + if kmax < -e.limit { + panic(fmt.Sprintf("no backward paths when limit=%d?", e.limit)) + } + lcs = append(lcs, e.backwardlcs(e.limit, kmax)...) + // These may overlap (e.forwardlcs and e.backwardlcs return sorted lcs) + ans := lcs.fix() + return ans +} + +// Does Myers' Lemma apply? +func (e *editGraph) twoDone(df, db int) (int, bool) { + if (df+db+e.delta)%2 != 0 { + return 0, false // diagonals cannot overlap + } + kmin := -db + e.delta + if -df > kmin { + kmin = -df + } + kmax := db + e.delta + if df < kmax { + kmax = df + } + for k := kmin; k <= kmax; k += 2 { + x := e.vf.get(df, k) + u := e.vb.get(db, k-e.delta) + if u <= x { + // is it worth looking at all the other k? + for l := k; l <= kmax; l += 2 { + x := e.vf.get(df, l) + y := x - l + u := e.vb.get(db, l-e.delta) + v := u - l + if x == u || u == 0 || v == 0 || y == e.uy || x == e.ux { + return l, true + } + } + return k, true + } + } + return 0, false +} + +func (e *editGraph) twolcs(df, db, kf int) lcs { + // db==df || db+1==df + x := e.vf.get(df, kf) + y := x - kf + kb := kf - e.delta + u := e.vb.get(db, kb) + v := u - kf + + // Myers proved there is a df-path from (0,0) to (u,v) + // and a db-path from (x,y) to (N,M). + // In the first case the overall path is the forward path + // to (u,v) followed by the backward path to (N,M). + // In the second case the path is the backward path to (x,y) + // followed by the forward path to (x,y) from (0,0). + + // Look for some special cases to avoid computing either of these paths. + if x == u { + // "babaab" "cccaba" + // already patched together + lcs := e.forwardlcs(df, kf) + lcs = append(lcs, e.backwardlcs(db, kb)...) + return lcs.sort() + } + + // is (u-1,v) or (u,v-1) labelled df-1? + // if so, that forward df-1-path plus a horizontal or vertical edge + // is the df-path to (u,v), then plus the db-path to (N,M) + if u > 0 && ok(df-1, u-1-v) && e.vf.get(df-1, u-1-v) == u-1 { + // "aabbab" "cbcabc" + lcs := e.forwardlcs(df-1, u-1-v) + lcs = append(lcs, e.backwardlcs(db, kb)...) + return lcs.sort() + } + if v > 0 && ok(df-1, (u-(v-1))) && e.vf.get(df-1, u-(v-1)) == u { + // "abaabb" "bcacab" + lcs := e.forwardlcs(df-1, u-(v-1)) + lcs = append(lcs, e.backwardlcs(db, kb)...) + return lcs.sort() + } + + // The path can't possibly contribute to the lcs because it + // is all horizontal or vertical edges + if u == 0 || v == 0 || x == e.ux || y == e.uy { + // "abaabb" "abaaaa" + if u == 0 || v == 0 { + return e.backwardlcs(db, kb) + } + return e.forwardlcs(df, kf) + } + + // is (x+1,y) or (x,y+1) labelled db-1? + if x+1 <= e.ux && ok(db-1, x+1-y-e.delta) && e.vb.get(db-1, x+1-y-e.delta) == x+1 { + // "bababb" "baaabb" + lcs := e.backwardlcs(db-1, kb+1) + lcs = append(lcs, e.forwardlcs(df, kf)...) + return lcs.sort() + } + if y+1 <= e.uy && ok(db-1, x-(y+1)-e.delta) && e.vb.get(db-1, x-(y+1)-e.delta) == x { + // "abbbaa" "cabacc" + lcs := e.backwardlcs(db-1, kb-1) + lcs = append(lcs, e.forwardlcs(df, kf)...) + return lcs.sort() + } + + // need to compute another path + // "aabbaa" "aacaba" + lcs := e.backwardlcs(db, kb) + oldx, oldy := e.ux, e.uy + e.ux = u + e.uy = v + lcs = append(lcs, forward(e)...) + e.ux, e.uy = oldx, oldy + return lcs.sort() +} |