aboutsummaryrefslogtreecommitdiff
path: root/internal/diff/lcs/old.go
diff options
context:
space:
mode:
Diffstat (limited to 'internal/diff/lcs/old.go')
-rw-r--r--internal/diff/lcs/old.go480
1 files changed, 480 insertions, 0 deletions
diff --git a/internal/diff/lcs/old.go b/internal/diff/lcs/old.go
new file mode 100644
index 000000000..7af11fc89
--- /dev/null
+++ b/internal/diff/lcs/old.go
@@ -0,0 +1,480 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package lcs
+
+// TODO(adonovan): remove unclear references to "old" in this package.
+
+import (
+ "fmt"
+)
+
+// A Diff is a replacement of a portion of A by a portion of B.
+type Diff struct {
+ Start, End int // offsets of portion to delete in A
+ ReplStart, ReplEnd int // offset of replacement text in B
+}
+
+// DiffStrings returns the differences between two strings.
+// It does not respect rune boundaries.
+func DiffStrings(a, b string) []Diff { return diff(stringSeqs{a, b}) }
+
+// DiffBytes returns the differences between two byte sequences.
+// It does not respect rune boundaries.
+func DiffBytes(a, b []byte) []Diff { return diff(bytesSeqs{a, b}) }
+
+// DiffRunes returns the differences between two rune sequences.
+func DiffRunes(a, b []rune) []Diff { return diff(runesSeqs{a, b}) }
+
+func diff(seqs sequences) []Diff {
+ // A limit on how deeply the LCS algorithm should search. The value is just a guess.
+ const maxDiffs = 30
+ diff, _ := compute(seqs, twosided, maxDiffs/2)
+ return diff
+}
+
+// compute computes the list of differences between two sequences,
+// along with the LCS. It is exercised directly by tests.
+// The algorithm is one of {forward, backward, twosided}.
+func compute(seqs sequences, algo func(*editGraph) lcs, limit int) ([]Diff, lcs) {
+ if limit <= 0 {
+ limit = 1 << 25 // effectively infinity
+ }
+ alen, blen := seqs.lengths()
+ g := &editGraph{
+ seqs: seqs,
+ vf: newtriang(limit),
+ vb: newtriang(limit),
+ limit: limit,
+ ux: alen,
+ uy: blen,
+ delta: alen - blen,
+ }
+ lcs := algo(g)
+ diffs := lcs.toDiffs(alen, blen)
+ return diffs, lcs
+}
+
+// editGraph carries the information for computing the lcs of two sequences.
+type editGraph struct {
+ seqs sequences
+ vf, vb label // forward and backward labels
+
+ limit int // maximal value of D
+ // the bounding rectangle of the current edit graph
+ lx, ly, ux, uy int
+ delta int // common subexpression: (ux-lx)-(uy-ly)
+}
+
+// toDiffs converts an LCS to a list of edits.
+func (lcs lcs) toDiffs(alen, blen int) []Diff {
+ var diffs []Diff
+ var pa, pb int // offsets in a, b
+ for _, l := range lcs {
+ if pa < l.X || pb < l.Y {
+ diffs = append(diffs, Diff{pa, l.X, pb, l.Y})
+ }
+ pa = l.X + l.Len
+ pb = l.Y + l.Len
+ }
+ if pa < alen || pb < blen {
+ diffs = append(diffs, Diff{pa, alen, pb, blen})
+ }
+ return diffs
+}
+
+// --- FORWARD ---
+
+// fdone decides if the forwward path has reached the upper right
+// corner of the rectangle. If so, it also returns the computed lcs.
+func (e *editGraph) fdone(D, k int) (bool, lcs) {
+ // x, y, k are relative to the rectangle
+ x := e.vf.get(D, k)
+ y := x - k
+ if x == e.ux && y == e.uy {
+ return true, e.forwardlcs(D, k)
+ }
+ return false, nil
+}
+
+// run the forward algorithm, until success or up to the limit on D.
+func forward(e *editGraph) lcs {
+ e.setForward(0, 0, e.lx)
+ if ok, ans := e.fdone(0, 0); ok {
+ return ans
+ }
+ // from D to D+1
+ for D := 0; D < e.limit; D++ {
+ e.setForward(D+1, -(D + 1), e.getForward(D, -D))
+ if ok, ans := e.fdone(D+1, -(D + 1)); ok {
+ return ans
+ }
+ e.setForward(D+1, D+1, e.getForward(D, D)+1)
+ if ok, ans := e.fdone(D+1, D+1); ok {
+ return ans
+ }
+ for k := -D + 1; k <= D-1; k += 2 {
+ // these are tricky and easy to get backwards
+ lookv := e.lookForward(k, e.getForward(D, k-1)+1)
+ lookh := e.lookForward(k, e.getForward(D, k+1))
+ if lookv > lookh {
+ e.setForward(D+1, k, lookv)
+ } else {
+ e.setForward(D+1, k, lookh)
+ }
+ if ok, ans := e.fdone(D+1, k); ok {
+ return ans
+ }
+ }
+ }
+ // D is too large
+ // find the D path with maximal x+y inside the rectangle and
+ // use that to compute the found part of the lcs
+ kmax := -e.limit - 1
+ diagmax := -1
+ for k := -e.limit; k <= e.limit; k += 2 {
+ x := e.getForward(e.limit, k)
+ y := x - k
+ if x+y > diagmax && x <= e.ux && y <= e.uy {
+ diagmax, kmax = x+y, k
+ }
+ }
+ return e.forwardlcs(e.limit, kmax)
+}
+
+// recover the lcs by backtracking from the farthest point reached
+func (e *editGraph) forwardlcs(D, k int) lcs {
+ var ans lcs
+ for x := e.getForward(D, k); x != 0 || x-k != 0; {
+ if ok(D-1, k-1) && x-1 == e.getForward(D-1, k-1) {
+ // if (x-1,y) is labelled D-1, x--,D--,k--,continue
+ D, k, x = D-1, k-1, x-1
+ continue
+ } else if ok(D-1, k+1) && x == e.getForward(D-1, k+1) {
+ // if (x,y-1) is labelled D-1, x, D--,k++, continue
+ D, k = D-1, k+1
+ continue
+ }
+ // if (x-1,y-1)--(x,y) is a diagonal, prepend,x--,y--, continue
+ y := x - k
+ ans = ans.prepend(x+e.lx-1, y+e.ly-1)
+ x--
+ }
+ return ans
+}
+
+// start at (x,y), go up the diagonal as far as possible,
+// and label the result with d
+func (e *editGraph) lookForward(k, relx int) int {
+ rely := relx - k
+ x, y := relx+e.lx, rely+e.ly
+ if x < e.ux && y < e.uy {
+ x += e.seqs.commonPrefixLen(x, e.ux, y, e.uy)
+ }
+ return x
+}
+
+func (e *editGraph) setForward(d, k, relx int) {
+ x := e.lookForward(k, relx)
+ e.vf.set(d, k, x-e.lx)
+}
+
+func (e *editGraph) getForward(d, k int) int {
+ x := e.vf.get(d, k)
+ return x
+}
+
+// --- BACKWARD ---
+
+// bdone decides if the backward path has reached the lower left corner
+func (e *editGraph) bdone(D, k int) (bool, lcs) {
+ // x, y, k are relative to the rectangle
+ x := e.vb.get(D, k)
+ y := x - (k + e.delta)
+ if x == 0 && y == 0 {
+ return true, e.backwardlcs(D, k)
+ }
+ return false, nil
+}
+
+// run the backward algorithm, until success or up to the limit on D.
+func backward(e *editGraph) lcs {
+ e.setBackward(0, 0, e.ux)
+ if ok, ans := e.bdone(0, 0); ok {
+ return ans
+ }
+ // from D to D+1
+ for D := 0; D < e.limit; D++ {
+ e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1)
+ if ok, ans := e.bdone(D+1, -(D + 1)); ok {
+ return ans
+ }
+ e.setBackward(D+1, D+1, e.getBackward(D, D))
+ if ok, ans := e.bdone(D+1, D+1); ok {
+ return ans
+ }
+ for k := -D + 1; k <= D-1; k += 2 {
+ // these are tricky and easy to get wrong
+ lookv := e.lookBackward(k, e.getBackward(D, k-1))
+ lookh := e.lookBackward(k, e.getBackward(D, k+1)-1)
+ if lookv < lookh {
+ e.setBackward(D+1, k, lookv)
+ } else {
+ e.setBackward(D+1, k, lookh)
+ }
+ if ok, ans := e.bdone(D+1, k); ok {
+ return ans
+ }
+ }
+ }
+
+ // D is too large
+ // find the D path with minimal x+y inside the rectangle and
+ // use that to compute the part of the lcs found
+ kmax := -e.limit - 1
+ diagmin := 1 << 25
+ for k := -e.limit; k <= e.limit; k += 2 {
+ x := e.getBackward(e.limit, k)
+ y := x - (k + e.delta)
+ if x+y < diagmin && x >= 0 && y >= 0 {
+ diagmin, kmax = x+y, k
+ }
+ }
+ if kmax < -e.limit {
+ panic(fmt.Sprintf("no paths when limit=%d?", e.limit))
+ }
+ return e.backwardlcs(e.limit, kmax)
+}
+
+// recover the lcs by backtracking
+func (e *editGraph) backwardlcs(D, k int) lcs {
+ var ans lcs
+ for x := e.getBackward(D, k); x != e.ux || x-(k+e.delta) != e.uy; {
+ if ok(D-1, k-1) && x == e.getBackward(D-1, k-1) {
+ // D--, k--, x unchanged
+ D, k = D-1, k-1
+ continue
+ } else if ok(D-1, k+1) && x+1 == e.getBackward(D-1, k+1) {
+ // D--, k++, x++
+ D, k, x = D-1, k+1, x+1
+ continue
+ }
+ y := x - (k + e.delta)
+ ans = ans.append(x+e.lx, y+e.ly)
+ x++
+ }
+ return ans
+}
+
+// start at (x,y), go down the diagonal as far as possible,
+func (e *editGraph) lookBackward(k, relx int) int {
+ rely := relx - (k + e.delta) // forward k = k + e.delta
+ x, y := relx+e.lx, rely+e.ly
+ if x > 0 && y > 0 {
+ x -= e.seqs.commonSuffixLen(0, x, 0, y)
+ }
+ return x
+}
+
+// convert to rectangle, and label the result with d
+func (e *editGraph) setBackward(d, k, relx int) {
+ x := e.lookBackward(k, relx)
+ e.vb.set(d, k, x-e.lx)
+}
+
+func (e *editGraph) getBackward(d, k int) int {
+ x := e.vb.get(d, k)
+ return x
+}
+
+// -- TWOSIDED ---
+
+func twosided(e *editGraph) lcs {
+ // The termination condition could be improved, as either the forward
+ // or backward pass could succeed before Myers' Lemma applies.
+ // Aside from questions of efficiency (is the extra testing cost-effective)
+ // this is more likely to matter when e.limit is reached.
+ e.setForward(0, 0, e.lx)
+ e.setBackward(0, 0, e.ux)
+
+ // from D to D+1
+ for D := 0; D < e.limit; D++ {
+ // just finished a backwards pass, so check
+ if got, ok := e.twoDone(D, D); ok {
+ return e.twolcs(D, D, got)
+ }
+ // do a forwards pass (D to D+1)
+ e.setForward(D+1, -(D + 1), e.getForward(D, -D))
+ e.setForward(D+1, D+1, e.getForward(D, D)+1)
+ for k := -D + 1; k <= D-1; k += 2 {
+ // these are tricky and easy to get backwards
+ lookv := e.lookForward(k, e.getForward(D, k-1)+1)
+ lookh := e.lookForward(k, e.getForward(D, k+1))
+ if lookv > lookh {
+ e.setForward(D+1, k, lookv)
+ } else {
+ e.setForward(D+1, k, lookh)
+ }
+ }
+ // just did a forward pass, so check
+ if got, ok := e.twoDone(D+1, D); ok {
+ return e.twolcs(D+1, D, got)
+ }
+ // do a backward pass, D to D+1
+ e.setBackward(D+1, -(D + 1), e.getBackward(D, -D)-1)
+ e.setBackward(D+1, D+1, e.getBackward(D, D))
+ for k := -D + 1; k <= D-1; k += 2 {
+ // these are tricky and easy to get wrong
+ lookv := e.lookBackward(k, e.getBackward(D, k-1))
+ lookh := e.lookBackward(k, e.getBackward(D, k+1)-1)
+ if lookv < lookh {
+ e.setBackward(D+1, k, lookv)
+ } else {
+ e.setBackward(D+1, k, lookh)
+ }
+ }
+ }
+
+ // D too large. combine a forward and backward partial lcs
+ // first, a forward one
+ kmax := -e.limit - 1
+ diagmax := -1
+ for k := -e.limit; k <= e.limit; k += 2 {
+ x := e.getForward(e.limit, k)
+ y := x - k
+ if x+y > diagmax && x <= e.ux && y <= e.uy {
+ diagmax, kmax = x+y, k
+ }
+ }
+ if kmax < -e.limit {
+ panic(fmt.Sprintf("no forward paths when limit=%d?", e.limit))
+ }
+ lcs := e.forwardlcs(e.limit, kmax)
+ // now a backward one
+ // find the D path with minimal x+y inside the rectangle and
+ // use that to compute the lcs
+ diagmin := 1 << 25 // infinity
+ for k := -e.limit; k <= e.limit; k += 2 {
+ x := e.getBackward(e.limit, k)
+ y := x - (k + e.delta)
+ if x+y < diagmin && x >= 0 && y >= 0 {
+ diagmin, kmax = x+y, k
+ }
+ }
+ if kmax < -e.limit {
+ panic(fmt.Sprintf("no backward paths when limit=%d?", e.limit))
+ }
+ lcs = append(lcs, e.backwardlcs(e.limit, kmax)...)
+ // These may overlap (e.forwardlcs and e.backwardlcs return sorted lcs)
+ ans := lcs.fix()
+ return ans
+}
+
+// Does Myers' Lemma apply?
+func (e *editGraph) twoDone(df, db int) (int, bool) {
+ if (df+db+e.delta)%2 != 0 {
+ return 0, false // diagonals cannot overlap
+ }
+ kmin := -db + e.delta
+ if -df > kmin {
+ kmin = -df
+ }
+ kmax := db + e.delta
+ if df < kmax {
+ kmax = df
+ }
+ for k := kmin; k <= kmax; k += 2 {
+ x := e.vf.get(df, k)
+ u := e.vb.get(db, k-e.delta)
+ if u <= x {
+ // is it worth looking at all the other k?
+ for l := k; l <= kmax; l += 2 {
+ x := e.vf.get(df, l)
+ y := x - l
+ u := e.vb.get(db, l-e.delta)
+ v := u - l
+ if x == u || u == 0 || v == 0 || y == e.uy || x == e.ux {
+ return l, true
+ }
+ }
+ return k, true
+ }
+ }
+ return 0, false
+}
+
+func (e *editGraph) twolcs(df, db, kf int) lcs {
+ // db==df || db+1==df
+ x := e.vf.get(df, kf)
+ y := x - kf
+ kb := kf - e.delta
+ u := e.vb.get(db, kb)
+ v := u - kf
+
+ // Myers proved there is a df-path from (0,0) to (u,v)
+ // and a db-path from (x,y) to (N,M).
+ // In the first case the overall path is the forward path
+ // to (u,v) followed by the backward path to (N,M).
+ // In the second case the path is the backward path to (x,y)
+ // followed by the forward path to (x,y) from (0,0).
+
+ // Look for some special cases to avoid computing either of these paths.
+ if x == u {
+ // "babaab" "cccaba"
+ // already patched together
+ lcs := e.forwardlcs(df, kf)
+ lcs = append(lcs, e.backwardlcs(db, kb)...)
+ return lcs.sort()
+ }
+
+ // is (u-1,v) or (u,v-1) labelled df-1?
+ // if so, that forward df-1-path plus a horizontal or vertical edge
+ // is the df-path to (u,v), then plus the db-path to (N,M)
+ if u > 0 && ok(df-1, u-1-v) && e.vf.get(df-1, u-1-v) == u-1 {
+ // "aabbab" "cbcabc"
+ lcs := e.forwardlcs(df-1, u-1-v)
+ lcs = append(lcs, e.backwardlcs(db, kb)...)
+ return lcs.sort()
+ }
+ if v > 0 && ok(df-1, (u-(v-1))) && e.vf.get(df-1, u-(v-1)) == u {
+ // "abaabb" "bcacab"
+ lcs := e.forwardlcs(df-1, u-(v-1))
+ lcs = append(lcs, e.backwardlcs(db, kb)...)
+ return lcs.sort()
+ }
+
+ // The path can't possibly contribute to the lcs because it
+ // is all horizontal or vertical edges
+ if u == 0 || v == 0 || x == e.ux || y == e.uy {
+ // "abaabb" "abaaaa"
+ if u == 0 || v == 0 {
+ return e.backwardlcs(db, kb)
+ }
+ return e.forwardlcs(df, kf)
+ }
+
+ // is (x+1,y) or (x,y+1) labelled db-1?
+ if x+1 <= e.ux && ok(db-1, x+1-y-e.delta) && e.vb.get(db-1, x+1-y-e.delta) == x+1 {
+ // "bababb" "baaabb"
+ lcs := e.backwardlcs(db-1, kb+1)
+ lcs = append(lcs, e.forwardlcs(df, kf)...)
+ return lcs.sort()
+ }
+ if y+1 <= e.uy && ok(db-1, x-(y+1)-e.delta) && e.vb.get(db-1, x-(y+1)-e.delta) == x {
+ // "abbbaa" "cabacc"
+ lcs := e.backwardlcs(db-1, kb-1)
+ lcs = append(lcs, e.forwardlcs(df, kf)...)
+ return lcs.sort()
+ }
+
+ // need to compute another path
+ // "aabbaa" "aacaba"
+ lcs := e.backwardlcs(db, kb)
+ oldx, oldy := e.ux, e.uy
+ e.ux = u
+ e.uy = v
+ lcs = append(lcs, forward(e)...)
+ e.ux, e.uy = oldx, oldy
+ return lcs.sort()
+}