aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoe Tsai <joetsai@digital-static.net>2021-05-24 19:50:45 -0700
committerGitHub <noreply@github.com>2021-05-24 19:50:45 -0700
commitd103655696d8ae43c4125ee61454dbf03d8e8324 (patch)
tree098522dcb65980569606ba11b7c0b27bc982b660
parent9181d1e7c97613e12da2beec65ca4bd834a7a7da (diff)
downloadgo-cmp-d103655696d8ae43c4125ee61454dbf03d8e8324.tar.gz
Print as text if mostly text (#258)
The previous heuristic of treating strings as binary data if it contains any invalid UTF-8 was too strict. Loosen the heuristic to check if most of the characters are printable text. Fixes #257
-rw-r--r--cmp/compare_test.go5
-rw-r--r--cmp/report_slices.go35
-rw-r--r--cmp/testdata/diffs19
3 files changed, 42 insertions, 17 deletions
diff --git a/cmp/compare_test.go b/cmp/compare_test.go
index baae591..649c917 100644
--- a/cmp/compare_test.go
+++ b/cmp/compare_test.go
@@ -1308,6 +1308,11 @@ using the AllowUnexported option.`, "\n"),
y: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa _value=2 11\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb _value=2 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc _value=1 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd _value=3 31\torg-4747474747474747,bucket-4242424242424242:m,tag1=c _value=4 41\t",
reason: "leading/trailing equal spans should not appear in diff lines",
}, {
+ label: label + "/MostlyTextString",
+ x: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa,\xff=_value _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb,\xff=_value _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc,\xff=_value _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd,\xff=_value _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1=c,\xff=_value _value=4 41\n",
+ y: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1=c _value=4 41\n",
+ reason: "the presence of a few invalid UTF-8 characters should not prevent printing this as text",
+ }, {
label: label + "/AllLinesDiffer",
x: "d5c14bdf6bac81c27afc5429500ed750\n25483503b557c606dad4f144d27ae10b\n90bdbcdbb6ea7156068e3dcfb7459244\n978f480a6e3cced51e297fbff9a506b7\n",
y: "Xd5c14bdf6bac81c27afc5429500ed750\nX25483503b557c606dad4f144d27ae10b\nX90bdbcdbb6ea7156068e3dcfb7459244\nX978f480a6e3cced51e297fbff9a506b7\n",
diff --git a/cmp/report_slices.go b/cmp/report_slices.go
index bd8ca15..2ad3bc8 100644
--- a/cmp/report_slices.go
+++ b/cmp/report_slices.go
@@ -7,6 +7,7 @@ package cmp
import (
"bytes"
"fmt"
+ "math"
"reflect"
"strconv"
"strings"
@@ -96,16 +97,16 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
}
// Auto-detect the type of the data.
- var isLinedText, isText, isBinary bool
var sx, sy string
var ssx, ssy []string
+ var isString, isMostlyText, isPureLinedText, isBinary bool
switch {
case t.Kind() == reflect.String:
sx, sy = vx.String(), vy.String()
- isText = true // Initial estimate, verify later
+ isString = true
case t.Kind() == reflect.Slice && t.Elem() == reflect.TypeOf(byte(0)):
sx, sy = string(vx.Bytes()), string(vy.Bytes())
- isBinary = true // Initial estimate, verify later
+ isString = true
case t.Kind() == reflect.Array:
// Arrays need to be addressable for slice operations to work.
vx2, vy2 := reflect.New(t).Elem(), reflect.New(t).Elem()
@@ -113,13 +114,12 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
vy2.Set(vy)
vx, vy = vx2, vy2
}
- if isText || isBinary {
- var numLines, lastLineIdx, maxLineLen int
- isBinary = !utf8.ValidString(sx) || !utf8.ValidString(sy)
+ if isString {
+ var numTotalRunes, numValidRunes, numLines, lastLineIdx, maxLineLen int
for i, r := range sx + sy {
- if !(unicode.IsPrint(r) || unicode.IsSpace(r)) || r == utf8.RuneError {
- isBinary = true
- break
+ numTotalRunes++
+ if (unicode.IsPrint(r) || unicode.IsSpace(r)) && r != utf8.RuneError {
+ numValidRunes++
}
if r == '\n' {
if maxLineLen < i-lastLineIdx {
@@ -129,12 +129,14 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
numLines++
}
}
- isText = !isBinary
- isLinedText = isText && numLines >= 4 && maxLineLen <= 1024
+ isPureText := numValidRunes == numTotalRunes
+ isMostlyText = float64(numValidRunes) > math.Floor(0.90*float64(numTotalRunes))
+ isPureLinedText = isPureText && numLines >= 4 && maxLineLen <= 1024
+ isBinary = !isMostlyText
// Avoid diffing by lines if it produces a significantly more complex
// edit script than diffing by bytes.
- if isLinedText {
+ if isPureLinedText {
ssx = strings.Split(sx, "\n")
ssy = strings.Split(sy, "\n")
esLines := diff.Difference(len(ssx), len(ssy), func(ix, iy int) diff.Result {
@@ -145,7 +147,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
})
efficiencyLines := float64(esLines.Dist()) / float64(len(esLines))
efficiencyBytes := float64(esBytes.Dist()) / float64(len(esBytes))
- isLinedText = efficiencyLines < 4*efficiencyBytes
+ isPureLinedText = efficiencyLines < 4*efficiencyBytes
}
}
@@ -155,7 +157,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
switch {
// If the text appears to be multi-lined text,
// then perform differencing across individual lines.
- case isLinedText:
+ case isPureLinedText:
list = opts.formatDiffSlice(
reflect.ValueOf(ssx), reflect.ValueOf(ssy), 1, "line",
func(v reflect.Value, d diffMode) textRecord {
@@ -244,7 +246,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
// If the text appears to be single-lined text,
// then perform differencing in approximately fixed-sized chunks.
// The output is printed as quoted strings.
- case isText:
+ case isMostlyText:
list = opts.formatDiffSlice(
reflect.ValueOf(sx), reflect.ValueOf(sy), 64, "byte",
func(v reflect.Value, d diffMode) textRecord {
@@ -252,7 +254,6 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
return textRecord{Diff: d, Value: textLine(s)}
},
)
- delim = ""
// If the text appears to be binary data,
// then perform differencing in approximately fixed-sized chunks.
@@ -314,7 +315,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
// Wrap the output with appropriate type information.
var out textNode = &textWrap{Prefix: "{", Value: list, Suffix: "}"}
- if !isText {
+ if !isMostlyText {
// The "{...}" byte-sequence literal is not valid Go syntax for strings.
// Emit the type for extra clarity (e.g. "string{...}").
if t.Kind() == reflect.String {
diff --git a/cmp/testdata/diffs b/cmp/testdata/diffs
index c02df29..a3d5909 100644
--- a/cmp/testdata/diffs
+++ b/cmp/testdata/diffs
@@ -1065,6 +1065,25 @@
` _value=4 41 `,
}, "")
>>> TestDiff/Reporter/SurroundingEqualElements
+<<< TestDiff/Reporter/MostlyTextString
+ strings.Join({
+ "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa",
+- ",\xff=_value",
+ " _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1",
+ "=a,tag2=bb",
+- ",\xff=_value",
+ " _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1",
+ "=b,tag2=cc",
+- ",\xff=_value",
+ " _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1",
+ "=a,tag2=dd",
+- ",\xff=_value",
+ " _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1",
+ "=c",
+- ",\xff=_value",
+ " _value=4 41\n",
+ }, "")
+>>> TestDiff/Reporter/MostlyTextString
<<< TestDiff/Reporter/AllLinesDiffer
strings.Join({
+ "X",