diff options
author | Ben Gruver <bgruv@google.com> | 2014-02-01 14:43:54 -0800 |
---|---|---|
committer | Ben Gruver <bgruv@google.com> | 2014-02-01 14:43:54 -0800 |
commit | 3b5d84c3ae10f803e0281222e05eab31f8c9d2a6 (patch) | |
tree | 5ccc943108f77935a7b7166a1ad2a9abedf6ed2d /util | |
parent | dd2079cd53c94056436d8c7c26df801fb210df4d (diff) | |
download | smali-3b5d84c3ae10f803e0281222e05eab31f8c9d2a6.tar.gz |
Truncate filenames based on their utf-8 length
Diffstat (limited to 'util')
-rw-r--r-- | util/src/main/java/org/jf/util/ClassFileNameHandler.java | 95 | ||||
-rw-r--r-- | util/src/test/java/org/jf/util/ClassFileNameHandlerTest.java | 71 |
2 files changed, 154 insertions, 12 deletions
diff --git a/util/src/main/java/org/jf/util/ClassFileNameHandler.java b/util/src/main/java/org/jf/util/ClassFileNameHandler.java index aede10e8..7ac77352 100644 --- a/util/src/main/java/org/jf/util/ClassFileNameHandler.java +++ b/util/src/main/java/org/jf/util/ClassFileNameHandler.java @@ -33,7 +33,9 @@ import ds.tree.RadixTreeImpl; import javax.annotation.Nonnull; import java.io.*; +import java.nio.ByteBuffer; import java.nio.CharBuffer; +import java.nio.IntBuffer; import java.util.regex.Pattern; /** @@ -87,8 +89,9 @@ public class ClassFileNameHandler { packageElement += "#"; } - if (packageElement.length() > MAX_FILENAME_LENGTH) { - packageElement = shortenPathComponent(packageElement, MAX_FILENAME_LENGTH); + int utf8Length = utf8Length(packageElement); + if (utf8Length > MAX_FILENAME_LENGTH) { + packageElement = shortenPathComponent(packageElement, utf8Length - MAX_FILENAME_LENGTH); } packageElements[elementIndex++] = packageElement; @@ -109,8 +112,9 @@ public class ClassFileNameHandler { packageElement += "#"; } - if ((packageElement.length() + fileExtension.length()) > MAX_FILENAME_LENGTH) { - packageElement = shortenPathComponent(packageElement, MAX_FILENAME_LENGTH - fileExtension.length()); + int utf8Length = utf8Length(packageElement) + utf8Length(fileExtension); + if (utf8Length > MAX_FILENAME_LENGTH) { + packageElement = shortenPathComponent(packageElement, utf8Length - MAX_FILENAME_LENGTH); } packageElements[elementIndex] = packageElement; @@ -118,12 +122,87 @@ public class ClassFileNameHandler { return top.addUniqueChild(packageElements, 0); } + private static int utf8Length(String str) { + int utf8Length = 0; + int i=0; + while (i<str.length()) { + int c = str.codePointAt(i); + utf8Length += utf8Length(c); + i += Character.charCount(c); + } + return utf8Length; + } + + private static int utf8Length(int codePoint) { + if (codePoint < 0x80) { + return 1; + } else if (codePoint < 0x800) { + return 2; + } else if (codePoint < 0x10000) { + return 3; + } else { + return 4; + } + } + + /** + * Shortens an individual file/directory name, removing the necessary number of code points + * from the middle of the string such that the utf-8 encoding of the string is at least + * bytesToRemove bytes shorter than the original. + * + * The removed codePoints in the middle of the string will be replaced with a # character. + */ @Nonnull - static String shortenPathComponent(@Nonnull String pathComponent, int maxLength) { - int toRemove = pathComponent.length() - maxLength + 1; + static String shortenPathComponent(@Nonnull String pathComponent, int bytesToRemove) { + // We replace the removed part with a #, so we need to remove 1 extra char + bytesToRemove++; + + int[] codePoints; + try { + IntBuffer intBuffer = ByteBuffer.wrap(pathComponent.getBytes("UTF-32BE")).asIntBuffer(); + codePoints = new int[intBuffer.limit()]; + intBuffer.get(codePoints); + } catch (UnsupportedEncodingException ex) { + throw new RuntimeException(ex); + } + + int midPoint = codePoints.length/2; + int delta = 0; + + int firstEnd = midPoint; // exclusive + int secondStart = midPoint+1; // inclusive + int bytesRemoved = utf8Length(codePoints[midPoint]); + + // if we have an even number of codepoints, start by removing both middle characters, + // unless just removing the first already removes enough bytes + if (((codePoints.length % 2) == 0) && bytesRemoved < bytesToRemove) { + bytesRemoved += utf8Length(codePoints[secondStart]); + secondStart++; + } + + while ((bytesRemoved < bytesToRemove) && + (firstEnd > 0 || secondStart < codePoints.length)) { + if (firstEnd > 0) { + firstEnd--; + bytesRemoved += utf8Length(codePoints[firstEnd]); + } + + if (bytesRemoved < bytesToRemove && secondStart < codePoints.length) { + bytesRemoved += utf8Length(codePoints[secondStart]); + secondStart++; + } + } + + StringBuilder sb = new StringBuilder(); + for (int i=0; i<firstEnd; i++) { + sb.appendCodePoint(codePoints[i]); + } + sb.append('#'); + for (int i=secondStart; i<codePoints.length; i++) { + sb.appendCodePoint(codePoints[i]); + } - int firstIndex = (pathComponent.length()/2) - (toRemove/2); - return pathComponent.substring(0, firstIndex) + "#" + pathComponent.substring(firstIndex+toRemove); + return sb.toString(); } private static boolean testForWindowsReservedFileNames(File path) { diff --git a/util/src/test/java/org/jf/util/ClassFileNameHandlerTest.java b/util/src/test/java/org/jf/util/ClassFileNameHandlerTest.java index 7aa47adf..e3dfd154 100644 --- a/util/src/test/java/org/jf/util/ClassFileNameHandlerTest.java +++ b/util/src/test/java/org/jf/util/ClassFileNameHandlerTest.java @@ -34,16 +34,79 @@ package org.jf.util; import junit.framework.Assert; import org.junit.Test; +import java.nio.charset.Charset; + public class ClassFileNameHandlerTest { + private final Charset UTF8 = Charset.forName("UTF-8"); + + @Test + public void test1ByteEncodings() { + StringBuilder sb = new StringBuilder(); + for (int i=0; i<100; i++) { + sb.append((char)i); + } + + String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 5); + Assert.assertEquals(95, result.getBytes(UTF8).length); + Assert.assertEquals(95, result.length()); + } + @Test - public void testShortedPathComponent() { + public void test2ByteEncodings() { StringBuilder sb = new StringBuilder(); - for (int i=0; i<300; i++) { + for (int i=0x80; i<0x80+100; i++) { sb.append((char)i); } - String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 255); + // remove a total of 3 2-byte characters, and then add back in the 1-byte '#' + String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 4); + Assert.assertEquals(200, sb.toString().getBytes(UTF8).length); + Assert.assertEquals(195, result.getBytes(UTF8).length); + Assert.assertEquals(98, result.length()); + + // remove a total of 3 2-byte characters, and then add back in the 1-byte '#' + result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 5); + Assert.assertEquals(200, sb.toString().getBytes(UTF8).length); + Assert.assertEquals(195, result.getBytes(UTF8).length); + Assert.assertEquals(98, result.length()); + } + + @Test + public void test3ByteEncodings() { + StringBuilder sb = new StringBuilder(); + for (int i=0x800; i<0x800+100; i++) { + sb.append((char)i); + } + + // remove a total of 3 3-byte characters, and then add back in the 1-byte '#' + String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 6); + Assert.assertEquals(300, sb.toString().getBytes(UTF8).length); + Assert.assertEquals(292, result.getBytes(UTF8).length); + Assert.assertEquals(98, result.length()); + + // remove a total of 3 3-byte characters, and then add back in the 1-byte '#' + result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 7); + Assert.assertEquals(300, sb.toString().getBytes(UTF8).length); + Assert.assertEquals(292, result.getBytes(UTF8).length); + Assert.assertEquals(98, result.length()); + } + + public void test4ByteEncodings() { + StringBuilder sb = new StringBuilder(); + for (int i=0x10000; i<0x10000+100; i++) { + sb.appendCodePoint(i); + } + + // we remove 3 codepoints == 6 characters == 12 bytes, and then add back in the 1-byte '#' + String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 8); + Assert.assertEquals(400, sb.toString().getBytes(UTF8).length); + Assert.assertEquals(389, result.getBytes(UTF8).length); + Assert.assertEquals(98, result.length()); - Assert.assertEquals(255, result.length()); + // we remove 3 codepoints == 6 characters == 12 bytes, and then add back in the 1-byte '#' + result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 7); + Assert.assertEquals(400, sb.toString().getBytes(UTF8).length); + Assert.assertEquals(3892, result.getBytes(UTF8).length); + Assert.assertEquals(98, result.length()); } } |