about | summary | refs | log | tree | commit | diff
path: root/util
diff options
context:
space:
mode:
author Ben Gruver <bgruv@google.com> 2014-02-01 14:43:54 -0800
committer Ben Gruver <bgruv@google.com> 2014-02-01 14:43:54 -0800
commit 3b5d84c3ae10f803e0281222e05eab31f8c9d2a6 (patch)
tree 5ccc943108f77935a7b7166a1ad2a9abedf6ed2d /util
parent dd2079cd53c94056436d8c7c26df801fb210df4d (diff)
download smali-3b5d84c3ae10f803e0281222e05eab31f8c9d2a6.tar.gz
Truncate filenames based on their utf-8 length
Diffstat (limited to 'util')
-rw-r--r-- util/src/main/java/org/jf/util/ClassFileNameHandler.java 95
-rw-r--r-- util/src/test/java/org/jf/util/ClassFileNameHandlerTest.java 71
2 files changed, 154 insertions, 12 deletions
diff --git a/util/src/main/java/org/jf/util/ClassFileNameHandler.java b/util/src/main/java/org/jf/util/ClassFileNameHandler.java
index aede10e8..7ac77352 100644
--- a/util/src/main/java/org/jf/util/ClassFileNameHandler.java
+++ b/util/src/main/java/org/jf/util/ClassFileNameHandler.java
@@ -33,7 +33,9 @@ import ds.tree.RadixTreeImpl;
import javax.annotation.Nonnull;
import java.io.*;
+import java.nio.ByteBuffer;
import java.nio.CharBuffer;
+import java.nio.IntBuffer;
import java.util.regex.Pattern;
/**
@@ -87,8 +89,9 @@ public class ClassFileNameHandler {
packageElement += "#";
}
- if (packageElement.length() > MAX_FILENAME_LENGTH) {
- packageElement = shortenPathComponent(packageElement, MAX_FILENAME_LENGTH);
+ int utf8Length = utf8Length(packageElement);
+ if (utf8Length > MAX_FILENAME_LENGTH) {
+ packageElement = shortenPathComponent(packageElement, utf8Length - MAX_FILENAME_LENGTH);
}
packageElements[elementIndex++] = packageElement;
@@ -109,8 +112,9 @@ public class ClassFileNameHandler {
packageElement += "#";
}
- if ((packageElement.length() + fileExtension.length()) > MAX_FILENAME_LENGTH) {
- packageElement = shortenPathComponent(packageElement, MAX_FILENAME_LENGTH - fileExtension.length());
+ int utf8Length = utf8Length(packageElement) + utf8Length(fileExtension);
+ if (utf8Length > MAX_FILENAME_LENGTH) {
+ packageElement = shortenPathComponent(packageElement, utf8Length - MAX_FILENAME_LENGTH);
}
packageElements[elementIndex] = packageElement;
@@ -118,12 +122,87 @@ public class ClassFileNameHandler {
return top.addUniqueChild(packageElements, 0);
}
+    /**
+     * Sums the utf-8 encoded length of every code point in the given string.
+     *
+     * @param str the string to measure
+     * @return the total of the per-code-point utf-8 lengths, in bytes
+     */
+    private static int utf8Length(String str) {
+        int totalBytes = 0;
+        // Step by code point rather than by char, so a surrogate pair is counted once
+        for (int index = 0; index < str.length(); ) {
+            int codePoint = str.codePointAt(index);
+            totalBytes += utf8Length(codePoint);
+            index += Character.charCount(codePoint);
+        }
+        return totalBytes;
+    }
+
+    /**
+     * Returns the number of bytes used for a single code point: 1 below 0x80, 2 below 0x800,
+     * 3 below 0x10000, and 4 otherwise.
+     *
+     * @param codePoint the code point to measure
+     * @return a length between 1 and 4
+     */
+    private static int utf8Length(int codePoint) {
+        if (codePoint >= 0x10000) {
+            return 4;
+        }
+        if (codePoint >= 0x800) {
+            return 3;
+        }
+        if (codePoint >= 0x80) {
+            return 2;
+        }
+        return 1;
+    }
+
+    /**
+     * Shortens an individual file/directory name, removing the necessary number of code points
+     * from the middle of the string such that the utf-8 encoding of the string is at least
+     * bytesToRemove bytes shorter than the original.
+     *
+     * The removed code points in the middle of the string are replaced with a single # character.
+     * If the whole string is consumed before enough bytes have been removed, the result is just
+     * "#". An empty pathComponent has nothing to remove and is returned unchanged.
+     *
+     * @param pathComponent the file or directory name to shorten
+     * @param bytesToRemove the minimum number of utf-8 bytes by which the name should shrink
+     * @return the shortened name
+     */
     @Nonnull
-    static String shortenPathComponent(@Nonnull String pathComponent, int maxLength) {
-        int toRemove = pathComponent.length() - maxLength + 1;
+    static String shortenPathComponent(@Nonnull String pathComponent, int bytesToRemove) {
+        // There is no middle code point to start from in an empty name
+        if (pathComponent.isEmpty()) {
+            return pathComponent;
+        }
+
+        // We replace the removed part with a #, so we need to remove 1 extra byte
+        bytesToRemove++;
+
+        // Decode to one int per code point, so surrogate pairs are handled as a single unit
+        int[] codePoints;
+        try {
+            IntBuffer intBuffer = ByteBuffer.wrap(pathComponent.getBytes("UTF-32BE")).asIntBuffer();
+            codePoints = new int[intBuffer.limit()];
+            intBuffer.get(codePoints);
+        } catch (UnsupportedEncodingException ex) {
+            throw new RuntimeException(ex);
+        }
+
+        int midPoint = codePoints.length/2;
+
+        int firstEnd = midPoint; // exclusive
+        int secondStart = midPoint+1; // inclusive
+        int bytesRemoved = utf8Length(codePoints[midPoint]);
+
+        // if we have an even number of codepoints, start by also removing the codepoint that
+        // follows the midpoint, unless just removing the first already removes enough bytes.
+        // The bounds check is needed for a 2 codepoint name, where midPoint is the last index.
+        if (((codePoints.length % 2) == 0) && bytesRemoved < bytesToRemove
+                && secondStart < codePoints.length) {
+            bytesRemoved += utf8Length(codePoints[secondStart]);
+            secondStart++;
+        }
+
+        // Grow the removed region outwards, alternating between the left and right side, until
+        // enough bytes are gone or there is nothing left to remove
+        while ((bytesRemoved < bytesToRemove) &&
+                (firstEnd > 0 || secondStart < codePoints.length)) {
+            if (firstEnd > 0) {
+                firstEnd--;
+                bytesRemoved += utf8Length(codePoints[firstEnd]);
+            }
+
+            if (bytesRemoved < bytesToRemove && secondStart < codePoints.length) {
+                bytesRemoved += utf8Length(codePoints[secondStart]);
+                secondStart++;
+            }
+        }
+
+        // Reassemble: kept prefix, the '#' placeholder, kept suffix
+        StringBuilder sb = new StringBuilder();
+        for (int i=0; i<firstEnd; i++) {
+            sb.appendCodePoint(codePoints[i]);
+        }
+        sb.append('#');
+        for (int i=secondStart; i<codePoints.length; i++) {
+            sb.appendCodePoint(codePoints[i]);
+        }
-        int firstIndex = (pathComponent.length()/2) - (toRemove/2);
-        return pathComponent.substring(0, firstIndex) + "#" + pathComponent.substring(firstIndex+toRemove);
+        return sb.toString();
     }
private static boolean testForWindowsReservedFileNames(File path) {
diff --git a/util/src/test/java/org/jf/util/ClassFileNameHandlerTest.java b/util/src/test/java/org/jf/util/ClassFileNameHandlerTest.java
index 7aa47adf..e3dfd154 100644
--- a/util/src/test/java/org/jf/util/ClassFileNameHandlerTest.java
+++ b/util/src/test/java/org/jf/util/ClassFileNameHandlerTest.java
@@ -34,16 +34,79 @@ package org.jf.util;
import junit.framework.Assert;
import org.junit.Test;
+import java.nio.charset.Charset;
+
public class ClassFileNameHandlerTest {
+ private final Charset UTF8 = Charset.forName("UTF-8");
+
+    @Test
+    public void test1ByteEncodings() {
+        // 100 characters, U+0000 through U+0063
+        StringBuilder input = new StringBuilder();
+        for (char ch = 0; ch < 100; ch++) {
+            input.append(ch);
+        }
+
+        String shortened = ClassFileNameHandler.shortenPathComponent(input.toString(), 5);
+        byte[] encoded = shortened.getBytes(UTF8);
+
+        // the result is expected to be 5 bytes and 5 characters shorter than the 100 we started with
+        Assert.assertEquals(95, encoded.length);
+        Assert.assertEquals(95, shortened.length());
+    }
+
+ // Exercises shortenPathComponent on 100 characters starting at U+0080; the assertions below
+ // expect these to take 200 utf-8 bytes in total, i.e. 2 bytes each.
@Test
- public void testShortedPathComponent() {
+ public void test2ByteEncodings() {
StringBuilder sb = new StringBuilder();
- for (int i=0; i<300; i++) {
+ for (int i=0x80; i<0x80+100; i++) {
sb.append((char)i);
}
- String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 255);
+ // asking for 4 bytes means 5 must go (1 extra to make room for the '#'): we remove a total
+ // of 3 2-byte characters, and then add back in the 1-byte '#', so 200 - 6 + 1 = 195
+ String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 4);
+ Assert.assertEquals(200, sb.toString().getBytes(UTF8).length);
+ Assert.assertEquals(195, result.getBytes(UTF8).length);
+ Assert.assertEquals(98, result.length());
+
+ // asking for 5 bytes means 6 must go, which is still exactly 3 2-byte characters; the
+ // 1-byte '#' is then added back in
+ result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 5);
+ Assert.assertEquals(200, sb.toString().getBytes(UTF8).length);
+ Assert.assertEquals(195, result.getBytes(UTF8).length);
+ Assert.assertEquals(98, result.length());
+ }
+
+    @Test
+    public void test3ByteEncodings() {
+        // 100 consecutive characters starting at U+0800
+        StringBuilder builder = new StringBuilder();
+        for (char ch = 0x800; ch < 0x800 + 100; ch++) {
+            builder.append(ch);
+        }
+
+        // 3 3-byte characters are removed in total, and the 1-byte '#' is added back in
+        String shortened = ClassFileNameHandler.shortenPathComponent(builder.toString(), 6);
+        Assert.assertEquals(300, builder.toString().getBytes(UTF8).length);
+        Assert.assertEquals(292, shortened.getBytes(UTF8).length);
+        Assert.assertEquals(98, shortened.length());
+
+        // one more requested byte still comes out as 3 3-byte characters removed, plus the '#'
+        shortened = ClassFileNameHandler.shortenPathComponent(builder.toString(), 7);
+        Assert.assertEquals(300, builder.toString().getBytes(UTF8).length);
+        Assert.assertEquals(292, shortened.getBytes(UTF8).length);
+        Assert.assertEquals(98, shortened.length());
+    }
+
+    // Exercises shortenPathComponent on 100 supplementary code points starting at U+10000.
+    // Each one is a surrogate pair (2 chars) in the String and 4 bytes in utf-8.
+    @Test
+    public void test4ByteEncodings() {
+        StringBuilder sb = new StringBuilder();
+        for (int i=0x10000; i<0x10000+100; i++) {
+            sb.appendCodePoint(i);
+        }
+
+        // asking for 8 bytes means 9 must go (1 extra for the '#'): we remove
+        // 3 codepoints == 6 characters == 12 bytes, and then add back in the 1-byte '#'
+        String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 8);
+        Assert.assertEquals(400, sb.toString().getBytes(UTF8).length);
+        Assert.assertEquals(389, result.getBytes(UTF8).length);
+        // 200 chars - 6 removed + 1 for the '#'
+        Assert.assertEquals(195, result.length());
-        Assert.assertEquals(255, result.length());
+        // asking for 7 bytes means 8 must go: we remove
+        // 2 codepoints == 4 characters == 8 bytes, and then add back in the 1-byte '#'
+        result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 7);
+        Assert.assertEquals(400, sb.toString().getBytes(UTF8).length);
+        Assert.assertEquals(393, result.getBytes(UTF8).length);
+        // 200 chars - 4 removed + 1 for the '#'
+        Assert.assertEquals(197, result.length());
     }
}