summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Markus Scherer <markus.icu@gmail.com> 2017-02-17 21:03:35 +0000
committer: Joachim Sauer <jsauer@google.com> 2017-04-04 23:32:27 +0100
commit: 4dc01285b26769ed1abfcea2bff97fd9e99ccc61 (patch)
tree: 8e9b8ac1d6aee56613b178b98353fa5a42c70f37
parent: 2a96d553055c7ac571d0db719bbe5c81ab03abaa (diff)
download: icu-4dc01285b26769ed1abfcea2bff97fd9e99ccc61.tar.gz
Cherry-pick: ticket:12410: class Edits, class CaseMap with new low-level functions that work with Edits, simpler case properties code, some cleanup
This is the part of ICU changeset 39684 that affects the icu4j/ subdirectory,
leaving the icu4c/ subdirectory unchanged:
http://bugs.icu-project.org/trac/changeset/39684

Bug: 19047649
Test: mmma libcore external/icu
Test: ant check
Test: CtsIcuTestCases
Test: CtsLibcoreOjTestCases
Test: CtsLibcoreTestCases

(cherry picked from commit 63cafec8b8cb135e7c06ef6b9fc8c128ed55b140)

Change-Id: I2280e0376253abe1af6671a02c9b1d056c099949
-rw-r--r-- icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java (renamed from icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMap.java) | 340
-rw-r--r-- icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java | 263
-rw-r--r-- icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java | 288
-rw-r--r-- icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java | 339
-rw-r--r-- icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java | 494
-rw-r--r-- icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java | 8
-rw-r--r-- icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java | 7
-rw-r--r-- icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java | 9
-rw-r--r-- icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java | 7
-rw-r--r-- icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java | 189
10 files changed, 1589 insertions, 355 deletions
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMap.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java
index 0d1c259b9..f28e60ed5 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMap.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java
@@ -2,9 +2,14 @@
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl;
-import com.ibm.icu.util.ULocale;
+import java.io.IOException;
-public final class CaseMap {
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.Edits;
+import com.ibm.icu.util.ICUUncheckedIOException;
+
+public final class CaseMapImpl {
/**
* Implementation of UCaseProps.ContextIterator, iterates over a String.
* See ustrcase.c/utf16_caseContextIterator().
@@ -12,11 +17,11 @@ public final class CaseMap {
public static final class StringContextIterator implements UCaseProps.ContextIterator {
/**
* Constructor.
- * @param s String to iterate over.
+ * @param src String to iterate over.
*/
- public StringContextIterator(String s) {
- this.s=s;
- limit=s.length();
+ public StringContextIterator(CharSequence src) {
+ this.s=src;
+ limit=src.length();
cpStart=cpLimit=index=0;
dir=0;
}
@@ -60,7 +65,7 @@ public final class CaseMap {
public int nextCaseMapCP() {
cpStart=cpLimit;
if(cpLimit<limit) {
- int c=s.codePointAt(cpLimit);
+ int c=Character.codePointAt(s, cpLimit);
cpLimit+=Character.charCount(c);
return c;
} else {
@@ -84,6 +89,10 @@ public final class CaseMap {
return cpLimit;
}
+ public int getCPLength() {
+ return cpLimit-cpStart;
+ }
+
// implement UCaseProps.ContextIterator
// The following code is not used anywhere in this private class
@Override
@@ -108,11 +117,11 @@ public final class CaseMap {
int c;
if(dir>0 && index<s.length()) {
- c=s.codePointAt(index);
+ c=Character.codePointAt(s, index);
index+=Character.charCount(c);
return c;
} else if(dir<0 && index>0) {
- c=s.codePointBefore(index);
+ c=Character.codePointBefore(s, index);
index-=Character.charCount(c);
return c;
}
@@ -120,44 +129,242 @@ public final class CaseMap {
}
// variables
- protected String s;
+ protected CharSequence s;
protected int index, limit, cpStart, cpLimit;
protected int dir; // 0=initial state >0=forward <0=backward
}
- /** Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}. */
- private static final void appendResult(int c, StringBuilder result) {
+ /**
+ * Omit unchanged text when case-mapping with Edits.
+ */
+ public static final int OMIT_UNCHANGED_TEXT = 0x4000;
+
+ private static int appendCodePoint(Appendable a, int c) throws IOException {
+ if (c <= Character.MAX_VALUE) {
+ a.append((char)c);
+ return 1;
+ } else {
+ a.append((char)(0xd7c0 + (c >> 10)));
+ a.append((char)(Character.MIN_LOW_SURROGATE + (c & 0x3ff)));
+ return 2;
+ }
+ }
+
+ /**
+ * Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}.
+ * @throws IOException
+ */
+ private static void appendResult(int result, Appendable dest,
+ int cpLength, int options, Edits edits) throws IOException {
// Decode the result.
- if (c < 0) {
+ if (result < 0) {
// (not) original code point
- result.appendCodePoint(~c);
- } else if (c <= UCaseProps.MAX_STRING_LENGTH) {
+ if (edits != null) {
+ edits.addUnchanged(cpLength);
+ if ((options & OMIT_UNCHANGED_TEXT) != 0) {
+ return;
+ }
+ }
+ appendCodePoint(dest, ~result);
+ } else if (result <= UCaseProps.MAX_STRING_LENGTH) {
// The mapping has already been appended to result.
+ if (edits != null) {
+ edits.addReplace(cpLength, result);
+ }
} else {
// Append the single-code point mapping.
- result.appendCodePoint(c);
+ int length = appendCodePoint(dest, result);
+ if (edits != null) {
+ edits.addReplace(cpLength, length);
+ }
}
}
- // TODO: Move the other string case mapping functions from UCharacter to here, too.
+ private static final void appendUnchanged(CharSequence src, int start, int length,
+ Appendable dest, int options, Edits edits) throws IOException {
+ if (length > 0) {
+ if (edits != null) {
+ edits.addUnchanged(length);
+ if ((options & OMIT_UNCHANGED_TEXT) != 0) {
+ return;
+ }
+ }
+ dest.append(src, start, start + length);
+ }
+ }
- public static String toUpper(ULocale locale, String str) {
- if (locale == null) {
- locale = ULocale.getDefault();
+ private static void internalToLower(int caseLocale, int options, StringContextIterator iter,
+ Appendable dest, Edits edits) throws IOException {
+ int c;
+ while ((c = iter.nextCaseMapCP()) >= 0) {
+ c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
+ appendResult(c, dest, iter.getCPLength(), options, edits);
}
- int[] locCache = new int[] { UCaseProps.getCaseLocale(locale, null) };
- if (locCache[0] == UCaseProps.LOC_GREEK) {
- return GreekUpper.toUpper(str, locCache);
+ }
+
+ public static <A extends Appendable> A toLower(int caseLocale, int options,
+ CharSequence src, A dest, Edits edits) {
+ try {
+ if (edits != null) {
+ edits.reset();
+ }
+ StringContextIterator iter = new StringContextIterator(src);
+ internalToLower(caseLocale, options, iter, dest, edits);
+ return dest;
+ } catch (IOException e) {
+ throw new ICUUncheckedIOException(e);
}
+ }
- StringContextIterator iter = new StringContextIterator(str);
- StringBuilder result = new StringBuilder(str.length());
- int c;
- while((c=iter.nextCaseMapCP())>=0) {
- c = UCaseProps.INSTANCE.toFullUpper(c, iter, result, locale, locCache);
- appendResult(c, result);
+ public static <A extends Appendable> A toUpper(int caseLocale, int options,
+ CharSequence src, A dest, Edits edits) {
+ try {
+ if (edits != null) {
+ edits.reset();
+ }
+ if (caseLocale == UCaseProps.LOC_GREEK) {
+ return GreekUpper.toUpper(options, src, dest, edits);
+ }
+ StringContextIterator iter = new StringContextIterator(src);
+ int c;
+ while ((c = iter.nextCaseMapCP()) >= 0) {
+ c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale);
+ appendResult(c, dest, iter.getCPLength(), options, edits);
+ }
+ return dest;
+ } catch (IOException e) {
+ throw new ICUUncheckedIOException(e);
+ }
+ }
+
+ public static <A extends Appendable> A toTitle(
+ int caseLocale, int options, BreakIterator titleIter,
+ CharSequence src, A dest, Edits edits) {
+ try {
+ if (edits != null) {
+ edits.reset();
+ }
+
+ /* set up local variables */
+ StringContextIterator iter = new StringContextIterator(src);
+ int srcLength = src.length();
+ int prev=0;
+ boolean isFirstIndex=true;
+
+ /* titlecasing loop */
+ while(prev<srcLength) {
+ /* find next index where to titlecase */
+ int index;
+ if(isFirstIndex) {
+ isFirstIndex=false;
+ index=titleIter.first();
+ } else {
+ index=titleIter.next();
+ }
+ if(index==BreakIterator.DONE || index>srcLength) {
+ index=srcLength;
+ }
+
+ /*
+ * Unicode 4 & 5 section 3.13 Default Case Operations:
+ *
+ * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
+ * #29, "Text Boundaries." Between each pair of word boundaries, find the first
+ * cased character F. If F exists, map F to default_title(F); then map each
+ * subsequent character C to default_lower(C).
+ *
+ * In this implementation, segment [prev..index[ into 3 parts:
+ * a) uncased characters (copy as-is) [prev..titleStart[
+ * b) first case letter (titlecase) [titleStart..titleLimit[
+ * c) subsequent characters (lowercase) [titleLimit..index[
+ */
+ if(prev<index) {
+ // find and copy uncased characters [prev..titleStart[
+ int titleStart=prev;
+ iter.setLimit(index);
+ int c=iter.nextCaseMapCP();
+ if((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0
+ && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {
+ // Adjust the titlecasing index (titleStart) to the next cased character.
+ while((c=iter.nextCaseMapCP())>=0
+ && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {}
+ // If c<0 then we have only uncased characters in [prev..index[
+ // and stopped with titleStart==titleLimit==index.
+ titleStart=iter.getCPStart();
+ appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
+ }
+
+ if(titleStart<index) {
+ int titleLimit=iter.getCPLimit();
+ // titlecase c which is from [titleStart..titleLimit[
+ c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale);
+ appendResult(c, dest, iter.getCPLength(), options, edits);
+
+ // Special case Dutch IJ titlecasing
+ if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) {
+ char c1 = src.charAt(titleStart);
+ if ((c1 == 'i' || c1 == 'I')) {
+ char c2 = src.charAt(titleStart+1);
+ if (c2 == 'j') {
+ dest.append('J');
+ if (edits != null) {
+ edits.addReplace(1, 1);
+ }
+ c = iter.nextCaseMapCP();
+ titleLimit++;
+ assert c == c2;
+ assert titleLimit == iter.getCPLimit();
+ } else if (c2 == 'J') {
+ // Keep the capital J from getting lowercased.
+ appendUnchanged(src, titleStart + 1, 1, dest, options, edits);
+ c = iter.nextCaseMapCP();
+ titleLimit++;
+ assert c == c2;
+ assert titleLimit == iter.getCPLimit();
+ }
+ }
+ }
+
+ // lowercase [titleLimit..index[
+ if(titleLimit<index) {
+ if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) {
+ // Normal operation: Lowercase the rest of the word.
+ internalToLower(caseLocale, options, iter, dest, edits);
+ } else {
+ // Optionally just copy the rest of the word unchanged.
+ appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits);
+ iter.moveToLimit();
+ }
+ }
+ }
+ }
+
+ prev=index;
+ }
+ return dest;
+ } catch (IOException e) {
+ throw new ICUUncheckedIOException(e);
+ }
+ }
+
+ public static <A extends Appendable> A fold(int options,
+ CharSequence src, A dest, Edits edits) {
+ try {
+ if (edits != null) {
+ edits.reset();
+ }
+ int length = src.length();
+ for (int i = 0; i < length;) {
+ int c = Character.codePointAt(src, i);
+ int cpLength = Character.charCount(c);
+ i += cpLength;
+ c = UCaseProps.INSTANCE.toFullFolding(c, dest, options);
+ appendResult(c, dest, cpLength, options, edits);
+ }
+ return dest;
+ } catch (IOException e) {
+ throw new ICUUncheckedIOException(e);
}
- return result.toString();
}
private static final class GreekUpper {
@@ -661,12 +868,13 @@ public final class CaseMap {
* TODO: Try to re-consolidate one way or another with the non-Greek function.
*
* <p>Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8).
+ * @throws IOException
*/
- private static String toUpper(CharSequence s, int[] locCache) {
- StringBuilder result = new StringBuilder(s.length());
+ private static <A extends Appendable> A toUpper(int options,
+ CharSequence src, A dest, Edits edits) throws IOException {
int state = 0;
- for (int i = 0; i < s.length();) {
- int c = Character.codePointAt(s, i);
+ for (int i = 0; i < src.length();) {
+ int c = Character.codePointAt(src, i);
int nextIndex = i + Character.charCount(c);
int nextState = 0;
int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
@@ -695,8 +903,8 @@ public final class CaseMap {
numYpogegrammeni = 1;
}
// Skip combining diacritics after this Greek letter.
- while (nextIndex < s.length()) {
- int diacriticData = getDiacriticData(s.charAt(nextIndex));
+ while (nextIndex < src.length()) {
+ int diacriticData = getDiacriticData(src.charAt(nextIndex));
if (diacriticData != 0) {
data |= diacriticData;
if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
@@ -716,7 +924,7 @@ public final class CaseMap {
(data & HAS_ACCENT) != 0 &&
numYpogegrammeni == 0 &&
(state & AFTER_CASED) == 0 &&
- !isFollowedByCasedLetter(s, nextIndex)) {
+ !isFollowedByCasedLetter(src, nextIndex)) {
// Keep disjunctive "or" with (only) a tonos.
// We use the same "word boundary" conditions as for the Final_Sigma test.
if (i == nextIndex) {
@@ -734,25 +942,59 @@ public final class CaseMap {
data &= ~HAS_EITHER_DIALYTIKA;
}
}
- result.appendCodePoint(upper);
- if ((data & HAS_EITHER_DIALYTIKA) != 0) {
- result.append('\u0308'); // restore or add a dialytika
- }
- if (addTonos) {
- result.append('\u0301');
+
+ boolean change;
+ if (edits == null) {
+ change = true; // common, simple usage
+ } else {
+ // Find out first whether we are changing the text.
+ change = src.charAt(i) != upper || numYpogegrammeni > 0;
+ int i2 = i + 1;
+ if ((data & HAS_EITHER_DIALYTIKA) != 0) {
+ change |= i2 >= nextIndex || src.charAt(i2) != 0x308;
+ ++i2;
+ }
+ if (addTonos) {
+ change |= i2 >= nextIndex || src.charAt(i2) != 0x301;
+ ++i2;
+ }
+ int oldLength = nextIndex - i;
+ int newLength = (i2 - i) + numYpogegrammeni;
+ change |= oldLength != newLength;
+ if (change) {
+ if (edits != null) {
+ edits.addReplace(oldLength, newLength);
+ }
+ } else {
+ if (edits != null) {
+ edits.addUnchanged(oldLength);
+ }
+ // Write unchanged text?
+ change = (options & OMIT_UNCHANGED_TEXT) == 0;
+ }
}
- while (numYpogegrammeni > 0) {
- result.append('Ι');
- --numYpogegrammeni;
+
+ if (change) {
+ dest.append((char)upper);
+ if ((data & HAS_EITHER_DIALYTIKA) != 0) {
+ dest.append('\u0308'); // restore or add a dialytika
+ }
+ if (addTonos) {
+ dest.append('\u0301');
+ }
+ while (numYpogegrammeni > 0) {
+ dest.append('Ι');
+ --numYpogegrammeni;
+ }
}
} else {
- c = UCaseProps.INSTANCE.toFullUpper(c, null, result, null, locCache);
- appendResult(c, result);
+ c = UCaseProps.INSTANCE.toFullUpper(c, null, dest, UCaseProps.LOC_GREEK);
+ appendResult(c, dest, nextIndex - i, options, edits);
}
i = nextIndex;
state = nextState;
}
- return result.toString();
+ return dest;
}
}
}
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java
index 927cdc03c..6b5619d23 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java
@@ -24,6 +24,7 @@ package com.ibm.icu.impl;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Iterator;
+import java.util.Locale;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
@@ -71,7 +72,7 @@ public final class UCaseProps {
// read exceptions[]
count=indexes[IX_EXC_LENGTH];
if(count>0) {
- exceptions=ICUBinary.getChars(bytes, count, 0);
+ exceptions=ICUBinary.getString(bytes, count, 0);
}
// read unfold[]
@@ -150,7 +151,7 @@ public final class UCaseProps {
*
* @param excWord (in) initial exceptions word
* @param index (in) desired slot index
- * @param excOffset (in) offset into exceptions[] after excWord=exceptions[excOffset++];
+ * @param excOffset (in) offset into exceptions[] after excWord=exceptions.charAt(excOffset++);
* @return bits 31..0: slot value
* 63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot
*/
@@ -158,11 +159,11 @@ public final class UCaseProps {
long value;
if((excWord&EXC_DOUBLE_SLOTS)==0) {
excOffset+=slotOffset(excWord, index);
- value=exceptions[excOffset];
+ value=exceptions.charAt(excOffset);
} else {
excOffset+=2*slotOffset(excWord, index);
- value=exceptions[excOffset++];
- value=(value<<16)|exceptions[excOffset];
+ value=exceptions.charAt(excOffset++);
+ value=(value<<16)|exceptions.charAt(excOffset);
}
return value |((long)excOffset<<32);
}
@@ -172,11 +173,11 @@ public final class UCaseProps {
int value;
if((excWord&EXC_DOUBLE_SLOTS)==0) {
excOffset+=slotOffset(excWord, index);
- value=exceptions[excOffset];
+ value=exceptions.charAt(excOffset);
} else {
excOffset+=2*slotOffset(excWord, index);
- value=exceptions[excOffset++];
- value=(value<<16)|exceptions[excOffset];
+ value=exceptions.charAt(excOffset++);
+ value=(value<<16)|exceptions.charAt(excOffset);
}
return value;
}
@@ -191,7 +192,7 @@ public final class UCaseProps {
}
} else {
int excOffset=getExceptionsOffset(props);
- int excWord=exceptions[excOffset++];
+ int excWord=exceptions.charAt(excOffset++);
if(hasSlot(excWord, EXC_LOWER)) {
c=getSlotValue(excWord, EXC_LOWER, excOffset);
}
@@ -207,7 +208,7 @@ public final class UCaseProps {
}
} else {
int excOffset=getExceptionsOffset(props);
- int excWord=exceptions[excOffset++];
+ int excWord=exceptions.charAt(excOffset++);
if(hasSlot(excWord, EXC_UPPER)) {
c=getSlotValue(excWord, EXC_UPPER, excOffset);
}
@@ -223,7 +224,7 @@ public final class UCaseProps {
}
} else {
int excOffset=getExceptionsOffset(props);
- int excWord=exceptions[excOffset++];
+ int excWord=exceptions.charAt(excOffset++);
int index;
if(hasSlot(excWord, EXC_TITLE)) {
index=EXC_TITLE;
@@ -291,7 +292,7 @@ public final class UCaseProps {
*/
int excOffset0, excOffset=getExceptionsOffset(props);
int closureOffset;
- int excWord=exceptions[excOffset++];
+ int excWord=exceptions.charAt(excOffset++);
int index, closureLength, fullLength, length;
excOffset0=excOffset;
@@ -334,7 +335,7 @@ public final class UCaseProps {
/* add the full case folding string */
length=fullLength&0xf;
if(length!=0) {
- set.add(new String(exceptions, excOffset, length));
+ set.add(exceptions.substring(excOffset, excOffset+length));
excOffset+=length;
}
@@ -348,8 +349,9 @@ public final class UCaseProps {
}
/* add each code point in the closure string */
- for(index=0; index<closureLength; index+=UTF16.getCharCount(c)) {
- c=UTF16.charAt(exceptions, closureOffset, exceptions.length, index);
+ int limit=closureOffset+closureLength;
+ for(index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) {
+ c=exceptions.codePointAt(index);
set.add(c);
}
}
@@ -468,7 +470,7 @@ public final class UCaseProps {
if(!propsHasException(props)) {
return props&DOT_MASK;
} else {
- return (exceptions[getExceptionsOffset(props)]>>EXC_DOT_SHIFT)&DOT_MASK;
+ return (exceptions.charAt(getExceptionsOffset(props))>>EXC_DOT_SHIFT)&DOT_MASK;
}
}
@@ -605,38 +607,49 @@ public final class UCaseProps {
*/
public static final int MAX_STRING_LENGTH=0x1f;
- private static final int LOC_UNKNOWN=0;
- private static final int LOC_ROOT=1;
+ //ivate static final int LOC_UNKNOWN=0;
+ public static final int LOC_ROOT=1;
private static final int LOC_TURKISH=2;
private static final int LOC_LITHUANIAN=3;
static final int LOC_GREEK=4;
+ public static final int LOC_DUTCH=5;
- /*
- * Checks and caches the type of locale ID as it is relevant for case mapping.
- * If the locCache is not null, then it must be initialized with locCache[0]=0 .
- */
- static final int getCaseLocale(ULocale locale, int[] locCache) {
- int result;
-
- if(locCache!=null && (result=locCache[0])!=LOC_UNKNOWN) {
- return result;
- }
-
- result=LOC_ROOT;
-
- String language=locale.getLanguage();
- if(language.equals("tr") || language.equals("tur") || language.equals("az") || language.equals("aze")) {
- result=LOC_TURKISH;
- } else if(language.equals("el") || language.equals("ell")) {
- result=LOC_GREEK;
- } else if(language.equals("lt") || language.equals("lit")) {
- result=LOC_LITHUANIAN;
- }
-
- if(locCache!=null) {
- locCache[0]=result;
+ public static final int getCaseLocale(Locale locale) {
+ return getCaseLocale(locale.getLanguage());
+ }
+ public static final int getCaseLocale(ULocale locale) {
+ return getCaseLocale(locale.getLanguage());
+ }
+ /** Accepts both 2- and 3-letter language subtags. */
+ private static final int getCaseLocale(String language) {
+ // Check the subtag length to reduce the number of comparisons
+ // for locales without special behavior.
+ // Fastpath for English "en" which is often used for default (=root locale) case mappings,
+ // and for Chinese "zh": Very common but no special case mapping behavior.
+ if(language.length()==2) {
+ if(language.equals("en") || language.charAt(0)>'t') {
+ return LOC_ROOT;
+ } else if(language.equals("tr") || language.equals("az")) {
+ return LOC_TURKISH;
+ } else if(language.equals("el")) {
+ return LOC_GREEK;
+ } else if(language.equals("lt")) {
+ return LOC_LITHUANIAN;
+ } else if(language.equals("nl")) {
+ return LOC_DUTCH;
+ }
+ } else if(language.length()==3) {
+ if(language.equals("tur") || language.equals("aze")) {
+ return LOC_TURKISH;
+ } else if(language.equals("ell")) {
+ return LOC_GREEK;
+ } else if(language.equals("lit")) {
+ return LOC_LITHUANIAN;
+ } else if(language.equals("nld")) {
+ return LOC_DUTCH;
+ }
}
- return result;
+ return LOC_ROOT;
}
/* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */
@@ -797,19 +810,14 @@ public final class UCaseProps {
* See ContextIterator for details.
* If iter==null then a context-independent result is returned.
* @param out If the mapping result is a string, then it is appended to out.
- * @param locale Locale ID for locale-dependent mappings.
- * @param locCache Initialize locCache[0] to 0; may be used to cache the result of parsing
- * the locale ID for subsequent calls.
- * Can be null.
+ * @param caseLocale Case locale value from ucase_getCaseLocale().
* @return Output code point or string length, see MAX_STRING_LENGTH.
*
* @see ContextIterator
* @see #MAX_STRING_LENGTH
* @internal
*/
- public final int toFullLower(int c, ContextIterator iter,
- StringBuilder out,
- ULocale locale, int[] locCache) {
+ public final int toFullLower(int c, ContextIterator iter, Appendable out, int caseLocale) {
int result, props;
result=c;
@@ -820,22 +828,20 @@ public final class UCaseProps {
}
} else {
int excOffset=getExceptionsOffset(props), excOffset2;
- int excWord=exceptions[excOffset++];
+ int excWord=exceptions.charAt(excOffset++);
int full;
excOffset2=excOffset;
if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
/* use hardcoded conditions and mappings */
- int loc=getCaseLocale(locale, locCache);
-
/*
* Test for conditional mappings first
* (otherwise the unconditional default mappings are always taken),
* then test for characters that have unconditional mappings in SpecialCasing.txt,
* then get the UnicodeData.txt mappings.
*/
- if( loc==LOC_LITHUANIAN &&
+ if( caseLocale==LOC_LITHUANIAN &&
/* base characters, find accents above */
(((c==0x49 || c==0x4a || c==0x12e) &&
isFollowedByMoreAbove(iter)) ||
@@ -858,30 +864,34 @@ public final class UCaseProps {
00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
*/
- switch(c) {
- case 0x49: /* LATIN CAPITAL LETTER I */
- out.append(iDot);
- return 2;
- case 0x4a: /* LATIN CAPITAL LETTER J */
- out.append(jDot);
- return 2;
- case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
- out.append(iOgonekDot);
- return 2;
- case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
- out.append(iDotGrave);
- return 3;
- case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
- out.append(iDotAcute);
- return 3;
- case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
- out.append(iDotTilde);
- return 3;
- default:
- return 0; /* will not occur */
+ try {
+ switch(c) {
+ case 0x49: /* LATIN CAPITAL LETTER I */
+ out.append(iDot);
+ return 2;
+ case 0x4a: /* LATIN CAPITAL LETTER J */
+ out.append(jDot);
+ return 2;
+ case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
+ out.append(iOgonekDot);
+ return 2;
+ case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
+ out.append(iDotGrave);
+ return 3;
+ case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
+ out.append(iDotAcute);
+ return 3;
+ case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
+ out.append(iDotTilde);
+ return 3;
+ default:
+ return 0; /* will not occur */
+ }
+ } catch (IOException e) {
+ throw new ICUUncheckedIOException(e);
}
/* # Turkish and Azeri */
- } else if(loc==LOC_TURKISH && c==0x130) {
+ } else if(caseLocale==LOC_TURKISH && c==0x130) {
/*
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
# The following rules handle those cases.
@@ -890,7 +900,7 @@ public final class UCaseProps {
0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
*/
return 0x69;
- } else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) {
+ } else if(caseLocale==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) {
/*
# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
# This matches the behavior of the canonically equivalent I-dot_above
@@ -899,7 +909,7 @@ public final class UCaseProps {
0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
*/
return 0; /* remove the dot (continue without output) */
- } else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) {
+ } else if(caseLocale==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) {
/*
# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
@@ -913,8 +923,12 @@ public final class UCaseProps {
0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
*/
- out.append(iDot);
- return 2;
+ try {
+ out.append(iDot);
+ return 2;
+ } catch (IOException e) {
+ throw new ICUUncheckedIOException(e);
+ }
} else if( c==0x3a3 &&
!isFollowedByCasedLetter(iter, 1) &&
isFollowedByCasedLetter(iter, -1) /* -1=preceded */
@@ -936,11 +950,15 @@ public final class UCaseProps {
/* start of full case mapping strings */
excOffset=(int)(value>>32)+1;
- /* set the output pointer to the lowercase mapping */
- out.append(exceptions, excOffset, full);
+ try {
+ // append the lowercase mapping
+ out.append(exceptions, excOffset, excOffset+full);
- /* return the string length */
- return full;
+ /* return the string length */
+ return full;
+ } catch (IOException e) {
+ throw new ICUUncheckedIOException(e);
+ }
}
}
@@ -954,8 +972,8 @@ public final class UCaseProps {
/* internal */
private final int toUpperOrTitle(int c, ContextIterator iter,
- StringBuilder out,
- ULocale locale, int[] locCache,
+ Appendable out,
+ int loc,
boolean upperNotTitle) {
int result;
int props;
@@ -968,15 +986,13 @@ public final class UCaseProps {
}
} else {
int excOffset=getExceptionsOffset(props), excOffset2;
- int excWord=exceptions[excOffset++];
+ int excWord=exceptions.charAt(excOffset++);
int full, index;
excOffset2=excOffset;
if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
/* use hardcoded conditions and mappings */
- int loc=getCaseLocale(locale, locCache);
-
if(loc==LOC_TURKISH && c==0x69) {
/*
# Turkish and Azeri
@@ -1026,11 +1042,15 @@ public final class UCaseProps {
}
if(full!=0) {
- /* set the output pointer to the result string */
- out.append(exceptions, excOffset, full);
-
- /* return the string length */
- return full;
+ try {
+ // append the result string
+ out.append(exceptions, excOffset, excOffset+full);
+
+ /* return the string length */
+ return full;
+ } catch (IOException e) {
+ throw new ICUUncheckedIOException(e);
+ }
}
}
@@ -1049,15 +1069,15 @@ public final class UCaseProps {
}
public final int toFullUpper(int c, ContextIterator iter,
- StringBuilder out,
- ULocale locale, int[] locCache) {
- return toUpperOrTitle(c, iter, out, locale, locCache, true);
+ Appendable out,
+ int caseLocale) {
+ return toUpperOrTitle(c, iter, out, caseLocale, true);
}
public final int toFullTitle(int c, ContextIterator iter,
- StringBuilder out,
- ULocale locale, int[] locCache) {
- return toUpperOrTitle(c, iter, out, locale, locCache, false);
+ Appendable out,
+ int caseLocale) {
+ return toUpperOrTitle(c, iter, out, caseLocale, false);
}
/* case folding ------------------------------------------------------------- */
@@ -1117,7 +1137,7 @@ public final class UCaseProps {
}
} else {
int excOffset=getExceptionsOffset(props);
- int excWord=exceptions[excOffset++];
+ int excWord=exceptions.charAt(excOffset++);
int index;
if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
/* special case folding mappings, hardcoded */
@@ -1168,7 +1188,7 @@ public final class UCaseProps {
* together in a way that they still fold to common result strings.
*/
- public final int toFullFolding(int c, StringBuilder out, int options) {
+ public final int toFullFolding(int c, Appendable out, int options) {
int result;
int props;
@@ -1180,7 +1200,7 @@ public final class UCaseProps {
}
} else {
int excOffset=getExceptionsOffset(props), excOffset2;
- int excWord=exceptions[excOffset++];
+ int excWord=exceptions.charAt(excOffset++);
int full, index;
excOffset2=excOffset;
@@ -1194,8 +1214,12 @@ public final class UCaseProps {
return 0x69;
} else if(c==0x130) {
/* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
- out.append(iDot);
- return 2;
+ try {
+ out.append(iDot);
+ return 2;
+ } catch (IOException e) {
+ throw new ICUUncheckedIOException(e);
+ }
}
} else {
/* Turkic mappings */
@@ -1219,11 +1243,15 @@ public final class UCaseProps {
full=(full>>4)&0xf;
if(full!=0) {
- /* set the output pointer to the result string */
- out.append(exceptions, excOffset, full);
-
- /* return the string length */
- return full;
+ try {
+ // append the result string
+ out.append(exceptions, excOffset, excOffset+full);
+
+ /* return the string length */
+ return full;
+ } catch (IOException e) {
+ throw new ICUUncheckedIOException(e);
+ }
}
}
@@ -1242,7 +1270,6 @@ public final class UCaseProps {
/* case mapping properties API ---------------------------------------------- */
- private static final int[] rootLocCache = { LOC_ROOT };
/*
* We need a StringBuilder for multi-code point output from the
* full case mapping functions. However, we do not actually use that output,
@@ -1282,20 +1309,20 @@ public final class UCaseProps {
*/
case UProperty.CHANGES_WHEN_LOWERCASED:
dummyStringBuilder.setLength(0);
- return toFullLower(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0;
+ return toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0;
case UProperty.CHANGES_WHEN_UPPERCASED:
dummyStringBuilder.setLength(0);
- return toFullUpper(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0;
+ return toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0;
case UProperty.CHANGES_WHEN_TITLECASED:
dummyStringBuilder.setLength(0);
- return toFullTitle(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0;
+ return toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0;
/* case UProperty.CHANGES_WHEN_CASEFOLDED: -- in UCharacterProperty.java */
case UProperty.CHANGES_WHEN_CASEMAPPED:
dummyStringBuilder.setLength(0);
return
- toFullLower(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0 ||
- toFullUpper(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0 ||
- toFullTitle(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0;
+ toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0 ||
+ toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0 ||
+ toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0;
default:
return false;
}
@@ -1303,7 +1330,7 @@ public final class UCaseProps {
// data members -------------------------------------------------------- ***
private int indexes[];
- private char exceptions[];
+ private String exceptions;
private char unfold[];
private Trie2_16 trie;
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java
index 40fecc7b1..65cebb36a 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java
@@ -15,8 +15,7 @@ import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
-import com.ibm.icu.impl.CaseMap;
-import com.ibm.icu.impl.CaseMap.StringContextIterator;
+import com.ibm.icu.impl.CaseMapImpl;
import com.ibm.icu.impl.IllegalIcuArgumentException;
import com.ibm.icu.impl.Trie2;
import com.ibm.icu.impl.UBiDiProps;
@@ -29,6 +28,7 @@ import com.ibm.icu.impl.UPropertyAliases;
import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory;
import com.ibm.icu.lang.UCharacterEnums.ECharacterDirection;
import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.Edits;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.ULocale;
@@ -4875,7 +4875,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
*/
public static String toUpperCase(String str)
{
- return toUpperCase(ULocale.getDefault(), str);
+ return toUpperCase(getDefaultCaseLocale(), str);
}
/**
@@ -4887,7 +4887,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
*/
public static String toLowerCase(String str)
{
- return toLowerCase(ULocale.getDefault(), str);
+ return toLowerCase(getDefaultCaseLocale(), str);
}
/**
@@ -4910,7 +4910,94 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
*/
public static String toTitleCase(String str, BreakIterator breakiter)
{
- return toTitleCase(ULocale.getDefault(), str, breakiter);
+ return toTitleCase(Locale.getDefault(), str, breakiter, 0);
+ }
+
+ /** Returns the case-locale code that UCaseProps derives from the JVM default Locale. */
+ private static int getDefaultCaseLocale() {
+     final Locale defaultLocale = Locale.getDefault();
+     return UCaseProps.getCaseLocale(defaultLocale);
+ }
+
+ /** Maps a Locale to its case-locale code; a null locale stands for Locale.getDefault(). */
+ private static int getCaseLocale(Locale locale) {
+     final Locale effective = (locale != null) ? locale : Locale.getDefault();
+     return UCaseProps.getCaseLocale(effective);
+ }
+
+ /** Maps a ULocale to its case-locale code; a null locale stands for ULocale.getDefault(). */
+ private static int getCaseLocale(ULocale locale) {
+     final ULocale effective = (locale != null) ? locale : ULocale.getDefault();
+     return UCaseProps.getCaseLocale(effective);
+ }
+
+ /**
+  * Lowercases str for the given case-locale code.
+  * Strings longer than 100 chars are mapped directly into a buffer of the source length.
+  * Shorter strings are mapped by collecting only the changed text plus an Edits record,
+  * which applyEdits() then merges with the original; an empty string is returned as is.
+  */
+ private static String toLowerCase(int caseLocale, String str) {
+     if (str.length() > 100) {
+         StringBuilder mapped = new StringBuilder(str.length());
+         return CaseMapImpl.toLower(caseLocale, 0, str, mapped, null).toString();
+     }
+     if (str.isEmpty()) {
+         return str;
+     }
+     // Changes-only path: good if there are no or few changes, slow if there are many.
+     Edits changes = new Edits();
+     StringBuilder changedText = CaseMapImpl.toLower(
+             caseLocale, CaseMapImpl.OMIT_UNCHANGED_TEXT, str, new StringBuilder(), changes);
+     return applyEdits(str, changedText, changes);
+ }
+
+ /**
+  * Uppercases str for the given case-locale code.
+  * Strings longer than 100 chars are mapped directly into a buffer of the source length.
+  * Shorter strings are mapped by collecting only the changed text plus an Edits record,
+  * which applyEdits() then merges with the original; an empty string is returned as is.
+  */
+ private static String toUpperCase(int caseLocale, String str) {
+     if (str.length() > 100) {
+         StringBuilder mapped = new StringBuilder(str.length());
+         return CaseMapImpl.toUpper(caseLocale, 0, str, mapped, null).toString();
+     }
+     if (str.isEmpty()) {
+         return str;
+     }
+     // Changes-only path: good if there are no or few changes, slow if there are many.
+     Edits changes = new Edits();
+     StringBuilder changedText = CaseMapImpl.toUpper(
+             caseLocale, CaseMapImpl.OMIT_UNCHANGED_TEXT, str, new StringBuilder(), changes);
+     return applyEdits(str, changedText, changes);
+ }
+
+ /**
+  * Titlecases str for the given case-locale code, option bits and break iterator.
+  * Strings longer than 100 chars are mapped directly into a buffer of the source length.
+  * Shorter strings are mapped by collecting only the changed text plus an Edits record,
+  * which applyEdits() then merges with the original; an empty string is returned as is.
+  */
+ private static String toTitleCase(int caseLocale, int options, BreakIterator titleIter, String str) {
+     if (str.length() > 100) {
+         StringBuilder mapped = new StringBuilder(str.length());
+         return CaseMapImpl.toTitle(caseLocale, options, titleIter, str, mapped, null).toString();
+     }
+     if (str.isEmpty()) {
+         return str;
+     }
+     // Changes-only path: good if there are no or few changes, slow if there are many.
+     Edits changes = new Edits();
+     int changesOnlyOptions = options | CaseMapImpl.OMIT_UNCHANGED_TEXT;
+     StringBuilder changedText = CaseMapImpl.toTitle(
+             caseLocale, changesOnlyOptions, titleIter, str, new StringBuilder(), changes);
+     return applyEdits(str, changedText, changes);
+ }
+
+ /**
+  * Builds the complete result string from the original text and the separately
+  * collected replacement text, walking the coarse-grained edits in order.
+  * Returns str itself when edits reports no changes.
+  */
+ private static String applyEdits(String str, StringBuilder replacementChars, Edits edits) {
+     if (!edits.hasChanges()) {
+         return str;
+     }
+     StringBuilder out = new StringBuilder(str.length() + edits.lengthDelta());
+     Edits.Iterator span = edits.getCoarseIterator();
+     while (span.next()) {
+         if (!span.hasChange()) {
+             // Unchanged span: copy it from the source string.
+             int from = span.sourceIndex();
+             out.append(str, from, from + span.oldLength());
+         } else {
+             // Changed span: copy it from the replacement text.
+             int from = span.replacementIndex();
+             out.append(replacementChars, from, from + span.newLength());
+         }
+     }
+     return out.toString();
}
/**
@@ -4923,7 +5010,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
*/
public static String toUpperCase(Locale locale, String str)
{
- return toUpperCase(ULocale.forLocale(locale), str);
+ return toUpperCase(getCaseLocale(locale), str);
}
/**
@@ -4935,7 +5022,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
* @stable ICU 3.2
*/
public static String toUpperCase(ULocale locale, String str) {
- return CaseMap.toUpper(locale, str);
+ return toUpperCase(getCaseLocale(locale), str);
}
/**
@@ -4948,7 +5035,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
*/
public static String toLowerCase(Locale locale, String str)
{
- return toLowerCase(ULocale.forLocale(locale), str);
+ return toLowerCase(getCaseLocale(locale), str);
}
/**
@@ -4960,31 +5047,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
* @stable ICU 3.2
*/
public static String toLowerCase(ULocale locale, String str) {
- StringContextIterator iter = new StringContextIterator(str);
- StringBuilder result = new StringBuilder(str.length());
- int[] locCache = new int[1];
- int c;
-
- if (locale == null) {
- locale = ULocale.getDefault();
- }
- locCache[0]=0;
-
- while((c=iter.nextCaseMapCP())>=0) {
- c = UCaseProps.INSTANCE.toFullLower(c, iter, result, locale, locCache);
-
- /* decode the result */
- if(c<0) {
- /* (not) original code point */
- c=~c;
- } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
- /* mapping already appended to result */
- continue;
- /* } else { append single-code point mapping */
- }
- result.appendCodePoint(c);
- }
- return result.toString();
+ return toLowerCase(getCaseLocale(locale), str);
}
/**
@@ -5009,7 +5072,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
public static String toTitleCase(Locale locale, String str,
BreakIterator breakiter)
{
- return toTitleCase(ULocale.forLocale(locale), str, breakiter);
+ return toTitleCase(locale, str, breakiter, 0);
}
/**
@@ -5059,126 +5122,15 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
* @see #TITLECASE_NO_BREAK_ADJUSTMENT
*/
public static String toTitleCase(ULocale locale, String str,
- BreakIterator titleIter,
- int options) {
- StringContextIterator iter = new StringContextIterator(str);
- StringBuilder result = new StringBuilder(str.length());
- int[] locCache = new int[1];
- int c, nc, srcLength = str.length();
-
- if (locale == null) {
- locale = ULocale.getDefault();
- }
- locCache[0]=0;
-
+ BreakIterator titleIter, int options) {
if(titleIter == null) {
+ if (locale == null) {
+ locale = ULocale.getDefault();
+ }
titleIter = BreakIterator.getWordInstance(locale);
}
titleIter.setText(str);
-
- int prev, titleStart, index;
- boolean isFirstIndex;
- boolean isDutch = locale.getLanguage().equals("nl");
- boolean FirstIJ = true;
-
- /* set up local variables */
- prev=0;
- isFirstIndex=true;
-
- /* titlecasing loop */
- while(prev<srcLength) {
- /* find next index where to titlecase */
- if(isFirstIndex) {
- isFirstIndex=false;
- index=titleIter.first();
- } else {
- index=titleIter.next();
- }
- if(index==BreakIterator.DONE || index>srcLength) {
- index=srcLength;
- }
-
- /*
- * Unicode 4 & 5 section 3.13 Default Case Operations:
- *
- * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
- * #29, "Text Boundaries." Between each pair of word boundaries, find the first
- * cased character F. If F exists, map F to default_title(F); then map each
- * subsequent character C to default_lower(C).
- *
- * In this implementation, segment [prev..index[ into 3 parts:
- * a) uncased characters (copy as-is) [prev..titleStart[
- * b) first case letter (titlecase) [titleStart..titleLimit[
- * c) subsequent characters (lowercase) [titleLimit..index[
- */
- if(prev<index) {
- /* find and copy uncased characters [prev..titleStart[ */
- iter.setLimit(index);
- c=iter.nextCaseMapCP();
- if((options&TITLECASE_NO_BREAK_ADJUSTMENT)==0
- && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {
- while((c=iter.nextCaseMapCP())>=0
- && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {}
- titleStart=iter.getCPStart();
- if(prev<titleStart) {
- result.append(str, prev, titleStart);
- }
- } else {
- titleStart=prev;
- }
-
- if(titleStart<index) {
- FirstIJ = true;
- /* titlecase c which is from titleStart */
- c = UCaseProps.INSTANCE.toFullTitle(c, iter, result, locale, locCache);
-
- /* decode the result and lowercase up to index */
- for(;;) {
- if(c<0) {
- /* (not) original code point */
- c=~c;
- result.appendCodePoint(c);
- } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
- /* mapping already appended to result */
- } else {
- /* append single-code point mapping */
- result.appendCodePoint(c);
- }
-
- if((options&TITLECASE_NO_LOWERCASE)!=0) {
- /* Optionally just copy the rest of the word unchanged. */
-
- int titleLimit=iter.getCPLimit();
- if(titleLimit<index) {
- /* Special Case - Dutch IJ Titlecasing */
- if (isDutch && c == 0x0049 && str.charAt(titleLimit) == 'j') {
- result.append('J').append(str, titleLimit + 1, index);
- } else {
- result.append(str, titleLimit, index);
- }
- }
- iter.moveToLimit();
- break;
- } else if((nc=iter.nextCaseMapCP())>=0) {
- if (isDutch && (nc == 0x004A || nc == 0x006A)
- && (c == 0x0049) && (FirstIJ == true)) {
- c = 0x004A; /* J */
- FirstIJ = false;
- } else {
- /* Normal operation: Lowercase the rest of the word. */
- c = UCaseProps.INSTANCE.toFullLower(nc, iter, result, locale,
- locCache);
- }
- } else {
- break;
- }
- }
- }
- }
-
- prev=index;
- }
- return result.toString();
+ return toTitleCase(getCaseLocale(locale), options, titleIter, str);
}
@@ -5281,7 +5233,11 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
public static String toTitleCase(Locale locale, String str,
BreakIterator titleIter,
int options) {
- return toTitleCase(ULocale.forLocale(locale), str, titleIter, options);
+ if(titleIter == null) {
+ // A null locale means the default locale, as in the ULocale overload and in
+ // getCaseLocale(); resolve it here instead of handing null to
+ // BreakIterator.getWordInstance(), which the replaced code never did.
+ if (locale == null) {
+ locale = Locale.getDefault();
+ }
+ titleIter = BreakIterator.getWordInstance(locale);
+ }
+ titleIter.setText(str);
+ return toTitleCase(getCaseLocale(locale), options, titleIter, str);
}
/**
@@ -5398,27 +5354,19 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
* @stable ICU 2.6
*/
public static final String foldCase(String str, int options) {
- StringBuilder result = new StringBuilder(str.length());
- int c, i, length;
-
- length = str.length();
- for(i=0; i<length;) {
- c=str.codePointAt(i);
- i+=Character.charCount(c);
- c = UCaseProps.INSTANCE.toFullFolding(c, result, options);
-
- /* decode the result */
- if(c<0) {
- /* (not) original code point */
- c=~c;
- } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
- /* mapping already appended to result */
- continue;
- /* } else { append single-code point mapping */
+ if (str.length() <= 100) {
+ if (str.isEmpty()) {
+ return str;
}
- result.appendCodePoint(c);
+ // Collect and apply only changes.
+ // Good if no or few changes. Bad (slow) if many changes.
+ Edits edits = new Edits();
+ StringBuilder replacementChars = CaseMapImpl.fold(
+ options | CaseMapImpl.OMIT_UNCHANGED_TEXT, str, new StringBuilder(), edits);
+ return applyEdits(str, replacementChars, edits);
+ } else {
+ return CaseMapImpl.fold(options, str, new StringBuilder(str.length()), null).toString();
}
- return result.toString();
}
/**
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java
new file mode 100644
index 000000000..e998c6624
--- /dev/null
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java
@@ -0,0 +1,339 @@
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.text;
+
+import java.util.Locale;
+
+import com.ibm.icu.impl.CaseMapImpl;
+import com.ibm.icu.impl.UCaseProps;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Low-level case mapping options and methods. Immutable.
+ * "Setters" return instances with the union of the current and new options set.
+ *
+ * This class is not intended for public subclassing.
+ *
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+public abstract class CaseMap {
+ /**
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ @Deprecated
+ protected int internalOptions;
+
+ private CaseMap(int opt) { internalOptions = opt; }
+
+ /** Maps a Locale to its case-locale code; a null locale stands for Locale.getDefault(). */
+ private static int getCaseLocale(Locale locale) {
+     final Locale effective = (locale != null) ? locale : Locale.getDefault();
+     return UCaseProps.getCaseLocale(effective);
+ }
+
+ /**
+ * @return Lowercasing object with default options.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public static Lower toLower() { return Lower.DEFAULT; }
+ /**
+ * @return Uppercasing object with default options.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public static Upper toUpper() { return Upper.DEFAULT; }
+ /**
+ * @return Titlecasing object with default options.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public static Title toTitle() { return Title.DEFAULT; }
+ /**
+ * @return Case folding object with default options.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public static Fold fold() { return Fold.DEFAULT; }
+
+ /**
+ * Returns an instance that behaves like this one but
+ * omits unchanged text when case-mapping with {@link Edits}.
+ *
+ * @return an options object with this option.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public abstract CaseMap omitUnchangedText();
+
+ /**
+ * Lowercasing options and methods. Immutable.
+ *
+ * @see #toLower()
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public static final class Lower extends CaseMap {
+ private static final Lower DEFAULT = new Lower(0);
+ private static final Lower OMIT_UNCHANGED = new Lower(CaseMapImpl.OMIT_UNCHANGED_TEXT);
+ private Lower(int opt) { super(opt); }
+
+ /**
+ * {@inheritDoc}
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ @Override
+ public Lower omitUnchangedText() {
+ return OMIT_UNCHANGED;
+ }
+
+ /**
+ * Lowercases a string and optionally records edits (see {@link #omitUnchangedText}).
+ * Casing is locale-dependent and context-sensitive.
+ * The result may be longer or shorter than the original.
+ *
+ * @param locale The locale ID. Can be null for {@link Locale#getDefault}.
+ * (See {@link ULocale#toLocale}.)
+ * @param src The original string.
+ * @param dest A buffer for the result string. Must not be null.
+ * @param edits Records edits for index mapping, working with styled text,
+ * and getting only changes (if any).
+ * This function calls edits.reset() first. edits can be null.
+ * @return dest with the result string (or only changes) appended.
+ *
+ * @see UCharacter#toLowerCase(Locale, String)
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public <A extends Appendable> A apply(Locale locale, CharSequence src, A dest, Edits edits) {
+     // Resolve the (possibly null) locale once, then delegate to the shared implementation.
+     final int caseLocale = getCaseLocale(locale);
+     return CaseMapImpl.toLower(caseLocale, internalOptions, src, dest, edits);
+ }
+ }
+
+ /**
+ * Uppercasing options and methods. Immutable.
+ *
+ * @see #toUpper()
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public static final class Upper extends CaseMap {
+ private static final Upper DEFAULT = new Upper(0);
+ private static final Upper OMIT_UNCHANGED = new Upper(CaseMapImpl.OMIT_UNCHANGED_TEXT);
+ private Upper(int opt) { super(opt); }
+
+ /**
+ * {@inheritDoc}
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ @Override
+ public Upper omitUnchangedText() {
+ return OMIT_UNCHANGED;
+ }
+
+ /**
+ * Uppercases a string and optionally records edits (see {@link #omitUnchangedText}).
+ * Casing is locale-dependent and context-sensitive.
+ * The result may be longer or shorter than the original.
+ *
+ * @param locale The locale ID. Can be null for {@link Locale#getDefault}.
+ * (See {@link ULocale#toLocale}.)
+ * @param src The original string.
+ * @param dest A buffer for the result string. Must not be null.
+ * @param edits Records edits for index mapping, working with styled text,
+ * and getting only changes (if any).
+ * This function calls edits.reset() first. edits can be null.
+ * @return dest with the result string (or only changes) appended.
+ *
+ * @see UCharacter#toUpperCase(Locale, String)
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public <A extends Appendable> A apply(Locale locale, CharSequence src, A dest, Edits edits) {
+     // Resolve the (possibly null) locale once, then delegate to the shared implementation.
+     final int caseLocale = getCaseLocale(locale);
+     return CaseMapImpl.toUpper(caseLocale, internalOptions, src, dest, edits);
+ }
+ }
+
+ /**
+ * Titlecasing options and methods. Immutable.
+ *
+ * @see #toTitle()
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public static final class Title extends CaseMap {
+ private static final Title DEFAULT = new Title(0);
+ private static final Title OMIT_UNCHANGED = new Title(CaseMapImpl.OMIT_UNCHANGED_TEXT);
+ private Title(int opt) { super(opt); }
+
+ /**
+ * {@inheritDoc}
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ @Override
+ public Title omitUnchangedText() {
+     // With no options, or only omit-unchanged, the shared OMIT_UNCHANGED instance is
+     // returned; any other combination gets a new object that keeps its existing bits.
+     final boolean plain =
+             internalOptions == 0 || internalOptions == CaseMapImpl.OMIT_UNCHANGED_TEXT;
+     return plain
+             ? OMIT_UNCHANGED
+             : new Title(internalOptions | CaseMapImpl.OMIT_UNCHANGED_TEXT);
+ }
+
+ /**
+ * Returns an instance that behaves like this one but
+ * does not lowercase non-initial parts of words when titlecasing.
+ *
+ * <p>By default, titlecasing will titlecase the first cased character
+ * of a word and lowercase all other characters.
+ * With this option, the other characters will not be modified.
+ *
+ * @return an options object with this option.
+ * @see UCharacter#TITLECASE_NO_LOWERCASE
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public Title noLowercase() {
+ return new Title(internalOptions | UCharacter.TITLECASE_NO_LOWERCASE);
+ }
+
+ // TODO: update references to the Unicode Standard for recent version
+ /**
+ * Returns an instance that behaves like this one but
+ * does not adjust the titlecasing indexes from {@link BreakIterator#next()} indexes;
+ * titlecases exactly the characters at breaks from the iterator.
+ *
+ * <p>By default, titlecasing will take each break iterator index,
+ * adjust it by looking for the next cased character, and titlecase that one.
+ * Other characters are lowercased.
+ *
+ * <p>This follows Unicode 4 &amp; 5 section 3.13 Default Case Operations:
+ *
+ * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
+ * #29, "Text Boundaries." Between each pair of word boundaries, find the first
+ * cased character F. If F exists, map F to default_title(F); then map each
+ * subsequent character C to default_lower(C).
+ *
+ * @return an options object with this option.
+ * @see UCharacter#TITLECASE_NO_BREAK_ADJUSTMENT
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public Title noBreakAdjustment() {
+ return new Title(internalOptions | UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT);
+ }
+
+ /**
+ * Titlecases a string and optionally records edits (see {@link #omitUnchangedText}).
+ * Casing is locale-dependent and context-sensitive.
+ * The result may be longer or shorter than the original.
+ *
+ * <p>Titlecasing uses a break iterator to find the first characters of words
+ * that are to be titlecased. It titlecases those characters and lowercases
+ * all others. (This can be modified with options bits.)
+ *
+ * @param locale The locale ID. Can be null for {@link Locale#getDefault}.
+ * (See {@link ULocale#toLocale}.)
+ * @param iter A break iterator to find the first characters of words that are to be titlecased.
+ * It is set to the source string (setText())
+ * and used one or more times for iteration (first() and next()).
+ * If null, then a word break iterator for the locale is used
+ * (or something equivalent).
+ * @param src The original string.
+ * @param dest A buffer for the result string. Must not be null.
+ * @param edits Records edits for index mapping, working with styled text,
+ * and getting only changes (if any).
+ * This function calls edits.reset() first. edits can be null.
+ * @return dest with the result string (or only changes) appended.
+ *
+ * @see UCharacter#toTitleCase(Locale, String, BreakIterator, int)
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public <A extends Appendable> A apply(
+ Locale locale, BreakIterator iter, CharSequence src, A dest, Edits edits) {
+     if (iter == null) {
+         // locale is documented as nullable (meaning Locale.getDefault()), so resolve it
+         // before requesting a word break iterator instead of passing null through.
+         if (locale == null) {
+             locale = Locale.getDefault();
+         }
+         iter = BreakIterator.getWordInstance(locale);
+     }
+     // The iterator works on a String, so the source is materialized once here.
+     iter.setText(src.toString());
+     return CaseMapImpl.toTitle(
+             getCaseLocale(locale), internalOptions, iter, src, dest, edits);
+ }
+ }
+
+ /**
+ * Case folding options and methods. Immutable.
+ *
+ * @see #fold()
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public static final class Fold extends CaseMap {
+ private static final Fold DEFAULT = new Fold(0);
+ private static final Fold TURKIC = new Fold(UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I);
+ private static final Fold OMIT_UNCHANGED = new Fold(CaseMapImpl.OMIT_UNCHANGED_TEXT);
+ private static final Fold TURKIC_OMIT_UNCHANGED = new Fold(
+ UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I | CaseMapImpl.OMIT_UNCHANGED_TEXT);
+ private Fold(int opt) { super(opt); }
+
+ /**
+ * {@inheritDoc}
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ @Override
+ public Fold omitUnchangedText() {
+     // Keep the Turkic flag if it is set; both omit-unchanged variants are shared constants.
+     if ((internalOptions & UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I) != 0) {
+         return TURKIC_OMIT_UNCHANGED;
+     }
+     return OMIT_UNCHANGED;
+ }
+
+ /**
+ * Returns an instance that behaves like this one but
+ * handles dotted I and dotless i appropriately for Turkic languages (tr, az).
+ *
+ * <p>Uses the Unicode CaseFolding.txt mappings marked with 'T' that
+ * are to be excluded for default mappings and
+ * included for the Turkic-specific mappings.
+ *
+ * @return an options object with this option.
+ * @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public Fold turkic() {
+     // Keep the omit-unchanged flag if it is set; both Turkic variants are shared constants.
+     if ((internalOptions & CaseMapImpl.OMIT_UNCHANGED_TEXT) != 0) {
+         return TURKIC_OMIT_UNCHANGED;
+     }
+     return TURKIC;
+ }
+
+ /**
+ * Case-folds a string and optionally records edits (see {@link #omitUnchangedText}).
+ *
+ * <p>Case-folding is locale-independent and not context-sensitive,
+ * but there is an option for whether to include or exclude mappings for dotted I
+ * and dotless i that are marked with 'T' in CaseFolding.txt.
+ *
+ * <p>The result may be longer or shorter than the original.
+ *
+ * @param src The original string.
+ * @param dest A buffer for the result string. Must not be null.
+ * @param edits Records edits for index mapping, working with styled text,
+ * and getting only changes (if any).
+ * This function calls edits.reset() first. edits can be null.
+ * @return dest with the result string (or only changes) appended.
+ *
+ * @see UCharacter#foldCase(String, int)
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public <A extends Appendable> A apply(CharSequence src, A dest, Edits edits) {
+ return CaseMapImpl.fold(internalOptions, src, dest, edits);
+ }
+ }
+}
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java b/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java
new file mode 100644
index 000000000..f9cbf9fb4
--- /dev/null
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java
@@ -0,0 +1,494 @@
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.text;
+
+import java.nio.BufferOverflowException;
+import java.util.Arrays;
+
+/**
+ * Records lengths of string edits but not replacement text.
+ * Supports replacements, insertions, deletions in linear progression.
+ * Does not support moving/reordering of text.
+ *
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+public final class Edits {
+ // 0000uuuuuuuuuuuu records u+1 unchanged text units.
+ private static final int MAX_UNCHANGED_LENGTH = 0x1000;
+ private static final int MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;
+
+ // 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units.
+ // No length change.
+ private static final int MAX_SHORT_WIDTH = 6;
+ private static final int MAX_SHORT_CHANGE_LENGTH = 0xfff;
+ private static final int MAX_SHORT_CHANGE = 0x6fff;
+
+ // 0111mmmmmmnnnnnn records a replacement of m text units with n.
+ // m or n = 61: actual length follows in the next edits array unit.
+ // m or n = 62..63: actual length follows in the next two edits array units.
+ // Bit 30 of the actual length is in the head unit.
+ // Trailing units have bit 15 set.
+ private static final int LENGTH_IN_1TRAIL = 61;
+ private static final int LENGTH_IN_2TRAIL = 62;
+
+ private static final int STACK_CAPACITY = 100;
+ private char[] array;
+ private int length;
+ private int delta;
+
+ /**
+ * Constructs an empty object.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public Edits() {
+ array = new char[STACK_CAPACITY];
+ }
+
+ /**
+ * Resets the data but may not release memory.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public void reset() {
+     // Only the bookkeeping is cleared; the backing array is kept for reuse.
+     length = 0;
+     delta = 0;
+ }
+
+ private void setLastUnit(int last) {
+ array[length - 1] = (char)last;
+ }
+ private int lastUnit() {
+     // 0xffff is returned for an empty list; it matches neither merge test in the callers.
+     if (length > 0) {
+         return array[length - 1];
+     }
+     return 0xffff;
+ }
+
+ /**
+ * Adds a record for an unchanged segment of text.
+ * Normally called from inside ICU string transformation functions, not user code.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public void addUnchanged(int unchangedLength) {
+     if(unchangedLength < 0) {
+         throw new IllegalArgumentException(
+                 "addUnchanged(" + unchangedLength + "): length must not be negative");
+     }
+     int left = unchangedLength;
+     // First top up a preceding unchanged-text record that is not yet full.
+     final int tail = lastUnit();
+     if(tail < MAX_UNCHANGED) {
+         final int room = MAX_UNCHANGED - tail;
+         if (left <= room) {
+             setLastUnit(tail + left);
+             return;
+         }
+         setLastUnit(MAX_UNCHANGED);
+         left -= room;
+     }
+     // Then emit as many full records as the rest requires.
+     for (; left >= MAX_UNCHANGED_LENGTH; left -= MAX_UNCHANGED_LENGTH) {
+         append(MAX_UNCHANGED);
+     }
+     // Finally one partial record; a unit value u stands for u+1 text units.
+     if(left > 0) {
+         append(left - 1);
+     }
+ }
+
+ /**
+ * Adds a record for a text replacement/insertion/deletion.
+ * Normally called from inside ICU string transformation functions, not user code.
+ * A call with both lengths zero records nothing.
+ * Otherwise the running length delta grows by newLength - oldLength.
+ * @param oldLength number of text units replaced in the old text; must not be negative
+ * @param newLength number of text units in the replacement; must not be negative
+ * @throws IllegalArgumentException if either length is negative
+ * @throws IndexOutOfBoundsException if the accumulated length delta would overflow an int
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public void addReplace(int oldLength, int newLength) {
+ if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
+ // Replacement of short oldLength text units by same-length new text.
+ // Merge into previous short-replacement record, if any.
+ int last = lastUnit();
+ if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
+ (last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) {
+ setLastUnit(last + 1); // same width and the count field is not full: bump the count
+ return;
+ }
+ append(oldLength << 12); // new short record: width in bits 14..12, count field 0
+ return;
+ }
+
+ if(oldLength < 0 || newLength < 0) {
+ throw new IllegalArgumentException(
+ "addReplace(" + oldLength + ", " + newLength +
+ "): both lengths must be non-negative");
+ }
+ if (oldLength == 0 && newLength == 0) {
+ return;
+ }
+ int newDelta = newLength - oldLength;
+ if (newDelta != 0) {
+ if ((newDelta > 0 && delta >= 0 && newDelta > (Integer.MAX_VALUE - delta)) ||
+ (newDelta < 0 && delta < 0 && newDelta < (Integer.MIN_VALUE - delta))) {
+ // Integer overflow or underflow.
+ throw new IndexOutOfBoundsException();
+ }
+ delta += newDelta;
+ }
+
+ int head = 0x7000; // marker bits of a 0111mmmmmmnnnnnn record
+ if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
+ head |= oldLength << 6;
+ head |= newLength;
+ append(head); // both lengths fit into the head unit itself
+ } else if ((array.length - length) >= 5 || growArray()) { // head plus up to 2 trail units per length
+ int limit = length + 1; // trail units are written after the slot reserved for the head
+ if(oldLength < LENGTH_IN_1TRAIL) {
+ head |= oldLength << 6;
+ } else if(oldLength <= 0x7fff) {
+ head |= LENGTH_IN_1TRAIL << 6;
+ array[limit++] = (char)(0x8000 | oldLength);
+ } else {
+ head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6; // bit 30 of the length goes into the head
+ array[limit++] = (char)(0x8000 | (oldLength >> 15));
+ array[limit++] = (char)(0x8000 | oldLength);
+ }
+ if(newLength < LENGTH_IN_1TRAIL) {
+ head |= newLength;
+ } else if(newLength <= 0x7fff) {
+ head |= LENGTH_IN_1TRAIL;
+ array[limit++] = (char)(0x8000 | newLength);
+ } else {
+ head |= LENGTH_IN_2TRAIL + (newLength >> 30); // bit 30 of the length goes into the head
+ array[limit++] = (char)(0x8000 | (newLength >> 15));
+ array[limit++] = (char)(0x8000 | newLength);
+ }
+ array[length] = (char)head; // the head is stored last, once both length fields are set
+ length = limit;
+ }
+ }
+
+ private void append(int r) {
+     // Grow only when the array is full; growArray() either succeeds or throws.
+     boolean hasRoom = length < array.length;
+     if (!hasRoom) {
+         hasRoom = growArray();
+     }
+     if (hasRoom) {
+         array[length++] = (char)r;
+     }
+ }
+
+ private boolean growArray() {
+     final int oldCapacity = array.length;
+     if (oldCapacity == Integer.MAX_VALUE) {
+         throw new BufferOverflowException();
+     }
+     final int newCapacity;
+     if (oldCapacity == STACK_CAPACITY) {
+         // First growth step from the initial capacity.
+         newCapacity = 2000;
+     } else if (oldCapacity >= (Integer.MAX_VALUE / 2)) {
+         // Doubling would overflow, so clamp to the maximum int value.
+         newCapacity = Integer.MAX_VALUE;
+     } else {
+         newCapacity = 2 * oldCapacity;
+     }
+     // Grow by at least 5 units so that a maximal change record will fit.
+     if ((newCapacity - oldCapacity) < 5) {
+         throw new BufferOverflowException();
+     }
+     array = Arrays.copyOf(array, newCapacity);
+     return true;
+ }
+
+ /**
+ * How much longer is the new text compared with the old text?
+ * @return new length minus old length
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public int lengthDelta() { return delta; }
+ /**
+ * @return true if there are any change edits
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public boolean hasChanges() {
+     // A non-zero delta already proves a change; otherwise look for any unit that is
+     // not an unchanged-text record (those are all <= MAX_UNCHANGED).
+     boolean found = delta != 0;
+     for (int i = 0; !found && i < length; ++i) {
+         found = array[i] > MAX_UNCHANGED;
+     }
+     return found;
+ }
+
+ /**
+ * Access to the list of edits.
+ * @see #getCoarseIterator
+ * @see #getFineIterator
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public static final class Iterator {
+ private final char[] array;
+ private int index;
+ private final int length;
+ private int remaining;
+ private final boolean onlyChanges_, coarse;
+
+ private boolean changed;
+ private int oldLength_, newLength_;
+ private int srcIndex, replIndex, destIndex;
+
+ private Iterator(char[] a, int len, boolean oc, boolean crs) {
+ array = a;
+ length = len;
+ onlyChanges_ = oc;
+ coarse = crs;
+ }
+
+ private int readLength(int head) {
+ if (head < LENGTH_IN_1TRAIL) {
+ return head;
+ } else if (head < LENGTH_IN_2TRAIL) {
+ assert(index < length);
+ assert(array[index] >= 0x8000);
+ return array[index++] & 0x7fff;
+ } else {
+ assert((index + 2) <= length);
+ assert(array[index] >= 0x8000);
+ assert(array[index + 1] >= 0x8000);
+ int len = ((head & 1) << 30) |
+ ((array[index] & 0x7fff) << 15) |
+ (array[index + 1] & 0x7fff);
+ index += 2;
+ return len;
+ }
+ }
+
+ private void updateIndexes() {
+ srcIndex += oldLength_;
+ if (changed) {
+ replIndex += newLength_;
+ }
+ destIndex += newLength_;
+ }
+
+ private boolean noNext() {
+ // End of the edits: report an empty, unchanged span; always returns false.
+ changed = false;
+ oldLength_ = newLength_ = 0;
+ return false;
+ }
+
+ /**
+ * Advances to the next edit.
+ * @return true if there is another edit
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public boolean next() {
+ return next(onlyChanges_);
+ }
+
+ private boolean next(boolean onlyChanges) {
+ // Forward iteration: Advance the string indexes past the current span,
+ // then read array units to assemble the next span.
+ updateIndexes();
+ if (remaining > 0) {
+ // Fine-grained iterator: Continue a sequence of equal-length changes.
+ --remaining;
+ return true;
+ }
+ if (index >= length) {
+ return noNext();
+ }
+ int u = array[index++];
+ if (u <= MAX_UNCHANGED) {
+ // Combine adjacent unchanged ranges.
+ changed = false;
+ oldLength_ = u + 1;
+ while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
+ ++index;
+ oldLength_ += u + 1;
+ }
+ newLength_ = oldLength_;
+ if (onlyChanges) {
+ updateIndexes();
+ if (index >= length) {
+ return noNext();
+ }
+ // u already holds the unit at index, which is a change (u > MAX_UNCHANGED).
+ ++index;
+ } else {
+ return true;
+ }
+ }
+ changed = true;
+ if (u <= MAX_SHORT_CHANGE) {
+ if (coarse) {
+ int w = u >> 12;
+ int len = (u & 0xfff) + 1;
+ oldLength_ = newLength_ = len * w;
+ } else {
+ // Split a sequence of equal-length changes that was compressed into one unit.
+ oldLength_ = newLength_ = u >> 12;
+ remaining = u & 0xfff;
+ return true;
+ }
+ } else {
+ assert(u <= 0x7fff);
+ oldLength_ = readLength((u >> 6) & 0x3f);
+ newLength_ = readLength(u & 0x3f);
+ if (!coarse) {
+ return true;
+ }
+ }
+ // Combine adjacent changes.
+ while (index < length && (u = array[index]) > MAX_UNCHANGED) {
+ ++index;
+ if (u <= MAX_SHORT_CHANGE) {
+ int w = u >> 12;
+ int len = (u & 0xfff) + 1;
+ len = len * w;
+ oldLength_ += len;
+ newLength_ += len;
+ } else {
+ assert(u <= 0x7fff);
+ int oldLen = readLength((u >> 6) & 0x3f);
+ int newLen = readLength(u & 0x3f);
+ oldLength_ += oldLen;
+ newLength_ += newLen;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Finds the edit that contains the source index.
+ * The source index may be found in a non-change
+ * even if normal iteration would skip non-changes.
+ * Normal iteration can continue from a found edit.
+ *
+ * <p>The iterator state before this search logically does not matter.
+ * (It may affect the performance of the search.)
+ *
+ * <p>The iterator state after this search is undefined
+ * if the source index is out of bounds for the source string.
+ *
+ * @param i source index
+ * @return true if the edit for the source index was found
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public boolean findSourceIndex(int i) {
+ if (i < 0) { return false; }
+ if (i < srcIndex) {
+ // Reset the iterator to the start.
+ index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
+ } else if (i < (srcIndex + oldLength_)) {
+ // The index is in the current span.
+ return true;
+ }
+ while (next(false)) {
+ if (i < (srcIndex + oldLength_)) {
+ // The index is in the current span.
+ return true;
+ }
+ if (remaining > 0) {
+ // Is the index in one of the remaining compressed edits?
+ // srcIndex is the start of the current span, before the remaining ones.
+ int len = (remaining + 1) * oldLength_;
+ if (i < (srcIndex + len)) {
+ int n = (i - srcIndex) / oldLength_; // 1 <= n <= remaining
+ len = n * oldLength_;
+ srcIndex += len;
+ replIndex += len;
+ destIndex += len;
+ remaining -= n;
+ return true;
+ }
+ // Make next() skip all of these edits at once.
+ oldLength_ = newLength_ = len;
+ remaining = 0;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * @return true if this edit replaces oldLength() units with newLength() different ones;
+ * false if oldLength() units remain unchanged.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public boolean hasChange() { return changed; }
+ /**
+ * @return the number of units in the original string which are replaced or remain unchanged.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public int oldLength() { return oldLength_; }
+ /**
+ * @return the number of units in the modified string, if hasChange() is true.
+ * Same as oldLength if hasChange() is false.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public int newLength() { return newLength_; }
+
+ /**
+ * @return the current index into the source string
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public int sourceIndex() { return srcIndex; }
+ /**
+ * @return the current index into the replacement-characters-only string,
+ * not counting unchanged spans
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public int replacementIndex() { return replIndex; }
+ /**
+ * @return the current index into the full destination string
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public int destinationIndex() { return destIndex; }
+ };
+
+ /**
+ * Returns an Iterator for coarse-grained changes for simple string updates.
+ * Skips non-changes.
+ * @return an Iterator that merges adjacent changes.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public Iterator getCoarseChangesIterator() {
+ return new Iterator(array, length, true, true);
+ }
+
+ /**
+ * Returns an Iterator for coarse-grained changes and non-changes for simple string updates.
+ * @return an Iterator that merges adjacent changes.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public Iterator getCoarseIterator() {
+ return new Iterator(array, length, false, true);
+ }
+
+ /**
+ * Returns an Iterator for fine-grained changes for modifying styled text.
+ * Skips non-changes.
+ * @return an Iterator that separates adjacent changes.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public Iterator getFineChangesIterator() {
+ return new Iterator(array, length, true, false);
+ }
+
+ /**
+ * Returns an Iterator for fine-grained changes and non-changes for modifying styled text.
+ * @return an Iterator that separates adjacent changes.
+ * @draft ICU 59
+ * @provisional This API might change or be removed in a future release.
+ */
+ public Iterator getFineIterator() {
+ return new Iterator(array, length, false, false);
+ }
+}
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
index 7d700d0fe..106259f41 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@@ -3866,7 +3866,6 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
int n = getRangeCount();
int result;
StringBuilder full = new StringBuilder();
- int locCache[] = new int[1];
for (int i=0; i<n; ++i) {
int start = getRangeStart(i);
@@ -3881,13 +3880,13 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
// add case mappings
// (does not add long s for regular s, or Kelvin for k, for example)
for (int cp=start; cp<=end; ++cp) {
- result = csp.toFullLower(cp, null, full, root, locCache);
+ result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
addCaseMapping(foldSet, result, full);
- result = csp.toFullTitle(cp, null, full, root, locCache);
+ result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
addCaseMapping(foldSet, result, full);
- result = csp.toFullUpper(cp, null, full, root, locCache);
+ result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
addCaseMapping(foldSet, result, full);
result = csp.toFullFolding(cp, full, 0);
@@ -3906,6 +3905,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
} else {
BreakIterator bi = BreakIterator.getWordInstance(root);
for (String str : strings) {
+ // TODO: call lower-level functions
foldSet.add(UCharacter.toLowerCase(root, str));
foldSet.add(UCharacter.toTitleCase(root, str, bi));
foldSet.add(UCharacter.toUpperCase(root, str));
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java
index 95bb60b32..dfed35266 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java
@@ -44,7 +44,7 @@ class LowercaseTransliterator extends Transliterator{
private final UCaseProps csp;
private ReplaceableContextIterator iter;
private StringBuilder result;
- private int[] locCache;
+ private int caseLocale;
/**
* Constructs a transliterator.
@@ -56,8 +56,7 @@ class LowercaseTransliterator extends Transliterator{
csp=UCaseProps.INSTANCE;
iter=new ReplaceableContextIterator();
result = new StringBuilder();
- locCache = new int[1];
- locCache[0]=0;
+ caseLocale = UCaseProps.getCaseLocale(locale);
}
/**
@@ -85,7 +84,7 @@ class LowercaseTransliterator extends Transliterator{
iter.setLimit(offsets.limit);
iter.setContextLimits(offsets.contextStart, offsets.contextLimit);
while((c=iter.nextCaseMapCP())>=0) {
- c=csp.toFullLower(c, iter, result, locale, locCache);
+ c=csp.toFullLower(c, iter, result, caseLocale);
if(iter.didReachLimit() && isIncremental) {
// the case mapping function tried to look beyond the context limit
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java
index d3dc29681..96f11c8e2 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java
@@ -42,7 +42,7 @@ class TitlecaseTransliterator extends Transliterator {
private final UCaseProps csp;
private ReplaceableContextIterator iter;
private StringBuilder result;
- private int[] locCache;
+ private int caseLocale;
/**
* Constructs a transliterator.
@@ -55,8 +55,7 @@ class TitlecaseTransliterator extends Transliterator {
csp=UCaseProps.INSTANCE;
iter=new ReplaceableContextIterator();
result = new StringBuilder();
- locCache = new int[1];
- locCache[0]=0;
+ caseLocale = UCaseProps.getCaseLocale(locale);
}
/**
@@ -119,9 +118,9 @@ class TitlecaseTransliterator extends Transliterator {
type=csp.getTypeOrIgnorable(c);
if(type>=0) { // not case-ignorable
if(doTitle) {
- c=csp.toFullTitle(c, iter, result, locale, locCache);
+ c=csp.toFullTitle(c, iter, result, caseLocale);
} else {
- c=csp.toFullLower(c, iter, result, locale, locCache);
+ c=csp.toFullLower(c, iter, result, caseLocale);
}
doTitle = type==0; // doTitle=isUncased
diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java
index 77e2dfd70..bd9e3fed3 100644
--- a/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java
+++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java
@@ -41,7 +41,7 @@ class UppercaseTransliterator extends Transliterator {
private final UCaseProps csp;
private ReplaceableContextIterator iter;
private StringBuilder result;
- private int[] locCache;
+ private int caseLocale;
/**
* Constructs a transliterator.
@@ -52,8 +52,7 @@ class UppercaseTransliterator extends Transliterator {
csp=UCaseProps.INSTANCE;
iter=new ReplaceableContextIterator();
result = new StringBuilder();
- locCache = new int[1];
- locCache[0]=0;
+ caseLocale = UCaseProps.getCaseLocale(locale);
}
/**
@@ -81,7 +80,7 @@ class UppercaseTransliterator extends Transliterator {
iter.setLimit(offsets.limit);
iter.setContextLimits(offsets.contextStart, offsets.contextLimit);
while((c=iter.nextCaseMapCP())>=0) {
- c=csp.toFullUpper(c, iter, result, locale, locCache);
+ c=csp.toFullUpper(c, iter, result, caseLocale);
if(iter.didReachLimit() && isIncremental) {
// the case mapping function tried to look beyond the context limit
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java
index 7ac358b51..6f8a67983 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java
@@ -24,6 +24,8 @@ import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.CaseMap;
+import com.ibm.icu.text.Edits;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.util.ULocale;
@@ -708,6 +710,191 @@ public final class UCharacterCaseTest extends TestFmwk
assertGreekUpper("ρωμέικα", "ΡΩΜΕΪΚΑ");
}
+ private static final class EditChange {
+ private boolean change;
+ private int oldLength, newLength;
+ EditChange(boolean change, int oldLength, int newLength) {
+ this.change = change;
+ this.oldLength = oldLength;
+ this.newLength = newLength;
+ }
+ }
+
+ private static void checkEditsIter(
+ String name, Edits.Iterator ei1, Edits.Iterator ei2, // two equal iterators
+ EditChange[] expected, boolean withUnchanged) {
+ assertFalse(name, ei2.findSourceIndex(-1));
+
+ int expSrcIndex = 0;
+ int expDestIndex = 0;
+ int expReplIndex = 0;
+ for (int expIndex = 0; expIndex < expected.length; ++expIndex) {
+ EditChange expect = expected[expIndex];
+ String msg = name + ' ' + expIndex;
+ if (withUnchanged || expect.change) {
+ assertTrue(msg, ei1.next());
+ assertEquals(msg, expect.change, ei1.hasChange());
+ assertEquals(msg, expect.oldLength, ei1.oldLength());
+ assertEquals(msg, expect.newLength, ei1.newLength());
+ assertEquals(msg, expSrcIndex, ei1.sourceIndex());
+ assertEquals(msg, expDestIndex, ei1.destinationIndex());
+ assertEquals(msg, expReplIndex, ei1.replacementIndex());
+ }
+
+ if (expect.oldLength > 0) {
+ assertTrue(msg, ei2.findSourceIndex(expSrcIndex));
+ assertEquals(msg, expect.change, ei2.hasChange());
+ assertEquals(msg, expect.oldLength, ei2.oldLength());
+ assertEquals(msg, expect.newLength, ei2.newLength());
+ assertEquals(msg, expSrcIndex, ei2.sourceIndex());
+ assertEquals(msg, expDestIndex, ei2.destinationIndex());
+ assertEquals(msg, expReplIndex, ei2.replacementIndex());
+ if (!withUnchanged) {
+ // For some iterators, move past the current range
+ // so that findSourceIndex() has to look before the current index.
+ ei2.next();
+ ei2.next();
+ }
+ }
+
+ expSrcIndex += expect.oldLength;
+ expDestIndex += expect.newLength;
+ if (expect.change) {
+ expReplIndex += expect.newLength;
+ }
+ }
+ String msg = name + " end";
+ assertFalse(msg, ei1.next());
+ assertFalse(msg, ei1.hasChange());
+ assertEquals(msg, 0, ei1.oldLength());
+ assertEquals(msg, 0, ei1.newLength());
+ assertEquals(msg, expSrcIndex, ei1.sourceIndex());
+ assertEquals(msg, expDestIndex, ei1.destinationIndex());
+ assertEquals(msg, expReplIndex, ei1.replacementIndex());
+
+ assertFalse(name, ei2.findSourceIndex(expSrcIndex));
+ }
+
+ @Test
+ public void TestEdits() {
+ Edits edits = new Edits();
+ assertFalse("new Edits", edits.hasChanges());
+ assertEquals("new Edits", 0, edits.lengthDelta());
+ edits.addUnchanged(1); // multiple unchanged ranges are combined
+ edits.addUnchanged(10000); // too long, and they are split
+ edits.addReplace(0, 0);
+ edits.addUnchanged(2);
+ assertFalse("unchanged 10003", edits.hasChanges());
+ assertEquals("unchanged 10003", 0, edits.lengthDelta());
+ edits.addReplace(1, 1); // multiple short equal-length edits are compressed
+ edits.addUnchanged(0);
+ edits.addReplace(1, 1);
+ edits.addReplace(1, 1);
+ edits.addReplace(0, 10);
+ edits.addReplace(100, 0);
+ edits.addReplace(3000, 4000); // variable-length encoding
+ edits.addReplace(100000, 100000);
+ assertTrue("some edits", edits.hasChanges());
+ assertEquals("some edits", 10 - 100 + 1000, edits.lengthDelta());
+
+ EditChange[] coarseExpectedChanges = new EditChange[] {
+ new EditChange(false, 10003, 10003),
+ new EditChange(true, 103103, 104013)
+ };
+ checkEditsIter("coarse",
+ edits.getCoarseIterator(), edits.getCoarseIterator(),
+ coarseExpectedChanges, true);
+ checkEditsIter("coarse changes",
+ edits.getCoarseChangesIterator(), edits.getCoarseChangesIterator(),
+ coarseExpectedChanges, false);
+
+ EditChange[] fineExpectedChanges = new EditChange[] {
+ new EditChange(false, 10003, 10003),
+ new EditChange(true, 1, 1),
+ new EditChange(true, 1, 1),
+ new EditChange(true, 1, 1),
+ new EditChange(true, 0, 10),
+ new EditChange(true, 100, 0),
+ new EditChange(true, 3000, 4000),
+ new EditChange(true, 100000, 100000)
+ };
+ checkEditsIter("fine",
+ edits.getFineIterator(), edits.getFineIterator(),
+ fineExpectedChanges, true);
+ checkEditsIter("fine changes",
+ edits.getFineChangesIterator(), edits.getFineChangesIterator(),
+ fineExpectedChanges, false);
+
+ edits.reset();
+ assertFalse("reset", edits.hasChanges());
+ assertEquals("reset", 0, edits.lengthDelta());
+ Edits.Iterator ei = edits.getCoarseChangesIterator();
+ assertFalse("reset then iterator", ei.next());
+ }
+
+ @Test
+ public void TestCaseMapWithEdits() {
+ StringBuilder sb = new StringBuilder();
+ Edits edits = new Edits();
+
+ sb = CaseMap.toLower().omitUnchangedText().apply(TURKISH_LOCALE_, "IstanBul", sb, edits);
+ assertEquals("toLower(Istanbul)", "ıb", sb.toString());
+ EditChange[] lowerExpectedChanges = new EditChange[] {
+ new EditChange(true, 1, 1),
+ new EditChange(false, 4, 4),
+ new EditChange(true, 1, 1),
+ new EditChange(false, 2, 2)
+ };
+ checkEditsIter("toLower(Istanbul)",
+ edits.getFineIterator(), edits.getFineIterator(),
+ lowerExpectedChanges, true);
+
+ sb.delete(0, sb.length());
+ edits.reset();
+ sb = CaseMap.toUpper().omitUnchangedText().apply(GREEK_LOCALE_, "Πατάτα", sb, edits);
+ assertEquals("toUpper(Πατάτα)", "ΑΤΑΤΑ", sb.toString());
+ EditChange[] upperExpectedChanges = new EditChange[] {
+ new EditChange(false, 1, 1),
+ new EditChange(true, 1, 1),
+ new EditChange(true, 1, 1),
+ new EditChange(true, 1, 1),
+ new EditChange(true, 1, 1),
+ new EditChange(true, 1, 1)
+ };
+ checkEditsIter("toUpper(Πατάτα)",
+ edits.getFineIterator(), edits.getFineIterator(),
+ upperExpectedChanges, true);
+
+ sb.delete(0, sb.length());
+ edits.reset();
+ sb = CaseMap.toTitle().omitUnchangedText().noBreakAdjustment().noLowercase().apply(
+ new Locale("nl"), null, "IjssEL IglOo", sb, edits);
+ assertEquals("toTitle(IjssEL IglOo)", "J", sb.toString());
+ EditChange[] titleExpectedChanges = new EditChange[] {
+ new EditChange(false, 1, 1),
+ new EditChange(true, 1, 1),
+ new EditChange(false, 10, 10)
+ };
+ checkEditsIter("toTitle(IjssEL IglOo)",
+ edits.getFineIterator(), edits.getFineIterator(),
+ titleExpectedChanges, true);
+
+ sb.delete(0, sb.length());
+ edits.reset();
+ sb = CaseMap.fold().omitUnchangedText().turkic().apply("IßtanBul", sb, edits);
+ assertEquals("fold(IßtanBul)", "ıssb", sb.toString());
+ EditChange[] foldExpectedChanges = new EditChange[] {
+ new EditChange(true, 1, 1),
+ new EditChange(true, 1, 2),
+ new EditChange(false, 3, 3),
+ new EditChange(true, 1, 1),
+ new EditChange(false, 2, 2)
+ };
+ checkEditsIter("fold(IßtanBul)",
+ edits.getFineIterator(), edits.getFineIterator(),
+ foldExpectedChanges, true);
+ }
+
// private data members - test data --------------------------------------
private static final Locale TURKISH_LOCALE_ = new Locale("tr", "TR");
@@ -945,7 +1132,7 @@ public final class UCharacterCaseTest extends TestFmwk
// private methods -------------------------------------------------------
/**
- * Converting the hex numbers represented betwee n ';' to Unicode strings
+ * Converting the hex numbers represented between ';' to Unicode strings
* @param str string to break up into Unicode strings
* @return array of Unicode strings ending with a null
*/