diff options
author | Markus Scherer <markus.icu@gmail.com> | 2017-02-17 21:03:35 +0000 |
---|---|---|
committer | Joachim Sauer <jsauer@google.com> | 2017-04-04 23:32:27 +0100 |
commit | 4dc01285b26769ed1abfcea2bff97fd9e99ccc61 (patch) | |
tree | 8e9b8ac1d6aee56613b178b98353fa5a42c70f37 | |
parent | 2a96d553055c7ac571d0db719bbe5c81ab03abaa (diff) | |
download | icu-4dc01285b26769ed1abfcea2bff97fd9e99ccc61.tar.gz |
Cherry-pick: ticket:12410: class Edits, class CaseMap with new low-level functions that work with Edits, simpler case properties code, some cleanup
This is the part of ICU changeset 39684 that affects the icu4j/
subdirectory, leaving the icu4c/ subdirectory unchanged:
http://bugs.icu-project.org/trac/changeset/39684
Bug: 19047649
Test: mmma libcore external/icu
Test: ant check
Test: CtsIcuTestCases
Test: CtsLibcoreOjTestCases
Test: CtsLibcoreTestCases
(cherry picked from commit 63cafec8b8cb135e7c06ef6b9fc8c128ed55b140)
Change-Id: I2280e0376253abe1af6671a02c9b1d056c099949
-rw-r--r-- | icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java (renamed from icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMap.java) | 340 | ||||
-rw-r--r-- | icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java | 263 | ||||
-rw-r--r-- | icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java | 288 | ||||
-rw-r--r-- | icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java | 339 | ||||
-rw-r--r-- | icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java | 494 | ||||
-rw-r--r-- | icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java | 8 | ||||
-rw-r--r-- | icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java | 7 | ||||
-rw-r--r-- | icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java | 9 | ||||
-rw-r--r-- | icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java | 7 | ||||
-rw-r--r-- | icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java | 189 |
10 files changed, 1589 insertions, 355 deletions
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMap.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java index 0d1c259b9..f28e60ed5 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMap.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java @@ -2,9 +2,14 @@ // License & terms of use: http://www.unicode.org/copyright.html#License package com.ibm.icu.impl; -import com.ibm.icu.util.ULocale; +import java.io.IOException; -public final class CaseMap { +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.Edits; +import com.ibm.icu.util.ICUUncheckedIOException; + +public final class CaseMapImpl { /** * Implementation of UCaseProps.ContextIterator, iterates over a String. * See ustrcase.c/utf16_caseContextIterator(). @@ -12,11 +17,11 @@ public final class CaseMap { public static final class StringContextIterator implements UCaseProps.ContextIterator { /** * Constructor. - * @param s String to iterate over. + * @param src String to iterate over. 
*/ - public StringContextIterator(String s) { - this.s=s; - limit=s.length(); + public StringContextIterator(CharSequence src) { + this.s=src; + limit=src.length(); cpStart=cpLimit=index=0; dir=0; } @@ -60,7 +65,7 @@ public final class CaseMap { public int nextCaseMapCP() { cpStart=cpLimit; if(cpLimit<limit) { - int c=s.codePointAt(cpLimit); + int c=Character.codePointAt(s, cpLimit); cpLimit+=Character.charCount(c); return c; } else { @@ -84,6 +89,10 @@ public final class CaseMap { return cpLimit; } + public int getCPLength() { + return cpLimit-cpStart; + } + // implement UCaseProps.ContextIterator // The following code is not used anywhere in this private class @Override @@ -108,11 +117,11 @@ public final class CaseMap { int c; if(dir>0 && index<s.length()) { - c=s.codePointAt(index); + c=Character.codePointAt(s, index); index+=Character.charCount(c); return c; } else if(dir<0 && index>0) { - c=s.codePointBefore(index); + c=Character.codePointBefore(s, index); index-=Character.charCount(c); return c; } @@ -120,44 +129,242 @@ public final class CaseMap { } // variables - protected String s; + protected CharSequence s; protected int index, limit, cpStart, cpLimit; protected int dir; // 0=initial state >0=forward <0=backward } - /** Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}. */ - private static final void appendResult(int c, StringBuilder result) { + /** + * Omit unchanged text when case-mapping with Edits. + */ + public static final int OMIT_UNCHANGED_TEXT = 0x4000; + + private static int appendCodePoint(Appendable a, int c) throws IOException { + if (c <= Character.MAX_VALUE) { + a.append((char)c); + return 1; + } else { + a.append((char)(0xd7c0 + (c >> 10))); + a.append((char)(Character.MIN_LOW_SURROGATE + (c & 0x3ff))); + return 2; + } + } + + /** + * Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}. 
+ * @throws IOException + */ + private static void appendResult(int result, Appendable dest, + int cpLength, int options, Edits edits) throws IOException { // Decode the result. - if (c < 0) { + if (result < 0) { // (not) original code point - result.appendCodePoint(~c); - } else if (c <= UCaseProps.MAX_STRING_LENGTH) { + if (edits != null) { + edits.addUnchanged(cpLength); + if ((options & OMIT_UNCHANGED_TEXT) != 0) { + return; + } + } + appendCodePoint(dest, ~result); + } else if (result <= UCaseProps.MAX_STRING_LENGTH) { // The mapping has already been appended to result. + if (edits != null) { + edits.addReplace(cpLength, result); + } } else { // Append the single-code point mapping. - result.appendCodePoint(c); + int length = appendCodePoint(dest, result); + if (edits != null) { + edits.addReplace(cpLength, length); + } } } - // TODO: Move the other string case mapping functions from UCharacter to here, too. + private static final void appendUnchanged(CharSequence src, int start, int length, + Appendable dest, int options, Edits edits) throws IOException { + if (length > 0) { + if (edits != null) { + edits.addUnchanged(length); + if ((options & OMIT_UNCHANGED_TEXT) != 0) { + return; + } + } + dest.append(src, start, start + length); + } + } - public static String toUpper(ULocale locale, String str) { - if (locale == null) { - locale = ULocale.getDefault(); + private static void internalToLower(int caseLocale, int options, StringContextIterator iter, + Appendable dest, Edits edits) throws IOException { + int c; + while ((c = iter.nextCaseMapCP()) >= 0) { + c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale); + appendResult(c, dest, iter.getCPLength(), options, edits); } - int[] locCache = new int[] { UCaseProps.getCaseLocale(locale, null) }; - if (locCache[0] == UCaseProps.LOC_GREEK) { - return GreekUpper.toUpper(str, locCache); + } + + public static <A extends Appendable> A toLower(int caseLocale, int options, + CharSequence src, A dest, Edits 
edits) { + try { + if (edits != null) { + edits.reset(); + } + StringContextIterator iter = new StringContextIterator(src); + internalToLower(caseLocale, options, iter, dest, edits); + return dest; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); } + } - StringContextIterator iter = new StringContextIterator(str); - StringBuilder result = new StringBuilder(str.length()); - int c; - while((c=iter.nextCaseMapCP())>=0) { - c = UCaseProps.INSTANCE.toFullUpper(c, iter, result, locale, locCache); - appendResult(c, result); + public static <A extends Appendable> A toUpper(int caseLocale, int options, + CharSequence src, A dest, Edits edits) { + try { + if (edits != null) { + edits.reset(); + } + if (caseLocale == UCaseProps.LOC_GREEK) { + return GreekUpper.toUpper(options, src, dest, edits); + } + StringContextIterator iter = new StringContextIterator(src); + int c; + while ((c = iter.nextCaseMapCP()) >= 0) { + c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale); + appendResult(c, dest, iter.getCPLength(), options, edits); + } + return dest; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } + } + + public static <A extends Appendable> A toTitle( + int caseLocale, int options, BreakIterator titleIter, + CharSequence src, A dest, Edits edits) { + try { + if (edits != null) { + edits.reset(); + } + + /* set up local variables */ + StringContextIterator iter = new StringContextIterator(src); + int srcLength = src.length(); + int prev=0; + boolean isFirstIndex=true; + + /* titlecasing loop */ + while(prev<srcLength) { + /* find next index where to titlecase */ + int index; + if(isFirstIndex) { + isFirstIndex=false; + index=titleIter.first(); + } else { + index=titleIter.next(); + } + if(index==BreakIterator.DONE || index>srcLength) { + index=srcLength; + } + + /* + * Unicode 4 & 5 section 3.13 Default Case Operations: + * + * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex + * #29, "Text 
Boundaries." Between each pair of word boundaries, find the first + * cased character F. If F exists, map F to default_title(F); then map each + * subsequent character C to default_lower(C). + * + * In this implementation, segment [prev..index[ into 3 parts: + * a) uncased characters (copy as-is) [prev..titleStart[ + * b) first case letter (titlecase) [titleStart..titleLimit[ + * c) subsequent characters (lowercase) [titleLimit..index[ + */ + if(prev<index) { + // find and copy uncased characters [prev..titleStart[ + int titleStart=prev; + iter.setLimit(index); + int c=iter.nextCaseMapCP(); + if((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0 + && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) { + // Adjust the titlecasing index (titleStart) to the next cased character. + while((c=iter.nextCaseMapCP())>=0 + && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {} + // If c<0 then we have only uncased characters in [prev..index[ + // and stopped with titleStart==titleLimit==index. + titleStart=iter.getCPStart(); + appendUnchanged(src, prev, titleStart-prev, dest, options, edits); + } + + if(titleStart<index) { + int titleLimit=iter.getCPLimit(); + // titlecase c which is from [titleStart..titleLimit[ + c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale); + appendResult(c, dest, iter.getCPLength(), options, edits); + + // Special case Dutch IJ titlecasing + if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) { + char c1 = src.charAt(titleStart); + if ((c1 == 'i' || c1 == 'I')) { + char c2 = src.charAt(titleStart+1); + if (c2 == 'j') { + dest.append('J'); + if (edits != null) { + edits.addReplace(1, 1); + } + c = iter.nextCaseMapCP(); + titleLimit++; + assert c == c2; + assert titleLimit == iter.getCPLimit(); + } else if (c2 == 'J') { + // Keep the capital J from getting lowercased. 
+ appendUnchanged(src, titleStart + 1, 1, dest, options, edits); + c = iter.nextCaseMapCP(); + titleLimit++; + assert c == c2; + assert titleLimit == iter.getCPLimit(); + } + } + } + + // lowercase [titleLimit..index[ + if(titleLimit<index) { + if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) { + // Normal operation: Lowercase the rest of the word. + internalToLower(caseLocale, options, iter, dest, edits); + } else { + // Optionally just copy the rest of the word unchanged. + appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits); + iter.moveToLimit(); + } + } + } + } + + prev=index; + } + return dest; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } + } + + public static <A extends Appendable> A fold(int options, + CharSequence src, A dest, Edits edits) { + try { + if (edits != null) { + edits.reset(); + } + int length = src.length(); + for (int i = 0; i < length;) { + int c = Character.codePointAt(src, i); + int cpLength = Character.charCount(c); + i += cpLength; + c = UCaseProps.INSTANCE.toFullFolding(c, dest, options); + appendResult(c, dest, cpLength, options, edits); + } + return dest; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); } - return result.toString(); } private static final class GreekUpper { @@ -661,12 +868,13 @@ public final class CaseMap { * TODO: Try to re-consolidate one way or another with the non-Greek function. * * <p>Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8). 
+ * @throws IOException */ - private static String toUpper(CharSequence s, int[] locCache) { - StringBuilder result = new StringBuilder(s.length()); + private static <A extends Appendable> A toUpper(int options, + CharSequence src, A dest, Edits edits) throws IOException { int state = 0; - for (int i = 0; i < s.length();) { - int c = Character.codePointAt(s, i); + for (int i = 0; i < src.length();) { + int c = Character.codePointAt(src, i); int nextIndex = i + Character.charCount(c); int nextState = 0; int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c); @@ -695,8 +903,8 @@ public final class CaseMap { numYpogegrammeni = 1; } // Skip combining diacritics after this Greek letter. - while (nextIndex < s.length()) { - int diacriticData = getDiacriticData(s.charAt(nextIndex)); + while (nextIndex < src.length()) { + int diacriticData = getDiacriticData(src.charAt(nextIndex)); if (diacriticData != 0) { data |= diacriticData; if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) { @@ -716,7 +924,7 @@ public final class CaseMap { (data & HAS_ACCENT) != 0 && numYpogegrammeni == 0 && (state & AFTER_CASED) == 0 && - !isFollowedByCasedLetter(s, nextIndex)) { + !isFollowedByCasedLetter(src, nextIndex)) { // Keep disjunctive "or" with (only) a tonos. // We use the same "word boundary" conditions as for the Final_Sigma test. if (i == nextIndex) { @@ -734,25 +942,59 @@ public final class CaseMap { data &= ~HAS_EITHER_DIALYTIKA; } } - result.appendCodePoint(upper); - if ((data & HAS_EITHER_DIALYTIKA) != 0) { - result.append('\u0308'); // restore or add a dialytika - } - if (addTonos) { - result.append('\u0301'); + + boolean change; + if (edits == null) { + change = true; // common, simple usage + } else { + // Find out first whether we are changing the text. 
+ change = src.charAt(i) != upper || numYpogegrammeni > 0; + int i2 = i + 1; + if ((data & HAS_EITHER_DIALYTIKA) != 0) { + change |= i2 >= nextIndex || src.charAt(i2) != 0x308; + ++i2; + } + if (addTonos) { + change |= i2 >= nextIndex || src.charAt(i2) != 0x301; + ++i2; + } + int oldLength = nextIndex - i; + int newLength = (i2 - i) + numYpogegrammeni; + change |= oldLength != newLength; + if (change) { + if (edits != null) { + edits.addReplace(oldLength, newLength); + } + } else { + if (edits != null) { + edits.addUnchanged(oldLength); + } + // Write unchanged text? + change = (options & OMIT_UNCHANGED_TEXT) == 0; + } } - while (numYpogegrammeni > 0) { - result.append('Ι'); - --numYpogegrammeni; + + if (change) { + dest.append((char)upper); + if ((data & HAS_EITHER_DIALYTIKA) != 0) { + dest.append('\u0308'); // restore or add a dialytika + } + if (addTonos) { + dest.append('\u0301'); + } + while (numYpogegrammeni > 0) { + dest.append('Ι'); + --numYpogegrammeni; + } } } else { - c = UCaseProps.INSTANCE.toFullUpper(c, null, result, null, locCache); - appendResult(c, result); + c = UCaseProps.INSTANCE.toFullUpper(c, null, dest, UCaseProps.LOC_GREEK); + appendResult(c, dest, nextIndex - i, options, edits); } i = nextIndex; state = nextState; } - return result.toString(); + return dest; } } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java index 927cdc03c..6b5619d23 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java @@ -24,6 +24,7 @@ package com.ibm.icu.impl; import java.io.IOException; import java.nio.ByteBuffer; import java.util.Iterator; +import java.util.Locale; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; @@ -71,7 +72,7 @@ public final class UCaseProps { // read exceptions[] count=indexes[IX_EXC_LENGTH]; if(count>0) { - exceptions=ICUBinary.getChars(bytes, count, 
0); + exceptions=ICUBinary.getString(bytes, count, 0); } // read unfold[] @@ -150,7 +151,7 @@ public final class UCaseProps { * * @param excWord (in) initial exceptions word * @param index (in) desired slot index - * @param excOffset (in) offset into exceptions[] after excWord=exceptions[excOffset++]; + * @param excOffset (in) offset into exceptions[] after excWord=exceptions.charAt(excOffset++); * @return bits 31..0: slot value * 63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot */ @@ -158,11 +159,11 @@ public final class UCaseProps { long value; if((excWord&EXC_DOUBLE_SLOTS)==0) { excOffset+=slotOffset(excWord, index); - value=exceptions[excOffset]; + value=exceptions.charAt(excOffset); } else { excOffset+=2*slotOffset(excWord, index); - value=exceptions[excOffset++]; - value=(value<<16)|exceptions[excOffset]; + value=exceptions.charAt(excOffset++); + value=(value<<16)|exceptions.charAt(excOffset); } return value |((long)excOffset<<32); } @@ -172,11 +173,11 @@ public final class UCaseProps { int value; if((excWord&EXC_DOUBLE_SLOTS)==0) { excOffset+=slotOffset(excWord, index); - value=exceptions[excOffset]; + value=exceptions.charAt(excOffset); } else { excOffset+=2*slotOffset(excWord, index); - value=exceptions[excOffset++]; - value=(value<<16)|exceptions[excOffset]; + value=exceptions.charAt(excOffset++); + value=(value<<16)|exceptions.charAt(excOffset); } return value; } @@ -191,7 +192,7 @@ public final class UCaseProps { } } else { int excOffset=getExceptionsOffset(props); - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); if(hasSlot(excWord, EXC_LOWER)) { c=getSlotValue(excWord, EXC_LOWER, excOffset); } @@ -207,7 +208,7 @@ public final class UCaseProps { } } else { int excOffset=getExceptionsOffset(props); - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); if(hasSlot(excWord, EXC_UPPER)) { c=getSlotValue(excWord, EXC_UPPER, excOffset); } @@ 
-223,7 +224,7 @@ public final class UCaseProps { } } else { int excOffset=getExceptionsOffset(props); - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); int index; if(hasSlot(excWord, EXC_TITLE)) { index=EXC_TITLE; @@ -291,7 +292,7 @@ public final class UCaseProps { */ int excOffset0, excOffset=getExceptionsOffset(props); int closureOffset; - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); int index, closureLength, fullLength, length; excOffset0=excOffset; @@ -334,7 +335,7 @@ public final class UCaseProps { /* add the full case folding string */ length=fullLength&0xf; if(length!=0) { - set.add(new String(exceptions, excOffset, length)); + set.add(exceptions.substring(excOffset, excOffset+length)); excOffset+=length; } @@ -348,8 +349,9 @@ public final class UCaseProps { } /* add each code point in the closure string */ - for(index=0; index<closureLength; index+=UTF16.getCharCount(c)) { - c=UTF16.charAt(exceptions, closureOffset, exceptions.length, index); + int limit=closureOffset+closureLength; + for(index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) { + c=exceptions.codePointAt(index); set.add(c); } } @@ -468,7 +470,7 @@ public final class UCaseProps { if(!propsHasException(props)) { return props&DOT_MASK; } else { - return (exceptions[getExceptionsOffset(props)]>>EXC_DOT_SHIFT)&DOT_MASK; + return (exceptions.charAt(getExceptionsOffset(props))>>EXC_DOT_SHIFT)&DOT_MASK; } } @@ -605,38 +607,49 @@ public final class UCaseProps { */ public static final int MAX_STRING_LENGTH=0x1f; - private static final int LOC_UNKNOWN=0; - private static final int LOC_ROOT=1; + //ivate static final int LOC_UNKNOWN=0; + public static final int LOC_ROOT=1; private static final int LOC_TURKISH=2; private static final int LOC_LITHUANIAN=3; static final int LOC_GREEK=4; + public static final int LOC_DUTCH=5; - /* - * Checks and caches the type of locale ID as it is relevant for case mapping. 
- * If the locCache is not null, then it must be initialized with locCache[0]=0 . - */ - static final int getCaseLocale(ULocale locale, int[] locCache) { - int result; - - if(locCache!=null && (result=locCache[0])!=LOC_UNKNOWN) { - return result; - } - - result=LOC_ROOT; - - String language=locale.getLanguage(); - if(language.equals("tr") || language.equals("tur") || language.equals("az") || language.equals("aze")) { - result=LOC_TURKISH; - } else if(language.equals("el") || language.equals("ell")) { - result=LOC_GREEK; - } else if(language.equals("lt") || language.equals("lit")) { - result=LOC_LITHUANIAN; - } - - if(locCache!=null) { - locCache[0]=result; + public static final int getCaseLocale(Locale locale) { + return getCaseLocale(locale.getLanguage()); + } + public static final int getCaseLocale(ULocale locale) { + return getCaseLocale(locale.getLanguage()); + } + /** Accepts both 2- and 3-letter language subtags. */ + private static final int getCaseLocale(String language) { + // Check the subtag length to reduce the number of comparisons + // for locales without special behavior. + // Fastpath for English "en" which is often used for default (=root locale) case mappings, + // and for Chinese "zh": Very common but no special case mapping behavior. 
+ if(language.length()==2) { + if(language.equals("en") || language.charAt(0)>'t') { + return LOC_ROOT; + } else if(language.equals("tr") || language.equals("az")) { + return LOC_TURKISH; + } else if(language.equals("el")) { + return LOC_GREEK; + } else if(language.equals("lt")) { + return LOC_LITHUANIAN; + } else if(language.equals("nl")) { + return LOC_DUTCH; + } + } else if(language.length()==3) { + if(language.equals("tur") || language.equals("aze")) { + return LOC_TURKISH; + } else if(language.equals("ell")) { + return LOC_GREEK; + } else if(language.equals("lit")) { + return LOC_LITHUANIAN; + } else if(language.equals("nld")) { + return LOC_DUTCH; + } } - return result; + return LOC_ROOT; } /* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */ @@ -797,19 +810,14 @@ public final class UCaseProps { * See ContextIterator for details. * If iter==null then a context-independent result is returned. * @param out If the mapping result is a string, then it is appended to out. - * @param locale Locale ID for locale-dependent mappings. - * @param locCache Initialize locCache[0] to 0; may be used to cache the result of parsing - * the locale ID for subsequent calls. - * Can be null. + * @param caseLocale Case locale value from ucase_getCaseLocale(). * @return Output code point or string length, see MAX_STRING_LENGTH. 
* * @see ContextIterator * @see #MAX_STRING_LENGTH * @internal */ - public final int toFullLower(int c, ContextIterator iter, - StringBuilder out, - ULocale locale, int[] locCache) { + public final int toFullLower(int c, ContextIterator iter, Appendable out, int caseLocale) { int result, props; result=c; @@ -820,22 +828,20 @@ public final class UCaseProps { } } else { int excOffset=getExceptionsOffset(props), excOffset2; - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); int full; excOffset2=excOffset; if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) { /* use hardcoded conditions and mappings */ - int loc=getCaseLocale(locale, locCache); - /* * Test for conditional mappings first * (otherwise the unconditional default mappings are always taken), * then test for characters that have unconditional mappings in SpecialCasing.txt, * then get the UnicodeData.txt mappings. */ - if( loc==LOC_LITHUANIAN && + if( caseLocale==LOC_LITHUANIAN && /* base characters, find accents above */ (((c==0x49 || c==0x4a || c==0x12e) && isFollowedByMoreAbove(iter)) || @@ -858,30 +864,34 @@ public final class UCaseProps { 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE */ - switch(c) { - case 0x49: /* LATIN CAPITAL LETTER I */ - out.append(iDot); - return 2; - case 0x4a: /* LATIN CAPITAL LETTER J */ - out.append(jDot); - return 2; - case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */ - out.append(iOgonekDot); - return 2; - case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */ - out.append(iDotGrave); - return 3; - case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */ - out.append(iDotAcute); - return 3; - case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */ - out.append(iDotTilde); - return 3; - default: - return 0; /* will not occur */ + try { + switch(c) { + case 0x49: /* LATIN CAPITAL LETTER I */ + out.append(iDot); + return 2; + case 0x4a: /* LATIN CAPITAL LETTER J */ + 
out.append(jDot); + return 2; + case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */ + out.append(iOgonekDot); + return 2; + case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */ + out.append(iDotGrave); + return 3; + case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */ + out.append(iDotAcute); + return 3; + case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */ + out.append(iDotTilde); + return 3; + default: + return 0; /* will not occur */ + } + } catch (IOException e) { + throw new ICUUncheckedIOException(e); } /* # Turkish and Azeri */ - } else if(loc==LOC_TURKISH && c==0x130) { + } else if(caseLocale==LOC_TURKISH && c==0x130) { /* # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri # The following rules handle those cases. @@ -890,7 +900,7 @@ public final class UCaseProps { 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE */ return 0x69; - } else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) { + } else if(caseLocale==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) { /* # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. # This matches the behavior of the canonically equivalent I-dot_above @@ -899,7 +909,7 @@ public final class UCaseProps { 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE */ return 0; /* remove the dot (continue without output) */ - } else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) { + } else if(caseLocale==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) { /* # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. 
@@ -913,8 +923,12 @@ public final class UCaseProps { 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ - out.append(iDot); - return 2; + try { + out.append(iDot); + return 2; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } } else if( c==0x3a3 && !isFollowedByCasedLetter(iter, 1) && isFollowedByCasedLetter(iter, -1) /* -1=preceded */ @@ -936,11 +950,15 @@ public final class UCaseProps { /* start of full case mapping strings */ excOffset=(int)(value>>32)+1; - /* set the output pointer to the lowercase mapping */ - out.append(exceptions, excOffset, full); + try { + // append the lowercase mapping + out.append(exceptions, excOffset, excOffset+full); - /* return the string length */ - return full; + /* return the string length */ + return full; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } } } @@ -954,8 +972,8 @@ public final class UCaseProps { /* internal */ private final int toUpperOrTitle(int c, ContextIterator iter, - StringBuilder out, - ULocale locale, int[] locCache, + Appendable out, + int loc, boolean upperNotTitle) { int result; int props; @@ -968,15 +986,13 @@ public final class UCaseProps { } } else { int excOffset=getExceptionsOffset(props), excOffset2; - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); int full, index; excOffset2=excOffset; if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) { /* use hardcoded conditions and mappings */ - int loc=getCaseLocale(locale, locCache); - if(loc==LOC_TURKISH && c==0x69) { /* # Turkish and Azeri @@ -1026,11 +1042,15 @@ public final class UCaseProps { } if(full!=0) { - /* set the output pointer to the result string */ - out.append(exceptions, excOffset, full); - - /* return the string length */ - return full; + try { + // append the result string + out.append(exceptions, excOffset, excOffset+full); + + /* return the string length */ + return full; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } } } 
@@ -1049,15 +1069,15 @@ public final class UCaseProps { } public final int toFullUpper(int c, ContextIterator iter, - StringBuilder out, - ULocale locale, int[] locCache) { - return toUpperOrTitle(c, iter, out, locale, locCache, true); + Appendable out, + int caseLocale) { + return toUpperOrTitle(c, iter, out, caseLocale, true); } public final int toFullTitle(int c, ContextIterator iter, - StringBuilder out, - ULocale locale, int[] locCache) { - return toUpperOrTitle(c, iter, out, locale, locCache, false); + Appendable out, + int caseLocale) { + return toUpperOrTitle(c, iter, out, caseLocale, false); } /* case folding ------------------------------------------------------------- */ @@ -1117,7 +1137,7 @@ public final class UCaseProps { } } else { int excOffset=getExceptionsOffset(props); - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); int index; if((excWord&EXC_CONDITIONAL_FOLD)!=0) { /* special case folding mappings, hardcoded */ @@ -1168,7 +1188,7 @@ public final class UCaseProps { * together in a way that they still fold to common result strings. 
*/ - public final int toFullFolding(int c, StringBuilder out, int options) { + public final int toFullFolding(int c, Appendable out, int options) { int result; int props; @@ -1180,7 +1200,7 @@ public final class UCaseProps { } } else { int excOffset=getExceptionsOffset(props), excOffset2; - int excWord=exceptions[excOffset++]; + int excWord=exceptions.charAt(excOffset++); int full, index; excOffset2=excOffset; @@ -1194,8 +1214,12 @@ public final class UCaseProps { return 0x69; } else if(c==0x130) { /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ - out.append(iDot); - return 2; + try { + out.append(iDot); + return 2; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } } } else { /* Turkic mappings */ @@ -1219,11 +1243,15 @@ public final class UCaseProps { full=(full>>4)&0xf; if(full!=0) { - /* set the output pointer to the result string */ - out.append(exceptions, excOffset, full); - - /* return the string length */ - return full; + try { + // append the result string + out.append(exceptions, excOffset, excOffset+full); + + /* return the string length */ + return full; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } } } @@ -1242,7 +1270,6 @@ public final class UCaseProps { /* case mapping properties API ---------------------------------------------- */ - private static final int[] rootLocCache = { LOC_ROOT }; /* * We need a StringBuilder for multi-code point output from the * full case mapping functions. 
However, we do not actually use that output, @@ -1282,20 +1309,20 @@ public final class UCaseProps { */ case UProperty.CHANGES_WHEN_LOWERCASED: dummyStringBuilder.setLength(0); - return toFullLower(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0; + return toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0; case UProperty.CHANGES_WHEN_UPPERCASED: dummyStringBuilder.setLength(0); - return toFullUpper(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0; + return toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0; case UProperty.CHANGES_WHEN_TITLECASED: dummyStringBuilder.setLength(0); - return toFullTitle(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0; + return toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0; /* case UProperty.CHANGES_WHEN_CASEFOLDED: -- in UCharacterProperty.java */ case UProperty.CHANGES_WHEN_CASEMAPPED: dummyStringBuilder.setLength(0); return - toFullLower(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0 || - toFullUpper(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0 || - toFullTitle(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0; + toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0 || + toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0 || + toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0; default: return false; } @@ -1303,7 +1330,7 @@ public final class UCaseProps { // data members -------------------------------------------------------- *** private int indexes[]; - private char exceptions[]; + private String exceptions; private char unfold[]; private Trie2_16 trie; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java index 40fecc7b1..65cebb36a 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java @@ -15,8 +15,7 @@ import java.util.Iterator; import java.util.Locale; import 
java.util.Map; -import com.ibm.icu.impl.CaseMap; -import com.ibm.icu.impl.CaseMap.StringContextIterator; +import com.ibm.icu.impl.CaseMapImpl; import com.ibm.icu.impl.IllegalIcuArgumentException; import com.ibm.icu.impl.Trie2; import com.ibm.icu.impl.UBiDiProps; @@ -29,6 +28,7 @@ import com.ibm.icu.impl.UPropertyAliases; import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory; import com.ibm.icu.lang.UCharacterEnums.ECharacterDirection; import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.Edits; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.util.RangeValueIterator; import com.ibm.icu.util.ULocale; @@ -4875,7 +4875,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection */ public static String toUpperCase(String str) { - return toUpperCase(ULocale.getDefault(), str); + return toUpperCase(getDefaultCaseLocale(), str); } /** @@ -4887,7 +4887,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection */ public static String toLowerCase(String str) { - return toLowerCase(ULocale.getDefault(), str); + return toLowerCase(getDefaultCaseLocale(), str); } /** @@ -4910,7 +4910,94 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection */ public static String toTitleCase(String str, BreakIterator breakiter) { - return toTitleCase(ULocale.getDefault(), str, breakiter); + return toTitleCase(Locale.getDefault(), str, breakiter, 0); + } + + private static int getDefaultCaseLocale() { + return UCaseProps.getCaseLocale(Locale.getDefault()); + } + + private static int getCaseLocale(Locale locale) { + if (locale == null) { + locale = Locale.getDefault(); + } + return UCaseProps.getCaseLocale(locale); + } + + private static int getCaseLocale(ULocale locale) { + if (locale == null) { + locale = ULocale.getDefault(); + } + return UCaseProps.getCaseLocale(locale); + } + + private static String toLowerCase(int caseLocale, String str) { + if (str.length() <= 100) { + if 
(str.isEmpty()) { + return str; + } + // Collect and apply only changes. + // Good if no or few changes. Bad (slow) if many changes. + Edits edits = new Edits(); + StringBuilder replacementChars = CaseMapImpl.toLower( + caseLocale, CaseMapImpl.OMIT_UNCHANGED_TEXT, str, new StringBuilder(), edits); + return applyEdits(str, replacementChars, edits); + } else { + return CaseMapImpl.toLower(caseLocale, 0, str, + new StringBuilder(str.length()), null).toString(); + } + } + + private static String toUpperCase(int caseLocale, String str) { + if (str.length() <= 100) { + if (str.isEmpty()) { + return str; + } + // Collect and apply only changes. + // Good if no or few changes. Bad (slow) if many changes. + Edits edits = new Edits(); + StringBuilder replacementChars = CaseMapImpl.toUpper( + caseLocale, CaseMapImpl.OMIT_UNCHANGED_TEXT, str, new StringBuilder(), edits); + return applyEdits(str, replacementChars, edits); + } else { + return CaseMapImpl.toUpper(caseLocale, 0, str, + new StringBuilder(str.length()), null).toString(); + } + } + + private static String toTitleCase(int caseLocale, int options, BreakIterator titleIter, String str) { + if (str.length() <= 100) { + if (str.isEmpty()) { + return str; + } + // Collect and apply only changes. + // Good if no or few changes. Bad (slow) if many changes. 
+ Edits edits = new Edits(); + StringBuilder replacementChars = CaseMapImpl.toTitle( + caseLocale, options | CaseMapImpl.OMIT_UNCHANGED_TEXT, titleIter, str, + new StringBuilder(), edits); + return applyEdits(str, replacementChars, edits); + } else { + return CaseMapImpl.toTitle(caseLocale, options, titleIter, str, + new StringBuilder(str.length()), null).toString(); + } + } + + private static String applyEdits(String str, StringBuilder replacementChars, Edits edits) { + if (!edits.hasChanges()) { + return str; + } + StringBuilder result = new StringBuilder(str.length() + edits.lengthDelta()); + for (Edits.Iterator ei = edits.getCoarseIterator(); ei.next();) { + if (ei.hasChange()) { + int i = ei.replacementIndex(); + result.append(replacementChars, i, i + ei.newLength()); + } else { + int i = ei.sourceIndex(); + result.append(str, i, i + ei.oldLength()); + } + } + return result.toString(); } /** @@ -4923,7 +5010,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection */ public static String toUpperCase(Locale locale, String str) { - return toUpperCase(ULocale.forLocale(locale), str); + return toUpperCase(getCaseLocale(locale), str); } /** @@ -4935,7 +5022,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection * @stable ICU 3.2 */ public static String toUpperCase(ULocale locale, String str) { - return CaseMap.toUpper(locale, str); + return toUpperCase(getCaseLocale(locale), str); } /** @@ -4948,7 +5035,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection */ public static String toLowerCase(Locale locale, String str) { - return toLowerCase(ULocale.forLocale(locale), str); + return toLowerCase(getCaseLocale(locale), str); } /** @@ -4960,31 +5047,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection * @stable ICU 3.2 */ public static String toLowerCase(ULocale locale, String str) { - StringContextIterator iter = new StringContextIterator(str); 
- StringBuilder result = new StringBuilder(str.length()); - int[] locCache = new int[1]; - int c; - - if (locale == null) { - locale = ULocale.getDefault(); - } - locCache[0]=0; - - while((c=iter.nextCaseMapCP())>=0) { - c = UCaseProps.INSTANCE.toFullLower(c, iter, result, locale, locCache); - - /* decode the result */ - if(c<0) { - /* (not) original code point */ - c=~c; - } else if(c<=UCaseProps.MAX_STRING_LENGTH) { - /* mapping already appended to result */ - continue; - /* } else { append single-code point mapping */ - } - result.appendCodePoint(c); - } - return result.toString(); + return toLowerCase(getCaseLocale(locale), str); } /** @@ -5009,7 +5072,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection public static String toTitleCase(Locale locale, String str, BreakIterator breakiter) { - return toTitleCase(ULocale.forLocale(locale), str, breakiter); + return toTitleCase(locale, str, breakiter, 0); } /** @@ -5059,126 +5122,15 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection * @see #TITLECASE_NO_BREAK_ADJUSTMENT */ public static String toTitleCase(ULocale locale, String str, - BreakIterator titleIter, - int options) { - StringContextIterator iter = new StringContextIterator(str); - StringBuilder result = new StringBuilder(str.length()); - int[] locCache = new int[1]; - int c, nc, srcLength = str.length(); - - if (locale == null) { - locale = ULocale.getDefault(); - } - locCache[0]=0; - + BreakIterator titleIter, int options) { if(titleIter == null) { + if (locale == null) { + locale = ULocale.getDefault(); + } titleIter = BreakIterator.getWordInstance(locale); } titleIter.setText(str); - - int prev, titleStart, index; - boolean isFirstIndex; - boolean isDutch = locale.getLanguage().equals("nl"); - boolean FirstIJ = true; - - /* set up local variables */ - prev=0; - isFirstIndex=true; - - /* titlecasing loop */ - while(prev<srcLength) { - /* find next index where to titlecase */ - 
if(isFirstIndex) { - isFirstIndex=false; - index=titleIter.first(); - } else { - index=titleIter.next(); - } - if(index==BreakIterator.DONE || index>srcLength) { - index=srcLength; - } - - /* - * Unicode 4 & 5 section 3.13 Default Case Operations: - * - * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex - * #29, "Text Boundaries." Between each pair of word boundaries, find the first - * cased character F. If F exists, map F to default_title(F); then map each - * subsequent character C to default_lower(C). - * - * In this implementation, segment [prev..index[ into 3 parts: - * a) uncased characters (copy as-is) [prev..titleStart[ - * b) first case letter (titlecase) [titleStart..titleLimit[ - * c) subsequent characters (lowercase) [titleLimit..index[ - */ - if(prev<index) { - /* find and copy uncased characters [prev..titleStart[ */ - iter.setLimit(index); - c=iter.nextCaseMapCP(); - if((options&TITLECASE_NO_BREAK_ADJUSTMENT)==0 - && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) { - while((c=iter.nextCaseMapCP())>=0 - && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {} - titleStart=iter.getCPStart(); - if(prev<titleStart) { - result.append(str, prev, titleStart); - } - } else { - titleStart=prev; - } - - if(titleStart<index) { - FirstIJ = true; - /* titlecase c which is from titleStart */ - c = UCaseProps.INSTANCE.toFullTitle(c, iter, result, locale, locCache); - - /* decode the result and lowercase up to index */ - for(;;) { - if(c<0) { - /* (not) original code point */ - c=~c; - result.appendCodePoint(c); - } else if(c<=UCaseProps.MAX_STRING_LENGTH) { - /* mapping already appended to result */ - } else { - /* append single-code point mapping */ - result.appendCodePoint(c); - } - - if((options&TITLECASE_NO_LOWERCASE)!=0) { - /* Optionally just copy the rest of the word unchanged. 
*/ - - int titleLimit=iter.getCPLimit(); - if(titleLimit<index) { - /* Special Case - Dutch IJ Titlecasing */ - if (isDutch && c == 0x0049 && str.charAt(titleLimit) == 'j') { - result.append('J').append(str, titleLimit + 1, index); - } else { - result.append(str, titleLimit, index); - } - } - iter.moveToLimit(); - break; - } else if((nc=iter.nextCaseMapCP())>=0) { - if (isDutch && (nc == 0x004A || nc == 0x006A) - && (c == 0x0049) && (FirstIJ == true)) { - c = 0x004A; /* J */ - FirstIJ = false; - } else { - /* Normal operation: Lowercase the rest of the word. */ - c = UCaseProps.INSTANCE.toFullLower(nc, iter, result, locale, - locCache); - } - } else { - break; - } - } - } - } - - prev=index; - } - return result.toString(); + return toTitleCase(getCaseLocale(locale), options, titleIter, str); } @@ -5281,7 +5233,11 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection public static String toTitleCase(Locale locale, String str, BreakIterator titleIter, int options) { - return toTitleCase(ULocale.forLocale(locale), str, titleIter, options); + if(titleIter == null) { + titleIter = BreakIterator.getWordInstance(locale); + } + titleIter.setText(str); + return toTitleCase(getCaseLocale(locale), options, titleIter, str); } /** @@ -5398,27 +5354,19 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection * @stable ICU 2.6 */ public static final String foldCase(String str, int options) { - StringBuilder result = new StringBuilder(str.length()); - int c, i, length; - - length = str.length(); - for(i=0; i<length;) { - c=str.codePointAt(i); - i+=Character.charCount(c); - c = UCaseProps.INSTANCE.toFullFolding(c, result, options); - - /* decode the result */ - if(c<0) { - /* (not) original code point */ - c=~c; - } else if(c<=UCaseProps.MAX_STRING_LENGTH) { - /* mapping already appended to result */ - continue; - /* } else { append single-code point mapping */ + if (str.length() <= 100) { + if (str.isEmpty()) { + return str; 
} - result.appendCodePoint(c); + // Collect and apply only changes. + // Good if no or few changes. Bad (slow) if many changes. + Edits edits = new Edits(); + StringBuilder replacementChars = CaseMapImpl.fold( + options | CaseMapImpl.OMIT_UNCHANGED_TEXT, str, new StringBuilder(), edits); + return applyEdits(str, replacementChars, edits); + } else { + return CaseMapImpl.fold(options, str, new StringBuilder(str.length()), null).toString(); } - return result.toString(); } /** diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java new file mode 100644 index 000000000..e998c6624 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java @@ -0,0 +1,339 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +package com.ibm.icu.text; + +import java.util.Locale; + +import com.ibm.icu.impl.CaseMapImpl; +import com.ibm.icu.impl.UCaseProps; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.util.ULocale; + +/** + * Low-level case mapping options and methods. Immutable. + * "Setters" return instances with the union of the current and new options set. + * + * This class is not intended for public subclassing. + * + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ +public abstract class CaseMap { + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected int internalOptions; + + private CaseMap(int opt) { internalOptions = opt; } + + private static int getCaseLocale(Locale locale) { + if (locale == null) { + locale = Locale.getDefault(); + } + return UCaseProps.getCaseLocale(locale); + } + + /** + * @return Lowercasing object with default options. + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. 
+ */ + public static Lower toLower() { return Lower.DEFAULT; } + /** + * @return Uppercasing object with default options. + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public static Upper toUpper() { return Upper.DEFAULT; } + /** + * @return Titlecasing object with default options. + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public static Title toTitle() { return Title.DEFAULT; } + /** + * @return Case folding object with default options. + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public static Fold fold() { return Fold.DEFAULT; } + + /** + * Returns an instance that behaves like this one but + * omits unchanged text when case-mapping with {@link Edits}. + * + * @return an options object with this option. + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public abstract CaseMap omitUnchangedText(); + + /** + * Lowercasing options and methods. Immutable. + * + * @see #toLower() + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public static final class Lower extends CaseMap { + private static final Lower DEFAULT = new Lower(0); + private static final Lower OMIT_UNCHANGED = new Lower(CaseMapImpl.OMIT_UNCHANGED_TEXT); + private Lower(int opt) { super(opt); } + + /** + * {@inheritDoc} + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + @Override + public Lower omitUnchangedText() { + return OMIT_UNCHANGED; + } + + /** + * Lowercases a string and optionally records edits (see {@link #omitUnchangedText}). + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * + * @param locale The locale ID. Can be null for {@link Locale#getDefault}. + * (See {@link ULocale#toLocale}.) + * @param src The original string. 
+ * @param dest A buffer for the result string. Must not be null. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * This function calls edits.reset() first. edits can be null. + * @return dest with the result string (or only changes) appended. + * + * @see UCharacter#toLowerCase(Locale, String) + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public <A extends Appendable> A apply( + Locale locale, CharSequence src, A dest, Edits edits) { + return CaseMapImpl.toLower(getCaseLocale(locale), internalOptions, src, dest, edits); + } + } + + /** + * Uppercasing options and methods. Immutable. + * + * @see #toUpper() + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public static final class Upper extends CaseMap { + private static final Upper DEFAULT = new Upper(0); + private static final Upper OMIT_UNCHANGED = new Upper(CaseMapImpl.OMIT_UNCHANGED_TEXT); + private Upper(int opt) { super(opt); } + + /** + * {@inheritDoc} + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + @Override + public Upper omitUnchangedText() { + return OMIT_UNCHANGED; + } + + /** + * Uppercases a string and optionally records edits (see {@link #omitUnchangedText}). + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * + * @param locale The locale ID. Can be null for {@link Locale#getDefault}. + * (See {@link ULocale#toLocale}.) + * @param src The original string. + * @param dest A buffer for the result string. Must not be null. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * This function calls edits.reset() first. edits can be null. + * @return dest with the result string (or only changes) appended. 
+ * + * @see UCharacter#toUpperCase(Locale, String) + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public <A extends Appendable> A apply( + Locale locale, CharSequence src, A dest, Edits edits) { + return CaseMapImpl.toUpper(getCaseLocale(locale), internalOptions, src, dest, edits); + } + } + + /** + * Titlecasing options and methods. Immutable. + * + * @see #toTitle() + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public static final class Title extends CaseMap { + private static final Title DEFAULT = new Title(0); + private static final Title OMIT_UNCHANGED = new Title(CaseMapImpl.OMIT_UNCHANGED_TEXT); + private Title(int opt) { super(opt); } + + /** + * {@inheritDoc} + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + @Override + public Title omitUnchangedText() { + if (internalOptions == 0 || internalOptions == CaseMapImpl.OMIT_UNCHANGED_TEXT) { + return OMIT_UNCHANGED; + } + return new Title(internalOptions | CaseMapImpl.OMIT_UNCHANGED_TEXT); + } + + /** + * Returns an instance that behaves like this one but + * does not lowercase non-initial parts of words when titlecasing. + * + * <p>By default, titlecasing will titlecase the first cased character + * of a word and lowercase all other characters. + * With this option, the other characters will not be modified. + * + * @return an options object with this option. + * @see UCharacter#TITLECASE_NO_LOWERCASE + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. 
+ */ + public Title noLowercase() { + return new Title(internalOptions | UCharacter.TITLECASE_NO_LOWERCASE); + } + + // TODO: update references to the Unicode Standard for recent version + /** + * Returns an instance that behaves like this one but + * does not adjust the titlecasing indexes from BreakIterator::next() indexes; + * titlecases exactly the characters at breaks from the iterator. + * + * <p>By default, titlecasing will take each break iterator index, + * adjust it by looking for the next cased character, and titlecase that one. + * Other characters are lowercased. + * + * <p>This follows Unicode 4 & 5 section 3.13 Default Case Operations: + * + * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex + * #29, "Text Boundaries." Between each pair of word boundaries, find the first + * cased character F. If F exists, map F to default_title(F); then map each + * subsequent character C to default_lower(C). + * + * @return an options object with this option. + * @see UCharacter#TITLECASE_NO_BREAK_ADJUSTMENT + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public Title noBreakAdjustment() { + return new Title(internalOptions | UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT); + } + + /** + * Titlecases a string and optionally records edits (see {@link #omitUnchangedText}). + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * + * <p>Titlecasing uses a break iterator to find the first characters of words + * that are to be titlecased. It titlecases those characters and lowercases + * all others. (This can be modified with options bits.) + * + * @param locale The locale ID. Can be null for {@link Locale#getDefault}. + * (See {@link ULocale#toLocale}.) + * @param iter A break iterator to find the first characters of words that are to be titlecased. 
+ * It is set to the source string (setText()) + * and used one or more times for iteration (first() and next()). + * If null, then a word break iterator for the locale is used + * (or something equivalent). + * @param src The original string. + * @param dest A buffer for the result string. Must not be null. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * This function calls edits.reset() first. edits can be null. + * @return dest with the result string (or only changes) appended. + * + * @see UCharacter#toTitleCase(Locale, String, BreakIterator, int) + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public <A extends Appendable> A apply( + Locale locale, BreakIterator iter, CharSequence src, A dest, Edits edits) { + if (iter == null) { + iter = BreakIterator.getWordInstance(locale); + } + iter.setText(src.toString()); + return CaseMapImpl.toTitle( + getCaseLocale(locale), internalOptions, iter, src, dest, edits); + } + } + + /** + * Case folding options and methods. Immutable. + * + * @see #fold() + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public static final class Fold extends CaseMap { + private static final Fold DEFAULT = new Fold(0); + private static final Fold TURKIC = new Fold(UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I); + private static final Fold OMIT_UNCHANGED = new Fold(CaseMapImpl.OMIT_UNCHANGED_TEXT); + private static final Fold TURKIC_OMIT_UNCHANGED = new Fold( + UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I | CaseMapImpl.OMIT_UNCHANGED_TEXT); + private Fold(int opt) { super(opt); } + + /** + * {@inheritDoc} + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + @Override + public Fold omitUnchangedText() { + return (internalOptions & UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0 ? 
+ OMIT_UNCHANGED : TURKIC_OMIT_UNCHANGED; + } + + /** + * Returns an instance that behaves like this one but + * handles dotted I and dotless i appropriately for Turkic languages (tr, az). + * + * <p>Uses the Unicode CaseFolding.txt mappings marked with 'T' that + * are to be excluded for default mappings and + * included for the Turkic-specific mappings. + * + * @return an options object with this option. + * @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. + */ + public Fold turkic() { + return (internalOptions & CaseMapImpl.OMIT_UNCHANGED_TEXT) == 0 ? + TURKIC : TURKIC_OMIT_UNCHANGED; + } + + /** + * Case-folds a string and optionally records edits (see {@link #omitUnchangedText}). + * + * <p>Case-folding is locale-independent and not context-sensitive, + * but there is an option for whether to include or exclude mappings for dotted I + * and dotless i that are marked with 'T' in CaseFolding.txt. + * + * <p>The result may be longer or shorter than the original. + * + * @param src The original string. + * @param dest A buffer for the result string. Must not be null. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * This function calls edits.reset() first. edits can be null. + * @return dest with the result string (or only changes) appended. + * + * @see UCharacter#foldCase(String, int) + * @draft ICU 59 + * @provisional This API might change or be removed in a future release. 
import java.nio.BufferOverflowException;
import java.util.Arrays;

/**
 * Records lengths of string edits but not replacement text.
 * Supports replacements, insertions, deletions in linear progression.
 * Does not support moving/reordering of text.
 *
 * <p>The edits are stored run-length encoded in a {@code char[]}; see the
 * comments on the private constants for the unit layout.
 *
 * @draft ICU 59
 * @provisional This API might change or be removed in a future release.
 */
public final class Edits {
    // 0000uuuuuuuuuuuu records u+1 unchanged text units.
    private static final int MAX_UNCHANGED_LENGTH = 0x1000;
    private static final int MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;

    // 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units.
    // No length change.
    private static final int MAX_SHORT_WIDTH = 6;
    private static final int MAX_SHORT_CHANGE_LENGTH = 0xfff;
    private static final int MAX_SHORT_CHANGE = 0x6fff;

    // 0111mmmmmmnnnnnn records a replacement of m text units with n.
    // m or n = 61: actual length follows in the next edits array unit.
    // m or n = 62..63: actual length follows in the next two edits array units.
    // Bit 30 of the actual length is in the head unit.
    // Trailing units have bit 15 set.
    private static final int LENGTH_IN_1TRAIL = 61;
    private static final int LENGTH_IN_2TRAIL = 62;

    // Initial capacity of the edits array (the name is inherited from the C++ implementation).
    private static final int STACK_CAPACITY = 100;
    private char[] array;
    /** Number of units of {@link #array} that are in use. */
    private int length;
    /** Sum of (newLength - oldLength) over all recorded replacements. */
    private int delta;

    /**
     * Constructs an empty object.
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public Edits() {
        array = new char[STACK_CAPACITY];
    }

    /**
     * Resets the data but may not release memory.
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public void reset() {
        length = delta = 0;
    }

    /** Overwrites the last used unit; callers must ensure that length > 0. */
    private void setLastUnit(int last) {
        array[length - 1] = (char)last;
    }

    /**
     * Returns the last used unit, or 0xffff if there is none.
     * 0xffff is neither a mergeable unchanged unit nor a mergeable short-change unit.
     */
    private int lastUnit() {
        return length > 0 ? array[length - 1] : 0xffff;
    }

    /**
     * Adds a record for an unchanged segment of text.
     * Normally called from inside ICU string transformation functions, not user code.
     *
     * @param unchangedLength number of unchanged text units; 0 is a no-op
     * @throws IllegalArgumentException if unchangedLength is negative
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public void addUnchanged(int unchangedLength) {
        if (unchangedLength < 0) {
            throw new IllegalArgumentException(
                    "addUnchanged(" + unchangedLength + "): length must not be negative");
        }
        // Merge into previous unchanged-text record, if any.
        int last = lastUnit();
        if (last < MAX_UNCHANGED) {
            int remaining = MAX_UNCHANGED - last;
            if (remaining >= unchangedLength) {
                setLastUnit(last + unchangedLength);
                return;
            }
            setLastUnit(MAX_UNCHANGED);
            unchangedLength -= remaining;
        }
        // Split large lengths into multiple units.
        while (unchangedLength >= MAX_UNCHANGED_LENGTH) {
            append(MAX_UNCHANGED);
            unchangedLength -= MAX_UNCHANGED_LENGTH;
        }
        // Write a small (remaining) length.
        if (unchangedLength > 0) {
            append(unchangedLength - 1);
        }
    }

    /**
     * Adds a record for a text replacement/insertion/deletion.
     * Normally called from inside ICU string transformation functions, not user code.
     *
     * @param oldLength number of text units replaced (0 for an insertion)
     * @param newLength number of replacement text units (0 for a deletion)
     * @throws IllegalArgumentException if either length is negative
     * @throws IndexOutOfBoundsException if the accumulated length delta would overflow an int
     * @throws BufferOverflowException if the internal array cannot grow any further
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public void addReplace(int oldLength, int newLength) {
        if (oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
            // Replacement of short oldLength text units by same-length new text.
            // Merge into previous short-replacement record, if any.
            int last = lastUnit();
            if (MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
                    (last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) {
                setLastUnit(last + 1);
                return;
            }
            append(oldLength << 12);
            return;
        }

        if (oldLength < 0 || newLength < 0) {
            throw new IllegalArgumentException(
                    "addReplace(" + oldLength + ", " + newLength +
                    "): both lengths must be non-negative");
        }
        if (oldLength == 0 && newLength == 0) {
            return;
        }
        int newDelta = newLength - oldLength;
        if (newDelta != 0) {
            if ((newDelta > 0 && delta >= 0 && newDelta > (Integer.MAX_VALUE - delta)) ||
                    (newDelta < 0 && delta < 0 && newDelta < (Integer.MIN_VALUE - delta))) {
                // Integer overflow or underflow.
                throw new IndexOutOfBoundsException(
                        "addReplace(" + oldLength + ", " + newLength +
                        "): length delta overflows int");
            }
            delta += newDelta;
        }

        int head = 0x7000;
        if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
            head |= oldLength << 6;
            head |= newLength;
            append(head);
        } else if ((array.length - length) >= 5 || growArray()) {
            // A maximal record is one head unit plus two trail units for each length.
            int limit = length + 1;
            if (oldLength < LENGTH_IN_1TRAIL) {
                head |= oldLength << 6;
            } else if (oldLength <= 0x7fff) {
                head |= LENGTH_IN_1TRAIL << 6;
                array[limit++] = (char)(0x8000 | oldLength);
            } else {
                head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6;
                array[limit++] = (char)(0x8000 | (oldLength >> 15));
                array[limit++] = (char)(0x8000 | oldLength);
            }
            if (newLength < LENGTH_IN_1TRAIL) {
                head |= newLength;
            } else if (newLength <= 0x7fff) {
                head |= LENGTH_IN_1TRAIL;
                array[limit++] = (char)(0x8000 | newLength);
            } else {
                head |= LENGTH_IN_2TRAIL + (newLength >> 30);
                array[limit++] = (char)(0x8000 | (newLength >> 15));
                array[limit++] = (char)(0x8000 | newLength);
            }
            array[length] = (char)head;
            length = limit;
        }
    }

    /** Appends one unit, growing the array first if it is full. */
    private void append(int r) {
        if (length < array.length || growArray()) {
            array[length++] = (char)r;
        }
    }

    /**
     * Enlarges the array: to 2000 units from the initial capacity, otherwise doubling,
     * capped at Integer.MAX_VALUE.
     *
     * @return true (failure is reported by exception)
     * @throws BufferOverflowException if the array cannot grow by at least 5 units
     */
    private boolean growArray() {
        int newCapacity;
        if (array.length == STACK_CAPACITY) {
            newCapacity = 2000;
        } else if (array.length == Integer.MAX_VALUE) {
            throw new BufferOverflowException();
        } else if (array.length >= (Integer.MAX_VALUE / 2)) {
            newCapacity = Integer.MAX_VALUE;
        } else {
            newCapacity = 2 * array.length;
        }
        // Grow by at least 5 units so that a maximal change record will fit.
        if ((newCapacity - array.length) < 5) {
            throw new BufferOverflowException();
        }
        array = Arrays.copyOf(array, newCapacity);
        return true;
    }

    /**
     * How much longer is the new text compared with the old text?
     * @return new length minus old length
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public int lengthDelta() { return delta; }

    /**
     * @return true if there are any change edits
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public boolean hasChanges() {
        if (delta != 0) {
            return true;
        }
        // Same-length replacements leave delta at 0, so look for any non-"unchanged" unit.
        for (int i = 0; i < length; ++i) {
            if (array[i] > MAX_UNCHANGED) {
                return true;
            }
        }
        return false;
    }

    /**
     * Access to the list of edits.
     * @see #getCoarseIterator
     * @see #getFineIterator
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public static final class Iterator {
        private final char[] array;
        /** Index of the next unit to read from {@link #array}. */
        private int index;
        private final int length;
        /** Fine-grained iteration: number of further edits left in the current compressed unit. */
        private int remaining;
        private final boolean onlyChanges_, coarse;

        private boolean changed;
        private int oldLength_, newLength_;
        private int srcIndex, replIndex, destIndex;

        private Iterator(char[] a, int len, boolean oc, boolean crs) {
            array = a;
            length = len;
            onlyChanges_ = oc;
            coarse = crs;
        }

        /**
         * Decodes one 6-bit length field of a 0111mmmmmmnnnnnn head unit,
         * consuming one or two trail units if the field says so.
         */
        private int readLength(int head) {
            if (head < LENGTH_IN_1TRAIL) {
                return head;
            } else if (head < LENGTH_IN_2TRAIL) {
                assert(index < length);
                assert(array[index] >= 0x8000);
                return array[index++] & 0x7fff;
            } else {
                assert((index + 2) <= length);
                assert(array[index] >= 0x8000);
                assert(array[index + 1] >= 0x8000);
                // The low bit of the field (62 vs. 63) is bit 30 of the length.
                int len = ((head & 1) << 30) |
                        ((array[index] & 0x7fff) << 15) |
                        (array[index + 1] & 0x7fff);
                index += 2;
                return len;
            }
        }

        /** Moves the source/replacement/destination indexes past the current edit. */
        private void updateIndexes() {
            srcIndex += oldLength_;
            if (changed) {
                replIndex += newLength_;
            }
            destIndex += newLength_;
        }

        /** Sets the "no more edits" state and returns false. */
        private boolean noNext() {
            // No change beyond the string.
            changed = false;
            oldLength_ = newLength_ = 0;
            return false;
        }

        /**
         * Advances to the next edit.
         * @return true if there is another edit
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public boolean next() {
            return next(onlyChanges_);
        }

        /**
         * Advances to the next edit.
         *
         * @param onlyChanges true to skip unchanged spans; findSourceIndex() passes false
         *        regardless of how this iterator was created
         * @return true if there is another edit
         */
        private boolean next(boolean onlyChanges) {
            // Step the indexes past the edit that was current before this call.
            updateIndexes();
            if (remaining > 0) {
                // Fine-grained iterator: Continue a sequence of equal-length changes.
                --remaining;
                return true;
            }
            if (index >= length) {
                return noNext();
            }
            int u = array[index++];
            if (u <= MAX_UNCHANGED) {
                // Combine adjacent unchanged ranges.
                changed = false;
                oldLength_ = u + 1;
                while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
                    ++index;
                    oldLength_ += u + 1;
                }
                newLength_ = oldLength_;
                if (onlyChanges) {
                    updateIndexes();
                    if (index >= length) {
                        return noNext();
                    }
                    // already fetched u > MAX_UNCHANGED at index
                    ++index;
                } else {
                    return true;
                }
            }
            changed = true;
            if (u <= MAX_SHORT_CHANGE) {
                if (coarse) {
                    int w = u >> 12;
                    int len = (u & 0xfff) + 1;
                    oldLength_ = newLength_ = len * w;
                } else {
                    // Split a sequence of equal-length changes that was compressed into one unit.
                    oldLength_ = newLength_ = u >> 12;
                    remaining = u & 0xfff;
                    return true;
                }
            } else {
                assert(u <= 0x7fff);
                oldLength_ = readLength((u >> 6) & 0x3f);
                newLength_ = readLength(u & 0x3f);
                if (!coarse) {
                    return true;
                }
            }
            // Combine adjacent changes.
            while (index < length && (u = array[index]) > MAX_UNCHANGED) {
                ++index;
                if (u <= MAX_SHORT_CHANGE) {
                    int w = u >> 12;
                    int len = (u & 0xfff) + 1;
                    len = len * w;
                    oldLength_ += len;
                    newLength_ += len;
                } else {
                    assert(u <= 0x7fff);
                    int oldLen = readLength((u >> 6) & 0x3f);
                    int newLen = readLength(u & 0x3f);
                    oldLength_ += oldLen;
                    newLength_ += newLen;
                }
            }
            return true;
        }

        /**
         * Finds the edit that contains the source index.
         * The source index may be found in a non-change
         * even if normal iteration would skip non-changes.
         * Normal iteration can continue from a found edit.
         *
         * <p>The iterator state before this search logically does not matter.
         * (It may affect the performance of the search.)
         *
         * <p>The iterator state after this search is undefined
         * if the source index is out of bounds for the source string.
         *
         * @param i source index
         * @return true if the edit for the source index was found
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public boolean findSourceIndex(int i) {
            if (i < 0) { return false; }
            if (i < srcIndex) {
                // Reset the iterator to the start.
                index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
            } else if (i < (srcIndex + oldLength_)) {
                // The index is in the current span.
                return true;
            }
            while (next(false)) {
                if (i < (srcIndex + oldLength_)) {
                    // The index is in the current span.
                    return true;
                }
                if (remaining > 0) {
                    // Is the index in one of the remaining compressed edits?
                    // srcIndex is the start of the current span, before the remaining ones.
                    int len = (remaining + 1) * oldLength_;
                    if (i < (srcIndex + len)) {
                        int n = (i - srcIndex) / oldLength_;  // 1 <= n <= remaining
                        len = n * oldLength_;
                        srcIndex += len;
                        replIndex += len;
                        destIndex += len;
                        remaining -= n;
                        return true;
                    }
                    // Make next() skip all of these edits at once.
                    oldLength_ = newLength_ = len;
                    remaining = 0;
                }
            }
            return false;
        }

        /**
         * @return true if this edit replaces oldLength() units with newLength() different ones.
         *         false if oldLength units remain unchanged.
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public boolean hasChange() { return changed; }
        /**
         * @return the number of units in the original string which are replaced or remain unchanged.
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public int oldLength() { return oldLength_; }
        /**
         * @return the number of units in the modified string, if hasChange() is true.
         *         Same as oldLength if hasChange() is false.
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public int newLength() { return newLength_; }

        /**
         * @return the current index into the source string
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public int sourceIndex() { return srcIndex; }
        /**
         * @return the current index into the replacement-characters-only string,
         *         not counting unchanged spans
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public int replacementIndex() { return replIndex; }
        /**
         * @return the current index into the full destination string
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public int destinationIndex() { return destIndex; }
    }

    /**
     * Returns an Iterator for coarse-grained changes for simple string updates.
     * Skips non-changes.
     * @return an Iterator that merges adjacent changes.
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public Iterator getCoarseChangesIterator() {
        return new Iterator(array, length, true, true);
    }

    /**
     * Returns an Iterator for coarse-grained changes and non-changes for simple string updates.
     * @return an Iterator that merges adjacent changes.
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public Iterator getCoarseIterator() {
        return new Iterator(array, length, false, true);
    }

    /**
     * Returns an Iterator for fine-grained changes for modifying styled text.
     * Skips non-changes.
     * @return an Iterator that separates adjacent changes.
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public Iterator getFineChangesIterator() {
        return new Iterator(array, length, true, false);
    }

    /**
     * Returns an Iterator for fine-grained changes and non-changes for modifying styled text.
     * @return an Iterator that separates adjacent changes.
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public Iterator getFineIterator() {
        return new Iterator(array, length, false, false);
    }
}
95bb60b32..dfed35266 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/LowercaseTransliterator.java @@ -44,7 +44,7 @@ class LowercaseTransliterator extends Transliterator{ private final UCaseProps csp; private ReplaceableContextIterator iter; private StringBuilder result; - private int[] locCache; + private int caseLocale; /** * Constructs a transliterator. @@ -56,8 +56,7 @@ class LowercaseTransliterator extends Transliterator{ csp=UCaseProps.INSTANCE; iter=new ReplaceableContextIterator(); result = new StringBuilder(); - locCache = new int[1]; - locCache[0]=0; + caseLocale = UCaseProps.getCaseLocale(locale); } /** @@ -85,7 +84,7 @@ class LowercaseTransliterator extends Transliterator{ iter.setLimit(offsets.limit); iter.setContextLimits(offsets.contextStart, offsets.contextLimit); while((c=iter.nextCaseMapCP())>=0) { - c=csp.toFullLower(c, iter, result, locale, locCache); + c=csp.toFullLower(c, iter, result, caseLocale); if(iter.didReachLimit() && isIncremental) { // the case mapping function tried to look beyond the context limit diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java index d3dc29681..96f11c8e2 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java @@ -42,7 +42,7 @@ class TitlecaseTransliterator extends Transliterator { private final UCaseProps csp; private ReplaceableContextIterator iter; private StringBuilder result; - private int[] locCache; + private int caseLocale; /** * Constructs a transliterator. 
@@ -55,8 +55,7 @@ class TitlecaseTransliterator extends Transliterator { csp=UCaseProps.INSTANCE; iter=new ReplaceableContextIterator(); result = new StringBuilder(); - locCache = new int[1]; - locCache[0]=0; + caseLocale = UCaseProps.getCaseLocale(locale); } /** @@ -119,9 +118,9 @@ class TitlecaseTransliterator extends Transliterator { type=csp.getTypeOrIgnorable(c); if(type>=0) { // not case-ignorable if(doTitle) { - c=csp.toFullTitle(c, iter, result, locale, locCache); + c=csp.toFullTitle(c, iter, result, caseLocale); } else { - c=csp.toFullLower(c, iter, result, locale, locCache); + c=csp.toFullLower(c, iter, result, caseLocale); } doTitle = type==0; // doTitle=isUncased diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java index 77e2dfd70..bd9e3fed3 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java @@ -41,7 +41,7 @@ class UppercaseTransliterator extends Transliterator { private final UCaseProps csp; private ReplaceableContextIterator iter; private StringBuilder result; - private int[] locCache; + private int caseLocale; /** * Constructs a transliterator. 
@@ -52,8 +52,7 @@ class UppercaseTransliterator extends Transliterator { csp=UCaseProps.INSTANCE; iter=new ReplaceableContextIterator(); result = new StringBuilder(); - locCache = new int[1]; - locCache[0]=0; + caseLocale = UCaseProps.getCaseLocale(locale); } /** @@ -81,7 +80,7 @@ class UppercaseTransliterator extends Transliterator { iter.setLimit(offsets.limit); iter.setContextLimits(offsets.contextStart, offsets.contextLimit); while((c=iter.nextCaseMapCP())>=0) { - c=csp.toFullUpper(c, iter, result, locale, locCache); + c=csp.toFullUpper(c, iter, result, caseLocale); if(iter.didReachLimit() && isIncremental) { // the case mapping function tried to look beyond the context limit diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java index 7ac358b51..6f8a67983 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java @@ -24,6 +24,8 @@ import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.CaseMap; +import com.ibm.icu.text.Edits; import com.ibm.icu.text.RuleBasedBreakIterator; import com.ibm.icu.text.UTF16; import com.ibm.icu.util.ULocale; @@ -708,6 +710,191 @@ public final class UCharacterCaseTest extends TestFmwk assertGreekUpper("ρωμέικα", "ΡΩΜΕΪΚΑ"); } + private static final class EditChange { + private boolean change; + private int oldLength, newLength; + EditChange(boolean change, int oldLength, int newLength) { + this.change = change; + this.oldLength = oldLength; + this.newLength = newLength; + } + } + + private static void checkEditsIter( + String name, Edits.Iterator ei1, Edits.Iterator ei2, // two equal iterators + EditChange[] expected, boolean withUnchanged) { + assertFalse(name, ei2.findSourceIndex(-1)); + + int 
expSrcIndex = 0; + int expDestIndex = 0; + int expReplIndex = 0; + for (int expIndex = 0; expIndex < expected.length; ++expIndex) { + EditChange expect = expected[expIndex]; + String msg = name + ' ' + expIndex; + if (withUnchanged || expect.change) { + assertTrue(msg, ei1.next()); + assertEquals(msg, expect.change, ei1.hasChange()); + assertEquals(msg, expect.oldLength, ei1.oldLength()); + assertEquals(msg, expect.newLength, ei1.newLength()); + assertEquals(msg, expSrcIndex, ei1.sourceIndex()); + assertEquals(msg, expDestIndex, ei1.destinationIndex()); + assertEquals(msg, expReplIndex, ei1.replacementIndex()); + } + + if (expect.oldLength > 0) { + assertTrue(msg, ei2.findSourceIndex(expSrcIndex)); + assertEquals(msg, expect.change, ei2.hasChange()); + assertEquals(msg, expect.oldLength, ei2.oldLength()); + assertEquals(msg, expect.newLength, ei2.newLength()); + assertEquals(msg, expSrcIndex, ei2.sourceIndex()); + assertEquals(msg, expDestIndex, ei2.destinationIndex()); + assertEquals(msg, expReplIndex, ei2.replacementIndex()); + if (!withUnchanged) { + // For some iterators, move past the current range + // so that findSourceIndex() has to look before the current index. 
+ ei2.next(); + ei2.next(); + } + } + + expSrcIndex += expect.oldLength; + expDestIndex += expect.newLength; + if (expect.change) { + expReplIndex += expect.newLength; + } + } + String msg = name + " end"; + assertFalse(msg, ei1.next()); + assertFalse(msg, ei1.hasChange()); + assertEquals(msg, 0, ei1.oldLength()); + assertEquals(msg, 0, ei1.newLength()); + assertEquals(msg, expSrcIndex, ei1.sourceIndex()); + assertEquals(msg, expDestIndex, ei1.destinationIndex()); + assertEquals(msg, expReplIndex, ei1.replacementIndex()); + + assertFalse(name, ei2.findSourceIndex(expSrcIndex)); + } + + @Test + public void TestEdits() { + Edits edits = new Edits(); + assertFalse("new Edits", edits.hasChanges()); + assertEquals("new Edits", 0, edits.lengthDelta()); + edits.addUnchanged(1); // multiple unchanged ranges are combined + edits.addUnchanged(10000); // too long, and they are split + edits.addReplace(0, 0); + edits.addUnchanged(2); + assertFalse("unchanged 10003", edits.hasChanges()); + assertEquals("unchanged 10003", 0, edits.lengthDelta()); + edits.addReplace(1, 1); // multiple short equal-length edits are compressed + edits.addUnchanged(0); + edits.addReplace(1, 1); + edits.addReplace(1, 1); + edits.addReplace(0, 10); + edits.addReplace(100, 0); + edits.addReplace(3000, 4000); // variable-length encoding + edits.addReplace(100000, 100000); + assertTrue("some edits", edits.hasChanges()); + assertEquals("some edits", 10 - 100 + 1000, edits.lengthDelta()); + + EditChange[] coarseExpectedChanges = new EditChange[] { + new EditChange(false, 10003, 10003), + new EditChange(true, 103103, 104013) + }; + checkEditsIter("coarse", + edits.getCoarseIterator(), edits.getCoarseIterator(), + coarseExpectedChanges, true); + checkEditsIter("coarse changes", + edits.getCoarseChangesIterator(), edits.getCoarseChangesIterator(), + coarseExpectedChanges, false); + + EditChange[] fineExpectedChanges = new EditChange[] { + new EditChange(false, 10003, 10003), + new EditChange(true, 1, 1), + new 
EditChange(true, 1, 1), + new EditChange(true, 1, 1), + new EditChange(true, 0, 10), + new EditChange(true, 100, 0), + new EditChange(true, 3000, 4000), + new EditChange(true, 100000, 100000) + }; + checkEditsIter("fine", + edits.getFineIterator(), edits.getFineIterator(), + fineExpectedChanges, true); + checkEditsIter("fine changes", + edits.getFineChangesIterator(), edits.getFineChangesIterator(), + fineExpectedChanges, false); + + edits.reset(); + assertFalse("reset", edits.hasChanges()); + assertEquals("reset", 0, edits.lengthDelta()); + Edits.Iterator ei = edits.getCoarseChangesIterator(); + assertFalse("reset then iterator", ei.next()); + } + + @Test + public void TestCaseMapWithEdits() { + StringBuilder sb = new StringBuilder(); + Edits edits = new Edits(); + + sb = CaseMap.toLower().omitUnchangedText().apply(TURKISH_LOCALE_, "IstanBul", sb, edits); + assertEquals("toLower(Istanbul)", "ıb", sb.toString()); + EditChange[] lowerExpectedChanges = new EditChange[] { + new EditChange(true, 1, 1), + new EditChange(false, 4, 4), + new EditChange(true, 1, 1), + new EditChange(false, 2, 2) + }; + checkEditsIter("toLower(Istanbul)", + edits.getFineIterator(), edits.getFineIterator(), + lowerExpectedChanges, true); + + sb.delete(0, sb.length()); + edits.reset(); + sb = CaseMap.toUpper().omitUnchangedText().apply(GREEK_LOCALE_, "Πατάτα", sb, edits); + assertEquals("toUpper(Πατάτα)", "ΑΤΑΤΑ", sb.toString()); + EditChange[] upperExpectedChanges = new EditChange[] { + new EditChange(false, 1, 1), + new EditChange(true, 1, 1), + new EditChange(true, 1, 1), + new EditChange(true, 1, 1), + new EditChange(true, 1, 1), + new EditChange(true, 1, 1) + }; + checkEditsIter("toUpper(Πατάτα)", + edits.getFineIterator(), edits.getFineIterator(), + upperExpectedChanges, true); + + sb.delete(0, sb.length()); + edits.reset(); + sb = CaseMap.toTitle().omitUnchangedText().noBreakAdjustment().noLowercase().apply( + new Locale("nl"), null, "IjssEL IglOo", sb, edits); + 
assertEquals("toTitle(IjssEL IglOo)", "J", sb.toString()); + EditChange[] titleExpectedChanges = new EditChange[] { + new EditChange(false, 1, 1), + new EditChange(true, 1, 1), + new EditChange(false, 10, 10) + }; + checkEditsIter("toTitle(IjssEL IglOo)", + edits.getFineIterator(), edits.getFineIterator(), + titleExpectedChanges, true); + + sb.delete(0, sb.length()); + edits.reset(); + sb = CaseMap.fold().omitUnchangedText().turkic().apply("IßtanBul", sb, edits); + assertEquals("fold(IßtanBul)", "ıssb", sb.toString()); + EditChange[] foldExpectedChanges = new EditChange[] { + new EditChange(true, 1, 1), + new EditChange(true, 1, 2), + new EditChange(false, 3, 3), + new EditChange(true, 1, 1), + new EditChange(false, 2, 2) + }; + checkEditsIter("fold(IßtanBul)", + edits.getFineIterator(), edits.getFineIterator(), + foldExpectedChanges, true); + } + // private data members - test data -------------------------------------- private static final Locale TURKISH_LOCALE_ = new Locale("tr", "TR"); @@ -945,7 +1132,7 @@ public final class UCharacterCaseTest extends TestFmwk // private methods ------------------------------------------------------- /** - * Converting the hex numbers represented betwee n ';' to Unicode strings + * Converting the hex numbers represented between ';' to Unicode strings * @param str string to break up into Unicode strings * @return array of Unicode strings ending with a null */ |