diff options
Diffstat (limited to 'android_icu4j/src/main/java/android/icu/text/UnicodeSet.java')
-rw-r--r-- | android_icu4j/src/main/java/android/icu/text/UnicodeSet.java | 306 |
1 files changed, 227 insertions, 79 deletions
diff --git a/android_icu4j/src/main/java/android/icu/text/UnicodeSet.java b/android_icu4j/src/main/java/android/icu/text/UnicodeSet.java index a85c57e16..297edbb0c 100644 --- a/android_icu4j/src/main/java/android/icu/text/UnicodeSet.java +++ b/android_icu4j/src/main/java/android/icu/text/UnicodeSet.java @@ -449,7 +449,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa * for the syntax of the pattern language. * @param pattern a string specifying what characters are in the set * @param options a bitmask indicating which options to apply. - * Valid options are IGNORE_SPACE and CASE. + * Valid options are {@link #IGNORE_SPACE} and + * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, + * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. * @exception java.lang.IllegalArgumentException if the pattern contains * a syntax error. */ @@ -483,7 +485,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa * @param symbols a symbol table mapping variables to char[] arrays * and chars to UnicodeSets * @param options a bitmask indicating which options to apply. - * Valid options are IGNORE_SPACE and CASE. + * Valid options are {@link #IGNORE_SPACE} and + * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, + * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. * @exception java.lang.IllegalArgumentException if the pattern * contains a syntax error. */ @@ -569,7 +573,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa * See the class description for the syntax of the pattern language. * @param pattern a string specifying what characters are in the set * @param options a bitmask indicating which options to apply. - * Valid options are IGNORE_SPACE and CASE. + * Valid options are {@link #IGNORE_SPACE} and + * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, + * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. * @exception java.lang.IllegalArgumentException if the pattern * contains a syntax error. */ @@ -2512,8 +2518,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa * variables, or null if none. * @param rebuiltPat the pattern that was parsed, rebuilt or * copied from the input pattern, as appropriate. - * @param options a bit mask of zero or more of the following: - * IGNORE_SPACE, CASE. + * @param options a bit mask. + * Valid options are {@link #IGNORE_SPACE} and + * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, + * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. */ private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols, Appendable rebuiltPat, int options, int depth) { @@ -2893,8 +2901,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa * to close over case BEFORE COMPLEMENTING. This makes * patterns like /[^abc]/i work. */ - if ((options & CASE) != 0) { - closeOver(CASE); + if ((options & CASE_MASK) != 0) { + closeOver(options); } if (invert) { complement().removeAllStrings(); // code point complement @@ -3784,28 +3792,74 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa * * @deprecated ICU 73 Use {@link #CASE_INSENSITIVE} instead. */ + @Deprecated public static final int CASE = 2; /** - * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C - * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h). - * @see #CASE + * Enable case insensitive matching. E.g., "[ab]" with this flag + * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will + * match all except 'a', 'A', 'b', and 'B'. This performs a full + * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'. + * + * <p>This value is an options bit set value for some + * constructors, applyPattern(), and closeOver(). + * It can be ORed together with other, unrelated options. + * + * <p>The resulting set is a superset of the input for the code points but + * not for the strings. + * It performs a case mapping closure of the code points and adds + * full case folding strings for the code points, and reduces strings of + * the original set to their full case folding equivalents. + * + * <p>This is designed for case-insensitive matches, for example + * in regular expressions. The full code point case closure allows checking of + * an input character directly against the closure set. + * Strings are matched by comparing the case-folded form from the closure + * set with an incremental case folding of the string in question. + * + * <p>The closure set will also contain single code points if the original + * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). + * This is not necessary (that is, redundant) for the above matching method + * but results in the same closure sets regardless of whether the original + * set contained the code point or a string. */ public static final int CASE_INSENSITIVE = 2; /** - * Bitmask for constructor, applyPattern(), and closeOver() - * indicating letter case. This may be ORed together with other - * selectors. - * - * Enable case insensitive matching. E.g., "[ab]" with this flag - * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will - * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, - * title-, and uppercase mappings as well as the case folding + * Adds all case mappings for each element in the set. + * This adds the full lower-, title-, and uppercase mappings as well as the full case folding * of each existing element in the set. + * + * <p>This value is an options bit set value for some + * constructors, applyPattern(), and closeOver(). + * It can be ORed together with other, unrelated options. + * + * <p>Unlike the “case insensitive” options, this does not perform a closure. + * For example, it does not add 'ſ' (U+017F long s) for 's', + * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions. */ public static final int ADD_CASE_MAPPINGS = 4; + /** + * Enable case insensitive matching. + * Same as {@link #CASE_INSENSITIVE} but using only Simple_Case_Folding (scf) mappings, + * which map each code point to one code point, + * not full Case_Folding (cf) mappings, which map some code points to multiple code points. + * + * <p>This is designed for case-insensitive matches, for example in certain + * regular expression implementations where only Simple_Case_Folding mappings are used, + * such as in ECMAScript (JavaScript) regular expressions. + * + * <p>This value is an options bit set value for some + * constructors, applyPattern(), and closeOver(). + * It can be ORed together with other, unrelated options. + * + * @hide draft / provisional / internal are hidden on Android + */ + public static final int SIMPLE_CASE_INSENSITIVE = 6; + + private static final int CASE_MASK = CASE_INSENSITIVE | ADD_CASE_MAPPINGS; + // add the result of a full case mapping to the set // use str as a temporary string to avoid constructing one private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) { @@ -3823,96 +3877,190 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa // see UCaseProps } + /** For case closure on a large set, look only at code points with relevant properties. */ + UnicodeSet maybeOnlyCaseSensitive(UnicodeSet src) { + if (src.size() < 30) { + return src; + } + // Return the intersection of the src code points with Case_Sensitive ones. + UnicodeSet sensitive = CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE); + // Start by cloning the "smaller" set. Try not to copy the strings, if there are any in src. + if (src.hasStrings() || src.getRangeCount() > sensitive.getRangeCount()) { + return sensitive.cloneAsThawed().retainAll(src); + } else { + return ((UnicodeSet) src.clone()).retainAll(sensitive); + } + } + + // Per-character scf = Simple_Case_Folding of a string. + // (Normally when we case-fold a string we use full case foldings.) + private static final boolean scfString(CharSequence s, StringBuilder scf) { + int length = s.length(); + // Loop while not needing modification. + for (int i = 0; i < length;) { + int c = Character.codePointAt(s, i); + int scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT); + if (scfChar != c) { + // Copy the characters before c. + scf.setLength(0); + scf.append(s, 0, i); + // Loop over the rest of the string and keep case-folding. + for (;;) { + scf.appendCodePoint(scfChar); + i += Character.charCount(c); + if (i == length) { + return true; + } + c = Character.codePointAt(s, i); + scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT); + } + } + i += Character.charCount(c); + } + return false; + } + /** * Close this set over the given attribute. For the attribute - * CASE, the result is to modify this set so that: + * {@link #CASE_INSENSITIVE}, the result is to modify this set so that: * - * 1. For each character or string 'a' in this set, all strings + * <ol> + * <li>For each character or string 'a' in this set, all strings * 'b' such that foldCase(a) == foldCase(b) are added to this set. * (For most 'a' that are single characters, 'b' will have * b.length() == 1.) * - * 2. For each string 'e' in the resulting set, if e != + * <li>For each string 'e' in the resulting set, if e != * foldCase(e), 'e' will be removed. + * </ol> * - * Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}] + * <p>Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}] * - * (Here foldCase(x) refers to the operation + * <p>(Here foldCase(x) refers to the operation * UCharacter.foldCase(x, true), and a == b actually denotes * a.equals(b), not pointer comparison.) * * @param attribute bitmask for attributes to close over. - * Currently only the CASE bit is supported. Any undefined bits - * are ignored. + * Valid options: + * At most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, + * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. + * Unrelated options bits are ignored. * @return a reference to this set. */ public UnicodeSet closeOver(int attribute) { checkFrozen(); - if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) { - UCaseProps csp = UCaseProps.INSTANCE; - UnicodeSet foldSet = new UnicodeSet(this); - ULocale root = ULocale.ROOT; - - // start with input set to guarantee inclusion - // CASE: remove strings because the strings will actually be reduced (folded); - // therefore, start with no strings and add only those needed - if((attribute & CASE) != 0 && foldSet.hasStrings()) { - foldSet.strings.clear(); - } - - int n = getRangeCount(); - int result; - StringBuilder full = new StringBuilder(); + switch (attribute & CASE_MASK) { + case 0: + break; + case CASE_INSENSITIVE: + closeOverCaseInsensitive(/* simple= */ false); + break; + case ADD_CASE_MAPPINGS: + closeOverAddCaseMappings(); + break; + case SIMPLE_CASE_INSENSITIVE: + closeOverCaseInsensitive(/* simple= */ true); + break; + default: + // bad option (unreachable) + break; + } + return this; + } - for (int i=0; i<n; ++i) { - int start = getRangeStart(i); - int end = getRangeEnd(i); + private void closeOverCaseInsensitive(boolean simple) { + UCaseProps csp = UCaseProps.INSTANCE; + // Start with input set to guarantee inclusion. + UnicodeSet foldSet = new UnicodeSet(this); - if((attribute & CASE) != 0) { - // full case closure - for (int cp=start; cp<=end; ++cp) { - csp.addCaseClosure(cp, foldSet); - } - } else { - // add case mappings - // (does not add long s for regular s, or Kelvin for k, for example) - for (int cp=start; cp<=end; ++cp) { - result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT); - addCaseMapping(foldSet, result, full); + // Full case mappings closure: + // Remove strings because the strings will actually be reduced (folded); + // therefore, start with no strings and add only those needed. + // Do this before processing code points, because they may add strings. + if (!simple && foldSet.hasStrings()) { + foldSet.strings.clear(); + } - result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT); - addCaseMapping(foldSet, result, full); + UnicodeSet codePoints = maybeOnlyCaseSensitive(this); - result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT); - addCaseMapping(foldSet, result, full); + // Iterate over the ranges of single code points. Nested loop for each code point. + int n = codePoints.getRangeCount(); + for (int i=0; i<n; ++i) { + int start = codePoints.getRangeStart(i); + int end = codePoints.getRangeEnd(i); - result = csp.toFullFolding(cp, full, 0); - addCaseMapping(foldSet, result, full); - } + if (simple) { + for (int cp=start; cp<=end; ++cp) { + csp.addSimpleCaseClosure(cp, foldSet); + } + } else { + for (int cp=start; cp<=end; ++cp) { + csp.addCaseClosure(cp, foldSet); } } - if (hasStrings()) { - if ((attribute & CASE) != 0) { - for (String s : strings) { - String str = UCharacter.foldCase(s, 0); - if(!csp.addStringCaseClosure(str, foldSet)) { - foldSet.add(str); // does not map to code points: add the folded string itself - } + } + if (hasStrings()) { + StringBuilder sb = simple ? new StringBuilder() : null; + for (String s : strings) { + if (simple) { + if (scfString(s, sb)) { + foldSet.remove(s).add(sb); } } else { - BreakIterator bi = BreakIterator.getWordInstance(root); - for (String str : strings) { - // TODO: call lower-level functions - foldSet.add(UCharacter.toLowerCase(root, str)); - foldSet.add(UCharacter.toTitleCase(root, str, bi)); - foldSet.add(UCharacter.toUpperCase(root, str)); - foldSet.add(UCharacter.foldCase(str, 0)); + String str = UCharacter.foldCase(s, 0); + if(!csp.addStringCaseClosure(str, foldSet)) { + foldSet.add(str); // does not map to code points: add the folded string itself } } } - set(foldSet); } - return this; + set(foldSet); + } + + private void closeOverAddCaseMappings() { + UCaseProps csp = UCaseProps.INSTANCE; + // Start with input set to guarantee inclusion. + UnicodeSet foldSet = new UnicodeSet(this); + + UnicodeSet codePoints = maybeOnlyCaseSensitive(this); + + // Iterate over the ranges of single code points. Nested loop for each code point. + int n = codePoints.getRangeCount(); + int result; + StringBuilder full = new StringBuilder(); + + for (int i=0; i<n; ++i) { + int start = codePoints.getRangeStart(i); + int end = codePoints.getRangeEnd(i); + + // add case mappings + // (does not add long s for regular s, or Kelvin for k, for example) + for (int cp=start; cp<=end; ++cp) { + result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT); + addCaseMapping(foldSet, result, full); + + result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT); + addCaseMapping(foldSet, result, full); + + result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT); + addCaseMapping(foldSet, result, full); + + result = csp.toFullFolding(cp, full, 0); + addCaseMapping(foldSet, result, full); + } + } + if (hasStrings()) { + ULocale root = ULocale.ROOT; + BreakIterator bi = BreakIterator.getWordInstance(root); + for (String str : strings) { + // TODO: call lower-level functions + foldSet.add(UCharacter.toLowerCase(root, str)); + foldSet.add(UCharacter.toTitleCase(root, str, bi)); + foldSet.add(UCharacter.toUpperCase(root, str)); + foldSet.add(UCharacter.foldCase(str, 0)); + } + } + set(foldSet); } /** @@ -4513,7 +4661,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa * Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString()) * and comparing, but much faster (no object creation). * Actually, there is one difference; a null compares as less. - * Note that this (=String) order is UTF-16 order -- *not* code point order. + * Note that this (=String) order is UTF-16 order -- <i>not</i> code point order. * @hide unsupported on Android */ @@ -4525,7 +4673,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa * Utility to compare a string to a code point. * Same results as turning the code point into a string and comparing, but much faster (no object creation). * Actually, there is one difference; a null compares as less. - * Note that this (=String) order is UTF-16 order -- *not* code point order. + * Note that this (=String) order is UTF-16 order -- <i>not</i> code point order. * @hide unsupported on Android */ public static int compare(int codePoint, CharSequence string) { |