summary | refs | log | tree | commit | diff
path: root/android_icu4j/src/main/java/android/icu/text/UnicodeSet.java
diff options
context:
space:
mode:
Diffstat (limited to 'android_icu4j/src/main/java/android/icu/text/UnicodeSet.java')
-rw-r--r-- android_icu4j/src/main/java/android/icu/text/UnicodeSet.java 306
1 files changed, 227 insertions, 79 deletions
diff --git a/android_icu4j/src/main/java/android/icu/text/UnicodeSet.java b/android_icu4j/src/main/java/android/icu/text/UnicodeSet.java
index a85c57e16..297edbb0c 100644
--- a/android_icu4j/src/main/java/android/icu/text/UnicodeSet.java
+++ b/android_icu4j/src/main/java/android/icu/text/UnicodeSet.java
@@ -449,7 +449,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* for the syntax of the pattern language.
* @param pattern a string specifying what characters are in the set
* @param options a bitmask indicating which options to apply.
- * Valid options are IGNORE_SPACE and CASE.
+ * Valid options are {@link #IGNORE_SPACE} and
+ * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+ * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
* @exception java.lang.IllegalArgumentException if the pattern contains
* a syntax error.
*/
@@ -483,7 +485,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* @param symbols a symbol table mapping variables to char[] arrays
* and chars to UnicodeSets
* @param options a bitmask indicating which options to apply.
- * Valid options are IGNORE_SPACE and CASE.
+ * Valid options are {@link #IGNORE_SPACE} and
+ * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+ * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
* @exception java.lang.IllegalArgumentException if the pattern
* contains a syntax error.
*/
@@ -569,7 +573,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* See the class description for the syntax of the pattern language.
* @param pattern a string specifying what characters are in the set
* @param options a bitmask indicating which options to apply.
- * Valid options are IGNORE_SPACE and CASE.
+ * Valid options are {@link #IGNORE_SPACE} and
+ * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+ * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
* @exception java.lang.IllegalArgumentException if the pattern
* contains a syntax error.
*/
@@ -2512,8 +2518,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* variables, or null if none.
* @param rebuiltPat the pattern that was parsed, rebuilt or
* copied from the input pattern, as appropriate.
- * @param options a bit mask of zero or more of the following:
- * IGNORE_SPACE, CASE.
+ * @param options a bit mask.
+ * Valid options are {@link #IGNORE_SPACE} and
+ * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+ * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
*/
private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
Appendable rebuiltPat, int options, int depth) {
@@ -2893,8 +2901,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* to close over case BEFORE COMPLEMENTING. This makes
* patterns like /[^abc]/i work.
*/
- if ((options & CASE) != 0) {
- closeOver(CASE);
+ if ((options & CASE_MASK) != 0) {
+ closeOver(options);
}
if (invert) {
complement().removeAllStrings(); // code point complement
@@ -3784,28 +3792,74 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
*
* @deprecated ICU 73 Use {@link #CASE_INSENSITIVE} instead.
*/
+ @Deprecated
public static final int CASE = 2;
/**
- * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C
- * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h).
- * @see #CASE
+ * Enable case insensitive matching. E.g., "[ab]" with this flag
+ * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
+ * match all except 'a', 'A', 'b', and 'B'. This performs a full
+ * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
+ *
+ * <p>This value is an options bit set value for some
+ * constructors, applyPattern(), and closeOver().
+ * It can be ORed together with other, unrelated options.
+ *
+ * <p>The resulting set is a superset of the input for the code points but
+ * not for the strings.
+ * It performs a case mapping closure of the code points and adds
+ * full case folding strings for the code points, and reduces strings of
+ * the original set to their full case folding equivalents.
+ *
+ * <p>This is designed for case-insensitive matches, for example
+ * in regular expressions. The full code point case closure allows checking of
+ * an input character directly against the closure set.
+ * Strings are matched by comparing the case-folded form from the closure
+ * set with an incremental case folding of the string in question.
+ *
+ * <p>The closure set will also contain single code points if the original
+ * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
+ * This is not necessary (that is, redundant) for the above matching method
+ * but results in the same closure sets regardless of whether the original
+ * set contained the code point or a string.
*/
public static final int CASE_INSENSITIVE = 2;
/**
- * Bitmask for constructor, applyPattern(), and closeOver()
- * indicating letter case. This may be ORed together with other
- * selectors.
- *
- * Enable case insensitive matching. E.g., "[ab]" with this flag
- * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
- * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
- * title-, and uppercase mappings as well as the case folding
+ * Adds all case mappings for each element in the set.
+ * This adds the full lower-, title-, and uppercase mappings as well as the full case folding
* of each existing element in the set.
+ *
+ * <p>This value is an options bit set value for some
+ * constructors, applyPattern(), and closeOver().
+ * It can be ORed together with other, unrelated options.
+ *
+ * <p>Unlike the “case insensitive” options, this does not perform a closure.
+ * For example, it does not add 'ſ' (U+017F long s) for 's',
+ * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
*/
public static final int ADD_CASE_MAPPINGS = 4;
+ /**
+ * Enable case insensitive matching.
+ * Same as {@link #CASE_INSENSITIVE} but using only Simple_Case_Folding (scf) mappings,
+ * which map each code point to one code point,
+ * not full Case_Folding (cf) mappings, which map some code points to multiple code points.
+ *
+ * <p>This is designed for case-insensitive matches, for example in certain
+ * regular expression implementations where only Simple_Case_Folding mappings are used,
+ * such as in ECMAScript (JavaScript) regular expressions.
+ *
+ * <p>This value is an options bit set value for some
+ * constructors, applyPattern(), and closeOver().
+ * It can be ORed together with other, unrelated options.
+ *
+ * @hide draft / provisional / internal are hidden on Android
+ */
+ public static final int SIMPLE_CASE_INSENSITIVE = 6;
+
+ private static final int CASE_MASK = CASE_INSENSITIVE | ADD_CASE_MAPPINGS; // union of the case option bits
+
// add the result of a full case mapping to the set
// use full as a temporary string builder to avoid constructing one
private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) {
@@ -3823,96 +3877,190 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
// see UCaseProps
}
+ /**
+  * For case closure on a large set, narrows the code points to look at.
+  *
+  * <p>A set with fewer than 30 elements is returned as is. Otherwise the result is a new set
+  * holding the intersection of the code points of {@code src} with the Case_Sensitive
+  * binary property set.
+  *
+  * @param src the set whose code points are about to be case-closed
+  * @return {@code src} itself when it is small, otherwise a new intersection set
+  */
+ UnicodeSet maybeOnlyCaseSensitive(UnicodeSet src) {
+     if (src.size() >= 30) {
+         UnicodeSet caseSensitive =
+                 CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE);
+         // Clone whichever side is "smaller", and avoid copying the strings of src
+         // if it has any.
+         boolean startFromProperty =
+                 src.hasStrings() || src.getRangeCount() > caseSensitive.getRangeCount();
+         return startFromProperty
+                 ? caseSensitive.cloneAsThawed().retainAll(src)
+                 : ((UnicodeSet) src.clone()).retainAll(caseSensitive);
+     }
+     return src;
+ }
+
+ /**
+  * Builds the per-code-point scf = Simple_Case_Folding of a string.
+  * (Normally when we case-fold a string we use full case foldings.)
+  *
+  * @param s the string to fold
+  * @param scf receives the folded string, but only if folding changes at least one
+  *     code point; otherwise it is left untouched
+  * @return true if the folded form differs from {@code s} (and has been written to
+  *     {@code scf}), false if every code point of {@code s} folds to itself
+  */
+ private static final boolean scfString(CharSequence s, StringBuilder scf) {
+     final int length = s.length();
+     boolean changed = false;
+     int pos = 0;
+     while (pos < length) {
+         int cp = Character.codePointAt(s, pos);
+         int folded = UCharacter.foldCase(cp, UCharacter.FOLD_CASE_DEFAULT);
+         if (!changed && folded != cp) {
+             // First code point that folds to something else:
+             // start the output with the unmodified prefix before it.
+             scf.setLength(0);
+             scf.append(s, 0, pos);
+             changed = true;
+         }
+         if (changed) {
+             // From the first difference on, every folded code point is copied.
+             scf.appendCodePoint(folded);
+         }
+         pos += Character.charCount(cp);
+     }
+     return changed;
+ }
+
/**
* Close this set over the given attribute. For the attribute
- * CASE, the result is to modify this set so that:
+ * {@link #CASE_INSENSITIVE}, the result is to modify this set so that:
*
- * 1. For each character or string 'a' in this set, all strings
+ * <ol>
+ * <li>For each character or string 'a' in this set, all strings
* 'b' such that foldCase(a) == foldCase(b) are added to this set.
* (For most 'a' that are single characters, 'b' will have
* b.length() == 1.)
*
- * 2. For each string 'e' in the resulting set, if e !=
+ * <li>For each string 'e' in the resulting set, if e !=
* foldCase(e), 'e' will be removed.
+ * </ol>
*
- * Example: [aq\u00DF{Bc}{bC}{Fi}] =&gt; [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
+ * <p>Example: [aq\u00DF{Bc}{bC}{Fi}] =&gt; [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
*
- * (Here foldCase(x) refers to the operation
+ * <p>(Here foldCase(x) refers to the operation
* UCharacter.foldCase(x, true), and a == b actually denotes
* a.equals(b), not pointer comparison.)
*
* @param attribute bitmask for attributes to close over.
- * Currently only the CASE bit is supported. Any undefined bits
- * are ignored.
+ * Valid options:
+ * At most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+ * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
+ * Unrelated options bits are ignored.
* @return a reference to this set.
*/
public UnicodeSet closeOver(int attribute) {
checkFrozen();
- if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) {
- UCaseProps csp = UCaseProps.INSTANCE;
- UnicodeSet foldSet = new UnicodeSet(this);
- ULocale root = ULocale.ROOT;
-
- // start with input set to guarantee inclusion
- // CASE: remove strings because the strings will actually be reduced (folded);
- // therefore, start with no strings and add only those needed
- if((attribute & CASE) != 0 && foldSet.hasStrings()) {
- foldSet.strings.clear();
- }
-
- int n = getRangeCount();
- int result;
- StringBuilder full = new StringBuilder();
+ switch (attribute & CASE_MASK) {
+ case 0:
+ break;
+ case CASE_INSENSITIVE:
+ closeOverCaseInsensitive(/* simple= */ false);
+ break;
+ case ADD_CASE_MAPPINGS:
+ closeOverAddCaseMappings();
+ break;
+ case SIMPLE_CASE_INSENSITIVE:
+ closeOverCaseInsensitive(/* simple= */ true);
+ break;
+ default:
+ // bad option (unreachable)
+ break;
+ }
+ return this;
+ }
- for (int i=0; i<n; ++i) {
- int start = getRangeStart(i);
- int end = getRangeEnd(i);
+ private void closeOverCaseInsensitive(boolean simple) {
+ UCaseProps csp = UCaseProps.INSTANCE;
+ // Start with input set to guarantee inclusion.
+ UnicodeSet foldSet = new UnicodeSet(this);
- if((attribute & CASE) != 0) {
- // full case closure
- for (int cp=start; cp<=end; ++cp) {
- csp.addCaseClosure(cp, foldSet);
- }
- } else {
- // add case mappings
- // (does not add long s for regular s, or Kelvin for k, for example)
- for (int cp=start; cp<=end; ++cp) {
- result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
- addCaseMapping(foldSet, result, full);
+ // Full case mappings closure:
+ // Remove strings because the strings will actually be reduced (folded);
+ // therefore, start with no strings and add only those needed.
+ // Do this before processing code points, because they may add strings.
+ if (!simple && foldSet.hasStrings()) {
+ foldSet.strings.clear();
+ }
- result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
- addCaseMapping(foldSet, result, full);
+ UnicodeSet codePoints = maybeOnlyCaseSensitive(this);
- result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
- addCaseMapping(foldSet, result, full);
+ // Iterate over the ranges of single code points. Nested loop for each code point.
+ int n = codePoints.getRangeCount();
+ for (int i=0; i<n; ++i) {
+ int start = codePoints.getRangeStart(i);
+ int end = codePoints.getRangeEnd(i);
- result = csp.toFullFolding(cp, full, 0);
- addCaseMapping(foldSet, result, full);
- }
+ if (simple) {
+ for (int cp=start; cp<=end; ++cp) {
+ csp.addSimpleCaseClosure(cp, foldSet);
+ }
+ } else {
+ for (int cp=start; cp<=end; ++cp) {
+ csp.addCaseClosure(cp, foldSet);
}
}
- if (hasStrings()) {
- if ((attribute & CASE) != 0) {
- for (String s : strings) {
- String str = UCharacter.foldCase(s, 0);
- if(!csp.addStringCaseClosure(str, foldSet)) {
- foldSet.add(str); // does not map to code points: add the folded string itself
- }
+ }
+ if (hasStrings()) {
+ StringBuilder sb = simple ? new StringBuilder() : null;
+ for (String s : strings) {
+ if (simple) {
+ if (scfString(s, sb)) {
+ foldSet.remove(s).add(sb);
}
} else {
- BreakIterator bi = BreakIterator.getWordInstance(root);
- for (String str : strings) {
- // TODO: call lower-level functions
- foldSet.add(UCharacter.toLowerCase(root, str));
- foldSet.add(UCharacter.toTitleCase(root, str, bi));
- foldSet.add(UCharacter.toUpperCase(root, str));
- foldSet.add(UCharacter.foldCase(str, 0));
+ String str = UCharacter.foldCase(s, 0);
+ if(!csp.addStringCaseClosure(str, foldSet)) {
+ foldSet.add(str); // does not map to code points: add the folded string itself
}
}
}
- set(foldSet);
}
- return this;
+ set(foldSet);
+ }
+
+ private void closeOverAddCaseMappings() {
+ UCaseProps csp = UCaseProps.INSTANCE;
+ // Start with input set to guarantee inclusion.
+ UnicodeSet foldSet = new UnicodeSet(this);
+
+ UnicodeSet codePoints = maybeOnlyCaseSensitive(this);
+
+ // Iterate over the ranges of single code points. Nested loop for each code point.
+ int n = codePoints.getRangeCount();
+ int result;
+ StringBuilder full = new StringBuilder();
+
+ for (int i=0; i<n; ++i) {
+ int start = codePoints.getRangeStart(i);
+ int end = codePoints.getRangeEnd(i);
+
+ // add case mappings
+ // (does not add long s for regular s, or Kelvin for k, for example)
+ for (int cp=start; cp<=end; ++cp) {
+ result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
+ addCaseMapping(foldSet, result, full);
+
+ result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
+ addCaseMapping(foldSet, result, full);
+
+ result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
+ addCaseMapping(foldSet, result, full);
+
+ result = csp.toFullFolding(cp, full, 0);
+ addCaseMapping(foldSet, result, full);
+ }
+ }
+ if (hasStrings()) {
+ ULocale root = ULocale.ROOT;
+ BreakIterator bi = BreakIterator.getWordInstance(root);
+ for (String str : strings) {
+ // TODO: call lower-level functions
+ foldSet.add(UCharacter.toLowerCase(root, str));
+ foldSet.add(UCharacter.toTitleCase(root, str, bi));
+ foldSet.add(UCharacter.toUpperCase(root, str));
+ foldSet.add(UCharacter.foldCase(str, 0));
+ }
+ }
+ set(foldSet);
}
/**
@@ -4513,7 +4661,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString())
* and comparing, but much faster (no object creation).
* Actually, there is one difference; a null compares as less.
- * Note that this (=String) order is UTF-16 order -- *not* code point order.
+ * Note that this (=String) order is UTF-16 order -- <i>not</i> code point order.
* @hide unsupported on Android
*/
@@ -4525,7 +4673,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
* Utility to compare a string to a code point.
* Same results as turning the code point into a string and comparing, but much faster (no object creation).
* Actually, there is one difference; a null compares as less.
- * Note that this (=String) order is UTF-16 order -- *not* code point order.
+ * Note that this (=String) order is UTF-16 order -- <i>not</i> code point order.
* @hide unsupported on Android
*/
public static int compare(int codePoint, CharSequence string) {