1 files changed, 227 insertions, 79 deletions
diff --git a/android_icu4j/src/main/java/android/icu/text/UnicodeSet.java b/android_icu4j/src/main/java/android/icu/text/UnicodeSet.java
index a85c57e16..297edbb0c 100644
--- a/android_icu4j/src/main/java/android/icu/text/UnicodeSet.java
+++ b/android_icu4j/src/main/java/android/icu/text/UnicodeSet.java
@@ -449,7 +449,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
      * for the syntax of the pattern language.
      * @param pattern a string specifying what characters are in the set
      * @param options a bitmask indicating which options to apply.
-     * Valid options are IGNORE_SPACE and CASE.
+     * Valid options are {@link #IGNORE_SPACE} and
+     * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+     * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
      * @exception java.lang.IllegalArgumentException if the pattern contains
      * a syntax error.
      */
@@ -483,7 +485,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
      * @param symbols a symbol table mapping variables to char[] arrays
      * and chars to UnicodeSets
      * @param options a bitmask indicating which options to apply.
-     * Valid options are IGNORE_SPACE and CASE.
+     * Valid options are {@link #IGNORE_SPACE} and
+     * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+     * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
      * @exception java.lang.IllegalArgumentException if the pattern
      * contains a syntax error.
      */
@@ -569,7 +573,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
      * See the class description for the syntax of the pattern language.
      * @param pattern a string specifying what characters are in the set
      * @param options a bitmask indicating which options to apply.
-     * Valid options are IGNORE_SPACE and CASE.
+     * Valid options are {@link #IGNORE_SPACE} and
+     * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+     * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
      * @exception java.lang.IllegalArgumentException if the pattern
      * contains a syntax error.
      */
@@ -2512,8 +2518,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
      * variables, or null if none.
      * @param rebuiltPat the pattern that was parsed, rebuilt or
      * copied from the input pattern, as appropriate.
-     * @param options a bit mask of zero or more of the following:
-     * IGNORE_SPACE, CASE.
+     * @param options a bit mask.
+     * Valid options are {@link #IGNORE_SPACE} and
+     * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+     * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
      */
     private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
             Appendable rebuiltPat, int options, int depth) {
@@ -2893,8 +2901,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
          * to close over case BEFORE COMPLEMENTING.  This makes
          * patterns like /[^abc]/i work.
          */
-        if ((options & CASE) != 0) {
-            closeOver(CASE);
+        if ((options & CASE_MASK) != 0) {
+            closeOver(options);
         }
         if (invert) {
             complement().removeAllStrings();  // code point complement
@@ -3784,28 +3792,74 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
      *
      * @deprecated ICU 73 Use {@link #CASE_INSENSITIVE} instead.
      */
+    @Deprecated
     public static final int CASE = 2;
 
     /**
-     * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C
-     * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h).
-     * @see #CASE
+     * Enable case insensitive matching.  E.g., "[ab]" with this flag
+     * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
+     * match all except 'a', 'A', 'b', and 'B'. This performs a full
+     * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
+     *
+     * <p>This value is an options bit set value for some
+     * constructors, applyPattern(), and closeOver().
+     * It can be ORed together with other, unrelated options. It has the same value as {@link #CASE}.
+     *
+     * <p>The resulting set is a superset of the input for the code points but
+     * not for the strings.
+     * It performs a case mapping closure of the code points and adds
+     * full case folding strings for the code points, and reduces strings of
+     * the original set to their full case folding equivalents.
+     *
+     * <p>This is designed for case-insensitive matches, for example
+     * in regular expressions. The full code point case closure allows checking of
+     * an input character directly against the closure set.
+     * Strings are matched by comparing the case-folded form from the closure
+     * set with an incremental case folding of the string in question.
+     *
+     * <p>The closure set will also contain single code points if the original
+     * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
+     * This is not necessary (that is, redundant) for the above matching method
+     * but results in the same closure sets regardless of whether the original
+     * set contained the code point or a string.
      */
     public static final int CASE_INSENSITIVE = 2;
 
     /**
-     * Bitmask for constructor, applyPattern(), and closeOver()
-     * indicating letter case.  This may be ORed together with other
-     * selectors.
-     *
-     * Enable case insensitive matching.  E.g., "[ab]" with this flag
-     * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
-     * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
-     * title-, and uppercase mappings as well as the case folding
+     * Adds all case mappings for each element in the set.
+     * This adds the full lower-, title-, and uppercase mappings as well as the full case folding
      * of each existing element in the set.
+     *
+     * <p>This value is an options bit set value for some
+     * constructors, applyPattern(), and closeOver().
+     * It can be ORed together with other, unrelated options.
+     *
+     * <p>Unlike the “case insensitive” options, this does not perform a closure.
+     * For example, it does not add 'ſ' (U+017F long s) for 's',
+     * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
      */
     public static final int ADD_CASE_MAPPINGS = 4;
 
+    /**
+     * Enable case insensitive matching.
+     * Same as {@link #CASE_INSENSITIVE} but using only Simple_Case_Folding (scf) mappings,
+     * which map each code point to one code point,
+     * not full Case_Folding (cf) mappings, which map some code points to multiple code points.
+     *
+     * <p>This is designed for case-insensitive matches, for example in certain
+     * regular expression implementations where only Simple_Case_Folding mappings are used,
+     * such as in ECMAScript (JavaScript) regular expressions.
+     *
+     * <p>This value is an options bit set value for some
+     * constructors, applyPattern(), and closeOver().
+     * It can be ORed together with other, unrelated options.
+     * Its numeric value has both the CASE_INSENSITIVE and ADD_CASE_MAPPINGS bits set.
+     * @hide draft / provisional / internal are hidden on Android
+     */
+    public static final int SIMPLE_CASE_INSENSITIVE = 6;
+
+    private static final int CASE_MASK = CASE_INSENSITIVE | ADD_CASE_MAPPINGS;  // all case option bits
+
     //  add the result of a full case mapping to the set
     //  use str as a temporary string to avoid constructing one
     private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) {
@@ -3823,96 +3877,190 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
         // see UCaseProps
     }
 
+    /** Before case closure, narrows a large set to its Case_Sensitive code points only. */
+    UnicodeSet maybeOnlyCaseSensitive(UnicodeSet src) {
+        if (src.size() < 30) {
+            return src;  // small set: hand back the caller's set itself, not a copy
+        }
+        UnicodeSet caseSensitive = CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE);
+        // Intersect starting from a copy of the set with fewer ranges; when src carries strings,
+        // always copy the property set instead, so that those strings are not copied.
+        boolean copyPropertySet =
+                src.hasStrings() || src.getRangeCount() > caseSensitive.getRangeCount();
+        UnicodeSet base = copyPropertySet ? caseSensitive.cloneAsThawed() : (UnicodeSet) src.clone();
+        UnicodeSet other = copyPropertySet ? src : caseSensitive;
+        return base.retainAll(other);
+    }
+
+    // Per-code-point scf = Simple_Case_Folding of a string: returns true with the folded text
+    // in scf if folding changes s, or false (scf left untouched) if s is already folded.
+    // (Normally when we case-fold a string we use full case foldings.)
+    private static final boolean scfString(CharSequence s, StringBuilder scf) {
+        final int limit = s.length();
+        // Skip the leading code points that fold to themselves.
+        int pos = 0;
+        while (pos < limit) {
+            int cp = Character.codePointAt(s, pos);
+            if (UCharacter.foldCase(cp, UCharacter.FOLD_CASE_DEFAULT) != cp) {
+                break;
+            }
+            pos += Character.charCount(cp);
+        }
+        if (pos >= limit) {
+            return false;
+        }
+        // Keep the unchanged prefix, then fold every remaining code point.
+        scf.setLength(0);
+        scf.append(s, 0, pos);
+        while (pos < limit) {
+            int cp = Character.codePointAt(s, pos);
+            scf.appendCodePoint(UCharacter.foldCase(cp, UCharacter.FOLD_CASE_DEFAULT));
+            pos += Character.charCount(cp);
+        }
+        return true;
+    }
+
     /**
      * Close this set over the given attribute.  For the attribute
-     * CASE, the result is to modify this set so that:
+     * {@link #CASE_INSENSITIVE}, the result is to modify this set so that:
      *
-     * 1. For each character or string 'a' in this set, all strings
+     * <ol>
+     * <li>For each character or string 'a' in this set, all strings
      * 'b' such that foldCase(a) == foldCase(b) are added to this set.
      * (For most 'a' that are single characters, 'b' will have
      * b.length() == 1.)
      *
-     * 2. For each string 'e' in the resulting set, if e !=
+     * <li>For each string 'e' in the resulting set, if e !=
      * foldCase(e), 'e' will be removed.
+     * </ol>
      *
-     * Example: [aq\u00DF{Bc}{bC}{Fi}] =&gt; [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
+     * <p>Example: [aq\u00DF{Bc}{bC}{Fi}] =&gt; [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
      *
-     * (Here foldCase(x) refers to the operation
+     * <p>(Here foldCase(x) refers to the operation
      * UCharacter.foldCase(x, true), and a == b actually denotes
      * a.equals(b), not pointer comparison.)
      *
      * @param attribute bitmask for attributes to close over.
-     * Currently only the CASE bit is supported.  Any undefined bits
-     * are ignored.
+     * Valid options: at most one of the mutually exclusive case options
+     * {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+     * {@link #SIMPLE_CASE_INSENSITIVE}. With none of them, the set is left unchanged.
+     * Unrelated option bits are ignored.
      * @return a reference to this set.
      */
     public UnicodeSet closeOver(int attribute) {
         checkFrozen();
-        if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) {
-            UCaseProps csp = UCaseProps.INSTANCE;
-            UnicodeSet foldSet = new UnicodeSet(this);
-            ULocale root = ULocale.ROOT;
-
-            // start with input set to guarantee inclusion
-            // CASE: remove strings because the strings will actually be reduced (folded);
-            //       therefore, start with no strings and add only those needed
-            if((attribute & CASE) != 0 && foldSet.hasStrings()) {
-                foldSet.strings.clear();
-            }
-
-            int n = getRangeCount();
-            int result;
-            StringBuilder full = new StringBuilder();
+        switch (attribute & CASE_MASK) {
+        case 0:  // no case option bit set: nothing to close over
+            break;
+        case CASE_INSENSITIVE:
+            closeOverCaseInsensitive(/* simple= */ false);
+            break;
+        case ADD_CASE_MAPPINGS:
+            closeOverAddCaseMappings();
+            break;
+        case SIMPLE_CASE_INSENSITIVE:  // both case option bits set
+            closeOverCaseInsensitive(/* simple= */ true);
+            break;
+        default:
+            // bad option (unreachable: the four cases above cover every value of the masked bits)
+            break;
+        }
+        return this;
+    }
 
-            for (int i=0; i<n; ++i) {
-                int start = getRangeStart(i);
-                int end   = getRangeEnd(i);
+    private void closeOverCaseInsensitive(boolean simple) {  // simple: simple, not full, case closure
+        UCaseProps csp = UCaseProps.INSTANCE;
+        // Start with input set to guarantee inclusion.
+        UnicodeSet foldSet = new UnicodeSet(this);
 
-                if((attribute & CASE) != 0) {
-                    // full case closure
-                    for (int cp=start; cp<=end; ++cp) {
-                        csp.addCaseClosure(cp, foldSet);
-                    }
-                } else {
-                    // add case mappings
-                    // (does not add long s for regular s, or Kelvin for k, for example)
-                    for (int cp=start; cp<=end; ++cp) {
-                        result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
-                        addCaseMapping(foldSet, result, full);
+        // Full case mappings closure only (in simple mode the copied strings are kept):
+        // Remove strings because the strings will actually be reduced (folded);
+        // therefore, start with no strings and add only those needed.
+        // Do this before processing code points, because they may add strings.
+        if (!simple && foldSet.hasStrings()) {
+            foldSet.strings.clear();
+        }
 
-                        result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
-                        addCaseMapping(foldSet, result, full);
+        UnicodeSet codePoints = maybeOnlyCaseSensitive(this);  // large sets: Case_Sensitive code points only
 
-                        result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
-                        addCaseMapping(foldSet, result, full);
+        // Iterate over the ranges of single code points. Nested loop for each code point.
+        int n = codePoints.getRangeCount();
+        for (int i=0; i<n; ++i) {
+            int start = codePoints.getRangeStart(i);
+            int end   = codePoints.getRangeEnd(i);
 
-                        result = csp.toFullFolding(cp, full, 0);
-                        addCaseMapping(foldSet, result, full);
-                    }
+            if (simple) {
+                for (int cp=start; cp<=end; ++cp) {
+                    csp.addSimpleCaseClosure(cp, foldSet);
+                }
+            } else {
+                for (int cp=start; cp<=end; ++cp) {
+                    csp.addCaseClosure(cp, foldSet);
                 }
             }
-            if (hasStrings()) {
-                if ((attribute & CASE) != 0) {
-                    for (String s : strings) {
-                        String str = UCharacter.foldCase(s, 0);
-                        if(!csp.addStringCaseClosure(str, foldSet)) {
-                            foldSet.add(str); // does not map to code points: add the folded string itself
-                        }
+        }
+        if (hasStrings()) {
+            StringBuilder sb = simple ? new StringBuilder() : null;  // scratch buffer, simple mode only
+            for (String s : strings) {
+                if (simple) {
+                    if (scfString(s, sb)) {  // true only if folding changed s; sb then holds the folded text
+                        foldSet.remove(s).add(sb);  // swap the string for its simple case folding
                     }
                 } else {
-                    BreakIterator bi = BreakIterator.getWordInstance(root);
-                    for (String str : strings) {
-                        // TODO: call lower-level functions
-                        foldSet.add(UCharacter.toLowerCase(root, str));
-                        foldSet.add(UCharacter.toTitleCase(root, str, bi));
-                        foldSet.add(UCharacter.toUpperCase(root, str));
-                        foldSet.add(UCharacter.foldCase(str, 0));
+                    String str = UCharacter.foldCase(s, 0);
+                    if(!csp.addStringCaseClosure(str, foldSet)) {
+                        foldSet.add(str); // does not map to code points: add the folded string itself
                     }
                 }
             }
-            set(foldSet);
         }
-        return this;
+        set(foldSet);
+    }
+
+    private void closeOverAddCaseMappings() {
+        final UCaseProps caseProps = UCaseProps.INSTANCE;
+        // Build the result in a copy of this set, so that every original element stays included.
+        final UnicodeSet expanded = new UnicodeSet(this);
+
+        // Code points: walk the ranges of the (possibly narrowed) code point set and add,
+        // for each code point, its full lower-, title- and uppercase mappings and its
+        // full case folding.
+        // (does not add long s for regular s, or Kelvin for k, for example)
+        final UnicodeSet candidates = maybeOnlyCaseSensitive(this);
+        final StringBuilder mapped = new StringBuilder();
+        final int rangeCount = candidates.getRangeCount();
+        for (int r = 0; r < rangeCount; ++r) {
+            final int first = candidates.getRangeStart(r);
+            final int last = candidates.getRangeEnd(r);
+            for (int cp = first; cp <= last; ++cp) {
+                addCaseMapping(
+                        expanded, caseProps.toFullLower(cp, null, mapped, UCaseProps.LOC_ROOT),
+                        mapped);
+                addCaseMapping(
+                        expanded, caseProps.toFullTitle(cp, null, mapped, UCaseProps.LOC_ROOT),
+                        mapped);
+                addCaseMapping(
+                        expanded, caseProps.toFullUpper(cp, null, mapped, UCaseProps.LOC_ROOT),
+                        mapped);
+                addCaseMapping(expanded, caseProps.toFullFolding(cp, mapped, 0), mapped);
+            }
+        }
+
+        // Strings: add the root-locale lower-, title- and uppercased forms and the
+        // case-folded form of each string of this set.
+        if (hasStrings()) {
+            final ULocale rootLocale = ULocale.ROOT;
+            final BreakIterator wordBreaks = BreakIterator.getWordInstance(rootLocale);
+            for (String original : strings) {
+                // TODO: call lower-level functions
+                expanded.add(UCharacter.toLowerCase(rootLocale, original));
+                expanded.add(UCharacter.toTitleCase(rootLocale, original, wordBreaks));
+                expanded.add(UCharacter.toUpperCase(rootLocale, original));
+                expanded.add(UCharacter.foldCase(original, 0));
+            }
+        }
+
+        set(expanded);
     }
 
     /**
@@ -4513,7 +4661,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
      * Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString())
      * and comparing, but much faster (no object creation).
      * Actually, there is one difference; a null compares as less.
-     * Note that this (=String) order is UTF-16 order -- *not* code point order.
+     * Note that this (=String) order is UTF-16 order -- <i>not</i> code point order.
      * @hide unsupported on Android
      */
 
@@ -4525,7 +4673,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
      * Utility to compare a string to a code point.
      * Same results as turning the code point into a string and comparing, but much faster (no object creation).
      * Actually, there is one difference; a null compares as less.
-     * Note that this (=String) order is UTF-16 order -- *not* code point order.
+     * Note that this (=String) order is UTF-16 order -- <i>not</i> code point order.
      * @hide unsupported on Android
      */
     public static int compare(int codePoint, CharSequence string) {