icu4j/main/classes/core/src/com/ibm/icu/impl/StaticUnicodeSets.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259

// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl;

import static com.ibm.icu.impl.number.parse.ParsingUtils.safeContains;

import java.util.EnumMap;
import java.util.Map;

import com.ibm.icu.impl.UResource.Value;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.UResourceBundle;

/**
 * This class statically initializes UnicodeSets, originally built for number parsing. Microbenchmarks
 * show this to bring a very sizeable performance boost.
 *
 * IMPORTANT ASSUMPTION FOR NUMBER PARSING: All of the sets contain code points (no strings) and they are
 * all case-folded. If this assumption were ever broken, logic in classes such as SymbolMatcher would
 * need to be updated in order to return well-formed sets upon calls to getLeadCodePoints().
 *
 * @author sffc
 */
public class StaticUnicodeSets {
    public static enum Key {
        // Ignorables
        DEFAULT_IGNORABLES,
        STRICT_IGNORABLES,

        // Separators
        // Notes:
        // - COMMA is a superset of STRICT_COMMA
        // - PERIOD is a superset of SCRICT_PERIOD
        // - ALL_SEPARATORS is the union of COMMA, PERIOD, and OTHER_GROUPING_SEPARATORS
        // - STRICT_ALL_SEPARATORS is the union of STRICT_COMMA, STRICT_PERIOD, and OTHER_GRP_SEPARATORS
        COMMA,
        PERIOD,
        STRICT_COMMA,
        STRICT_PERIOD,
        OTHER_GROUPING_SEPARATORS,
        ALL_SEPARATORS,
        STRICT_ALL_SEPARATORS,

        // Symbols
        // TODO: NaN?
        MINUS_SIGN,
        PLUS_SIGN,
        PERCENT_SIGN,
        PERMILLE_SIGN,
        INFINITY,

        // Currency Symbols
        DOLLAR_SIGN,
        POUND_SIGN,
        RUPEE_SIGN,
        YEN_SIGN, // not in CLDR data, but Currency.java wants it

        // Other
        DIGITS,

        // Combined Separators with Digits (for lead code points)
        DIGITS_OR_ALL_SEPARATORS,
        DIGITS_OR_STRICT_ALL_SEPARATORS,
    };

    private static final Map<Key, UnicodeSet> unicodeSets = new EnumMap<Key, UnicodeSet>(Key.class);

    /**
     * Gets the static-allocated UnicodeSet according to the provided key.
     *
     * @param key
     *            The desired UnicodeSet according to the enum in this file.
     * @return The requested UnicodeSet. Guaranteed to be frozen and non-null, but may be empty if an
     *         error occurred during data loading.
     */
    public static UnicodeSet get(Key key) {
        UnicodeSet candidate = unicodeSets.get(key);
        if (candidate == null) {
            return UnicodeSet.EMPTY;
        }
        return candidate;
    }

    /**
     * Checks if the UnicodeSet given by key1 contains the given string.
     *
     * @param str
     *            The string to check.
     * @param key1
     *            The set to check.
     * @return key1 if the set contains str, or COUNT if not.
     */
    public static Key chooseFrom(String str, Key key1) {
        return safeContains(get(key1), str) ? key1 : null;
    }

    /**
     * Checks if the UnicodeSet given by either key1 or key2 contains the string.
     *
     * Exported as U_COMMON_API for numparse_decimal.cpp
     *
     * @param str
     *            The string to check.
     * @param key1
     *            The first set to check.
     * @param key2
     *            The second set to check.
     * @return key1 if that set contains str; key2 if that set contains str; or COUNT if neither set
     *         contains str.
     */
    public static Key chooseFrom(String str, Key key1, Key key2) {
        return safeContains(get(key1), str) ? key1 : chooseFrom(str, key2);
    }

    /**
     * Looks through all Currency-related sets for the given string, returning the first match or null if
     * no match was round.
     */
    public static Key chooseCurrency(String str) {
        if (get(Key.DOLLAR_SIGN).contains(str)) {
            return Key.DOLLAR_SIGN;
        } else if (get(Key.POUND_SIGN).contains(str)) {
            return Key.POUND_SIGN;
        } else if (get(Key.RUPEE_SIGN).contains(str)) {
            return Key.RUPEE_SIGN;
        } else if (get(Key.YEN_SIGN).contains(str)) {
            return Key.YEN_SIGN;
        } else {
            return null;
        }
    }

    private static UnicodeSet computeUnion(Key k1, Key k2) {
        return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).freeze();
    }

    private static UnicodeSet computeUnion(Key k1, Key k2, Key k3) {
        return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).addAll(get(k3)).freeze();
    }

    private static void saveSet(Key key, String unicodeSetPattern) {
        assert unicodeSets.get(key) == null;
        unicodeSets.put(key, new UnicodeSet(unicodeSetPattern).freeze());
    }

    /*
    parse{
        date{
            lenient{
                "[\\--/]",
                "[\\:∶]",
            }
        }
        general{
            lenient{
                "[.․。︒﹒．｡]",
                "[\$﹩＄$]",
                "[£₤]",
                "[₨₹{Rp}{Rs}]",
            }
        }
        number{
            lenient{
                "[\\-‒⁻₋−➖﹣－]",
                "[,،٫、︐︑﹐﹑，､]",
                "[+⁺₊➕﬩﹢＋]",
            }
            stricter{
                "[,٫︐﹐，]",
                "[.․﹒．｡]",
            }
        }
    }
     */
    static class ParseDataSink extends UResource.Sink {
        @Override
        public void put(com.ibm.icu.impl.UResource.Key key, Value value, boolean noFallback) {
            UResource.Table contextsTable = value.getTable();
            for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
                if (key.contentEquals("date")) {
                    // ignore
                } else {
                    assert key.contentEquals("general") || key.contentEquals("number");
                    UResource.Table strictnessTable = value.getTable();
                    for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
                        boolean isLenient = key.contentEquals("lenient");
                        UResource.Array array = value.getArray();
                        for (int k = 0; k < array.getSize(); k++) {
                            array.getValue(k, value);
                            String str = value.toString();
                            // There is both lenient and strict data for comma/period,
                            // but not for any of the other symbols.
                            if (str.indexOf('.') != -1) {
                                saveSet(isLenient ? Key.PERIOD : Key.STRICT_PERIOD, str);
                            } else if (str.indexOf(',') != -1) {
                                saveSet(isLenient ? Key.COMMA : Key.STRICT_COMMA, str);
                            } else if (str.indexOf('+') != -1) {
                                saveSet(Key.PLUS_SIGN, str);
                            } else if (str.indexOf('‒') != -1) {
                                saveSet(Key.MINUS_SIGN, str);
                            } else if (str.indexOf('$') != -1) {
                                saveSet(Key.DOLLAR_SIGN, str);
                            } else if (str.indexOf('£') != -1) {
                                saveSet(Key.POUND_SIGN, str);
                            } else if (str.indexOf('₨') != -1) {
                                saveSet(Key.RUPEE_SIGN, str);
                            }
                        }
                    }
                }
            }
        }
    }

    static {
        // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
        // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
        unicodeSets.put(Key.DEFAULT_IGNORABLES,
                new UnicodeSet("[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]").freeze());
        unicodeSets.put(Key.STRICT_IGNORABLES, new UnicodeSet("[[:Bidi_Control:]]").freeze());

        // CLDR provides data for comma, period, minus sign, and plus sign.
        ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle
                .getBundleInstance(ICUData.ICU_BASE_NAME, ULocale.ROOT);
        rb.getAllItemsWithFallback("parse", new ParseDataSink());

        // NOTE: It is OK for these assertions to fail if there was a no-data build.
        assert unicodeSets.containsKey(Key.COMMA);
        assert unicodeSets.containsKey(Key.STRICT_COMMA);
        assert unicodeSets.containsKey(Key.PERIOD);
        assert unicodeSets.containsKey(Key.STRICT_PERIOD);

        unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS,
                new UnicodeSet("['٬‘’＇\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]").freeze());
        unicodeSets.put(Key.ALL_SEPARATORS,
                computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
        unicodeSets.put(Key.STRICT_ALL_SEPARATORS,
                computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));

        assert unicodeSets.containsKey(Key.MINUS_SIGN);
        assert unicodeSets.containsKey(Key.PLUS_SIGN);

        unicodeSets.put(Key.PERCENT_SIGN, new UnicodeSet("[%٪]").freeze());
        unicodeSets.put(Key.PERMILLE_SIGN, new UnicodeSet("[‰؉]").freeze());
        unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze());

        assert unicodeSets.containsKey(Key.DOLLAR_SIGN);
        assert unicodeSets.containsKey(Key.POUND_SIGN);
        assert unicodeSets.containsKey(Key.RUPEE_SIGN);
        unicodeSets.put(Key.YEN_SIGN, new UnicodeSet("[¥\\uffe5]").freeze());

        unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());

        unicodeSets.put(Key.DIGITS_OR_ALL_SEPARATORS, computeUnion(Key.DIGITS, Key.ALL_SEPARATORS));
        unicodeSets.put(Key.DIGITS_OR_STRICT_ALL_SEPARATORS,
                computeUnion(Key.DIGITS, Key.STRICT_ALL_SEPARATORS));
    }
}