summaryrefslogtreecommitdiff
path: root/android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java
diff options
context:
space:
mode:
Diffstat (limited to 'android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java')
-rw-r--r--android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java595
1 files changed, 595 insertions, 0 deletions
diff --git a/android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java b/android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java
new file mode 100644
index 000000000..6c4b699c9
--- /dev/null
+++ b/android_icu4j/src/main/java/android/icu/impl/locale/LikelySubtags.java
@@ -0,0 +1,595 @@
+/* GENERATED SOURCE. DO NOT MODIFY. */
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package android.icu.impl.locale;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.MissingResourceException;
+import java.util.TreeMap;
+
+import android.icu.impl.ICUData;
+import android.icu.impl.ICUResourceBundle;
+import android.icu.impl.UResource;
+import android.icu.util.BytesTrie;
+import android.icu.util.Region;
+import android.icu.util.ULocale;
+
+/**
+ * @hide Only a subset of ICU is exposed in Android
+ */
+public final class LikelySubtags {
+ private static final String PSEUDO_ACCENTS_PREFIX = "'"; // -XA, -PSACCENT
+ private static final String PSEUDO_BIDI_PREFIX = "+"; // -XB, -PSBIDI
+ private static final String PSEUDO_CRACKED_PREFIX = ","; // -XC, -PSCRACK
+
+ public static final int SKIP_SCRIPT = 1;
+
+ private static final boolean DEBUG_OUTPUT = LSR.DEBUG_OUTPUT;
+
+ // VisibleForTesting
+ /**
+ * @hide Only a subset of ICU is exposed in Android
+ */
+ public static final class Data {
+ public final Map<String, String> languageAliases;
+ public final Map<String, String> regionAliases;
+ public final byte[] trie;
+ public final LSR[] lsrs;
+
+ public Data(Map<String, String> languageAliases, Map<String, String> regionAliases,
+ byte[] trie, LSR[] lsrs) {
+ this.languageAliases = languageAliases;
+ this.regionAliases = regionAliases;
+ this.trie = trie;
+ this.lsrs = lsrs;
+ }
+
+ private static UResource.Value getValue(UResource.Table table,
+ String key, UResource.Value value) {
+ if (!table.findValue(key, value)) {
+ throw new MissingResourceException(
+ "langInfo.res missing data", "", "likely/" + key);
+ }
+ return value;
+ }
+
+ // VisibleForTesting
+ public static Data load() throws MissingResourceException {
+ ICUResourceBundle langInfo = ICUResourceBundle.getBundleInstance(
+ ICUData.ICU_BASE_NAME, "langInfo",
+ ICUResourceBundle.ICU_DATA_CLASS_LOADER, ICUResourceBundle.OpenType.DIRECT);
+ UResource.Value value = langInfo.getValueWithFallback("likely");
+ UResource.Table likelyTable = value.getTable();
+
+ Map<String, String> languageAliases;
+ if (likelyTable.findValue("languageAliases", value)) {
+ String[] pairs = value.getStringArray();
+ languageAliases = new HashMap<>(pairs.length / 2);
+ for (int i = 0; i < pairs.length; i += 2) {
+ languageAliases.put(pairs[i], pairs[i + 1]);
+ }
+ } else {
+ languageAliases = Collections.emptyMap();
+ }
+
+ Map<String, String> regionAliases;
+ if (likelyTable.findValue("regionAliases", value)) {
+ String[] pairs = value.getStringArray();
+ regionAliases = new HashMap<>(pairs.length / 2);
+ for (int i = 0; i < pairs.length; i += 2) {
+ regionAliases.put(pairs[i], pairs[i + 1]);
+ }
+ } else {
+ regionAliases = Collections.emptyMap();
+ }
+
+ ByteBuffer buffer = getValue(likelyTable, "trie", value).getBinary();
+ byte[] trie = new byte[buffer.remaining()];
+ buffer.get(trie);
+
+ String[] m49 = getValue(likelyTable, "m49", value).getStringArray();
+ LSR[] lsrs = LSR.decodeInts(getValue(likelyTable, "lsrnum", value).getIntVector(), m49);
+ return new Data(languageAliases, regionAliases, trie, lsrs);
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (this == other) { return true; }
+ if (other == null || !getClass().equals(other.getClass())) { return false; }
+ Data od = (Data)other;
+ return
+ languageAliases.equals(od.languageAliases) &&
+ regionAliases.equals(od.regionAliases) &&
+ Arrays.equals(trie, od.trie) &&
+ Arrays.equals(lsrs, od.lsrs);
+ }
+
+ @Override
+ public int hashCode() { // unused; silence ErrorProne
+ return 1;
+ }
+ }
+
+ // VisibleForTesting
+ public static final LikelySubtags INSTANCE = new LikelySubtags(Data.load());
+
+ private final Map<String, String> languageAliases;
+ private final Map<String, String> regionAliases;
+
+ // The trie maps each lang+script+region (encoded in ASCII) to an index into lsrs.
+ // There is also a trie value for each intermediate lang and lang+script.
+ // '*' is used instead of "und", "Zzzz"/"" and "ZZ"/"".
+ private final BytesTrie trie;
+ private final long trieUndState;
+ private final long trieUndZzzzState;
+ private final int defaultLsrIndex;
+ private final long[] trieFirstLetterStates = new long[26];
+ private final LSR[] lsrs;
+
+ private LikelySubtags(LikelySubtags.Data data) {
+ languageAliases = data.languageAliases;
+ regionAliases = data.regionAliases;
+ trie = new BytesTrie(data.trie, 0);
+ lsrs = data.lsrs;
+
+ // Cache the result of looking up language="und" encoded as "*", and "und-Zzzz" ("**").
+ BytesTrie.Result result = trie.next('*');
+ assert result.hasNext();
+ trieUndState = trie.getState64();
+ result = trie.next('*');
+ assert result.hasNext();
+ trieUndZzzzState = trie.getState64();
+ result = trie.next('*');
+ assert result.hasValue();
+ defaultLsrIndex = trie.getValue();
+ trie.reset();
+
+ for (char c = 'a'; c <= 'z'; ++c) {
+ result = trie.next(c);
+ if (result == BytesTrie.Result.NO_VALUE) {
+ trieFirstLetterStates[c - 'a'] = trie.getState64();
+ }
+ trie.reset();
+ }
+
+ if (DEBUG_OUTPUT) {
+ System.out.println("*** likely subtags");
+ for (Map.Entry<String, LSR> mapping : getTable().entrySet()) {
+ System.out.println(mapping);
+ }
+ }
+ }
+
+ /**
+ * Implementation of LocaleMatcher.canonicalize(ULocale).
+ */
+ public ULocale canonicalize(ULocale locale) {
+ String lang = locale.getLanguage();
+ String lang2 = languageAliases.get(lang);
+ String region = locale.getCountry();
+ String region2 = regionAliases.get(region);
+ if (lang2 != null || region2 != null) {
+ return new ULocale(
+ lang2 == null ? lang : lang2,
+ locale.getScript(),
+ region2 == null ? region : region2);
+ }
+ return locale;
+ }
+
+ private static String getCanonical(Map<String, String> aliases, String alias) {
+ String canonical = aliases.get(alias);
+ return canonical == null ? alias : canonical;
+ }
+
+ // VisibleForTesting
+ public LSR makeMaximizedLsrFrom(ULocale locale, boolean returnInputIfUnmatch) {
+ String name = locale.getName(); // Faster than .toLanguageTag().
+ if (name.startsWith("@x=")) {
+ String tag = locale.toLanguageTag();
+ assert tag.startsWith("und-x-");
+ // Private use language tag x-subtag-subtag... which CLDR changes to
+ // und-x-subtag-subtag...
+ return new LSR(tag, "", "", LSR.EXPLICIT_LSR);
+ }
+ LSR max = makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
+ locale.getVariant(), returnInputIfUnmatch);
+ if (max.language.isEmpty() && max.script.isEmpty() && max.region.isEmpty()) {
+ return new LSR(locale.getLanguage(), locale.getScript(), locale.getCountry(), LSR.EXPLICIT_LSR);
+ }
+ return max;
+ }
+
+ public LSR makeMaximizedLsrFrom(Locale locale) {
+ String tag = locale.toLanguageTag();
+ if (tag.startsWith("x-") || tag.startsWith("und-x-")) {
+ // Private use language tag x-subtag-subtag... which CLDR changes to
+ // und-x-subtag-subtag...
+ return new LSR(tag, "", "", LSR.EXPLICIT_LSR);
+ }
+ return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
+ locale.getVariant(), false);
+ }
+
+ private LSR makeMaximizedLsr(String language, String script, String region, String variant, boolean returnInputIfUnmatch) {
+ // Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
+ // They should match only themselves,
+ // not other locales with what looks like the same language and script subtags.
+ if (!returnInputIfUnmatch) {
+ if (region.length() == 2 && region.charAt(0) == 'X') {
+ switch (region.charAt(1)) {
+ case 'A':
+ return new LSR(PSEUDO_ACCENTS_PREFIX + language,
+ PSEUDO_ACCENTS_PREFIX + script, region, LSR.EXPLICIT_LSR);
+ case 'B':
+ return new LSR(PSEUDO_BIDI_PREFIX + language,
+ PSEUDO_BIDI_PREFIX + script, region, LSR.EXPLICIT_LSR);
+ case 'C':
+ return new LSR(PSEUDO_CRACKED_PREFIX + language,
+ PSEUDO_CRACKED_PREFIX + script, region, LSR.EXPLICIT_LSR);
+ default: // normal locale
+ break;
+ }
+ }
+
+ if (variant.startsWith("PS")) {
+ int lsrFlags = region.isEmpty() ?
+ LSR.EXPLICIT_LANGUAGE | LSR.EXPLICIT_SCRIPT : LSR.EXPLICIT_LSR;
+ switch (variant) {
+ case "PSACCENT":
+ return new LSR(PSEUDO_ACCENTS_PREFIX + language,
+ PSEUDO_ACCENTS_PREFIX + script,
+ region.isEmpty() ? "XA" : region, lsrFlags);
+ case "PSBIDI":
+ return new LSR(PSEUDO_BIDI_PREFIX + language,
+ PSEUDO_BIDI_PREFIX + script,
+ region.isEmpty() ? "XB" : region, lsrFlags);
+ case "PSCRACK":
+ return new LSR(PSEUDO_CRACKED_PREFIX + language,
+ PSEUDO_CRACKED_PREFIX + script,
+ region.isEmpty() ? "XC" : region, lsrFlags);
+ default: // normal locale
+ break;
+ }
+ }
+ }
+
+ language = getCanonical(languageAliases, language);
+ // (We have no script mappings.)
+ region = getCanonical(regionAliases, region);
+ return maximize(language, script, region, returnInputIfUnmatch);
+ }
+
+ /**
+ * Helper method to find out a region is a macroregion
+ */
+ private boolean isMacroregion(String region) {
+ Region.RegionType type = Region.getInstance(region).getType();
+ return type == Region.RegionType.WORLD ||
+ type == Region.RegionType.CONTINENT ||
+ type == Region.RegionType.SUBCONTINENT ;
+ }
+
+ /**
+ * Raw access to addLikelySubtags. Input must be in canonical format, eg "en", not "eng" or "EN".
+ */
+ private LSR maximize(String language, String script, String region, boolean returnInputIfUnmatch) {
+ if (language.equals("und")) {
+ language = "";
+ }
+ if (script.equals("Zzzz")) {
+ script = "";
+ }
+ if (region.equals("ZZ")) {
+ region = "";
+ }
+ if (!script.isEmpty() && !region.isEmpty() && !language.isEmpty()) {
+ return new LSR(language, script, region, LSR.EXPLICIT_LSR); // already maximized
+ }
+
+ boolean retainLanguage = false;
+ boolean retainScript = false;
+ boolean retainRegion = false;
+ BytesTrie iter = new BytesTrie(trie);
+ long state;
+ int value;
+ // Small optimization: Array lookup for first language letter.
+ int c0;
+ if (language.length() >= 2 && 0 <= (c0 = language.charAt(0) - 'a') && c0 <= 25 &&
+ (state = trieFirstLetterStates[c0]) != 0) {
+ value = trieNext(iter.resetToState64(state), language, 1);
+ } else {
+ value = trieNext(iter, language, 0);
+ }
+ boolean matchLanguage = (value >= 0);
+ boolean matchScript = false;
+ if (value >= 0) {
+ retainLanguage = ! language.isEmpty();
+ state = iter.getState64();
+ } else {
+ retainLanguage = true;
+ iter.resetToState64(trieUndState); // "und" ("*")
+ state = 0;
+ }
+
+ if (value >= 0 && !script.isEmpty()) {
+ matchScript = true;
+ }
+ if (value > 0) {
+ // Intermediate or final value from just language.
+ if (value == SKIP_SCRIPT) {
+ value = 0;
+ }
+ retainScript = ! script.isEmpty();
+ } else {
+ value = trieNext(iter, script, 0);
+ if (value >= 0) {
+ retainScript = ! script.isEmpty();
+ state = iter.getState64();
+ } else {
+ retainScript = true;
+ if (state == 0) {
+ iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
+ } else {
+ iter.resetToState64(state);
+ value = trieNext(iter, "", 0);
+ assert value >= 0;
+ state = iter.getState64();
+ }
+ }
+ }
+
+ boolean matchRegion = false;
+ if (value > 0) {
+ // Final value from just language or language+script.
+ retainRegion = ! region.isEmpty();
+ } else {
+ value = trieNext(iter, region, 0);
+ if (value >= 0) {
+ if (!region.isEmpty() && !isMacroregion(region)) {
+ retainRegion = true;
+ matchRegion = true;
+ }
+ } else {
+ retainRegion = true;
+ if (state == 0) {
+ value = defaultLsrIndex;
+ } else {
+ iter.resetToState64(state);
+ value = trieNext(iter, "", 0);
+ assert value > 0;
+ }
+ }
+ }
+ LSR result = lsrs[value];
+
+ if (returnInputIfUnmatch &&
+ (!(matchLanguage || matchScript || (matchRegion && language.isEmpty())))) {
+ return new LSR("", "", "", LSR.EXPLICIT_LSR); // no matching.
+ }
+ if (language.isEmpty()) {
+ language = "und";
+ }
+
+ if (! (retainLanguage || retainScript || retainRegion)) {
+ assert result.flags == LSR.IMPLICIT_LSR;
+ return result;
+ }
+ if (!retainLanguage) {
+ language = result.language;
+ }
+ if (!retainScript) {
+ script = result.script;
+ }
+ if (!retainRegion) {
+ region = result.region;
+ }
+ int retainMask = (retainLanguage ? 4 : 0) + (retainScript ? 2 : 0) + (retainRegion ? 1 : 0);
+ // retainOldMask flags = LSR explicit-subtag flags
+ return new LSR(language, script, region, retainMask);
+ }
+
+ /**
+ * Tests whether lsr is "more likely" than other.
+ * For example, fr-Latn-FR is more likely than fr-Latn-CH because
+ * FR is the default region for fr-Latn.
+ *
+ * <p>The likelyInfo caches lookup information between calls.
+ * The return value is an updated likelyInfo value,
+ * with bit 0 set if lsr is "more likely".
+ * The initial value of likelyInfo must be negative.
+ */
+ int compareLikely(LSR lsr, LSR other, int likelyInfo) {
+ // If likelyInfo >= 0:
+ // likelyInfo bit 1 is set if the previous comparison with lsr
+ // was for equal language and script.
+ // Otherwise the scripts differed.
+ if (!lsr.language.equals(other.language)) {
+ return 0xfffffffc; // negative, lsr not better than other
+ }
+ if (!lsr.script.equals(other.script)) {
+ int index;
+ if (likelyInfo >= 0 && (likelyInfo & 2) == 0) {
+ index = likelyInfo >> 2;
+ } else {
+ index = getLikelyIndex(lsr.language, "");
+ likelyInfo = index << 2;
+ }
+ LSR likely = lsrs[index];
+ if (lsr.script.equals(likely.script)) {
+ return likelyInfo | 1;
+ } else {
+ return likelyInfo & ~1;
+ }
+ }
+ if (!lsr.region.equals(other.region)) {
+ int index;
+ if (likelyInfo >= 0 && (likelyInfo & 2) != 0) {
+ index = likelyInfo >> 2;
+ } else {
+ index = getLikelyIndex(lsr.language, lsr.region);
+ likelyInfo = (index << 2) | 2;
+ }
+ LSR likely = lsrs[index];
+ if (lsr.region.equals(likely.region)) {
+ return likelyInfo | 1;
+ } else {
+ return likelyInfo & ~1;
+ }
+ }
+ return likelyInfo & ~1; // lsr not better than other
+ }
+
+ // Subset of maximize().
+ private int getLikelyIndex(String language, String script) {
+ if (language.equals("und")) {
+ language = "";
+ }
+ if (script.equals("Zzzz")) {
+ script = "";
+ }
+
+ BytesTrie iter = new BytesTrie(trie);
+ long state;
+ int value;
+ // Small optimization: Array lookup for first language letter.
+ int c0;
+ if (language.length() >= 2 && 0 <= (c0 = language.charAt(0) - 'a') && c0 <= 25 &&
+ (state = trieFirstLetterStates[c0]) != 0) {
+ value = trieNext(iter.resetToState64(state), language, 1);
+ } else {
+ value = trieNext(iter, language, 0);
+ }
+ if (value >= 0) {
+ state = iter.getState64();
+ } else {
+ iter.resetToState64(trieUndState); // "und" ("*")
+ state = 0;
+ }
+
+ if (value > 0) {
+ // Intermediate or final value from just language.
+ if (value == SKIP_SCRIPT) {
+ value = 0;
+ }
+ } else {
+ value = trieNext(iter, script, 0);
+ if (value >= 0) {
+ state = iter.getState64();
+ } else {
+ if (state == 0) {
+ iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
+ } else {
+ iter.resetToState64(state);
+ value = trieNext(iter, "", 0);
+ assert value >= 0;
+ state = iter.getState64();
+ }
+ }
+ }
+
+ if (value > 0) {
+ // Final value from just language or language+script.
+ } else {
+ value = trieNext(iter, "", 0);
+ assert value > 0;
+ }
+ return value;
+ }
+
+ private static final int trieNext(BytesTrie iter, String s, int i) {
+ BytesTrie.Result result;
+ if (s.isEmpty()) {
+ result = iter.next('*');
+ } else {
+ int end = s.length() - 1;
+ for (;; ++i) {
+ int c = s.charAt(i);
+ if (i < end) {
+ if (!iter.next(c).hasNext()) {
+ return -1;
+ }
+ } else {
+ // last character of this subtag
+ result = iter.next(c | 0x80);
+ break;
+ }
+ }
+ }
+ switch (result) {
+ case NO_MATCH: return -1;
+ case NO_VALUE: return 0;
+ case INTERMEDIATE_VALUE:
+ assert iter.getValue() == SKIP_SCRIPT;
+ return SKIP_SCRIPT;
+ case FINAL_VALUE: return iter.getValue();
+ default: return -1;
+ }
+ }
+
+ public LSR minimizeSubtags(String languageIn, String scriptIn, String regionIn,
+ ULocale.Minimize fieldToFavor) {
+ LSR max = maximize(languageIn, scriptIn, regionIn, true);
+ if (max.language.isEmpty() && max.region.isEmpty() && max.script.isEmpty()) {
+ // Cannot match, return as is
+ return new LSR(languageIn, scriptIn, regionIn, LSR.EXPLICIT_LSR);
+ }
+ LSR test = maximize(max.language, "", "", true);
+ if (test.isEquivalentTo(max)) {
+ return new LSR(max.language, "", "", LSR.DONT_CARE_FLAGS);
+ }
+ if (ULocale.Minimize.FAVOR_REGION == fieldToFavor) {
+ test = maximize(max.language, "", max.region, true);
+ if (test.isEquivalentTo(max)) {
+ return new LSR(max.language, "", max.region, LSR.DONT_CARE_FLAGS);
+ }
+ test = maximize(max.language, max.script, "", true);
+ if (test.isEquivalentTo(max)) {
+ return new LSR(max.language, max.script, "", LSR.DONT_CARE_FLAGS);
+ }
+ } else {
+ test = maximize(max.language, max.script, "", true);
+ if (test.isEquivalentTo(max)) {
+ return new LSR(max.language, max.script, "", LSR.DONT_CARE_FLAGS);
+ }
+ test = maximize(max.language, "", max.region, true);
+ if (test.isEquivalentTo(max)) {
+ return new LSR(max.language, "", max.region, LSR.DONT_CARE_FLAGS);
+ }
+ }
+ return new LSR(max.language, max.script, max.region, LSR.DONT_CARE_FLAGS);
+ }
+
+ private Map<String, LSR> getTable() {
+ Map<String, LSR> map = new TreeMap<>();
+ StringBuilder sb = new StringBuilder();
+ for (BytesTrie.Entry entry : trie) {
+ sb.setLength(0);
+ int length = entry.bytesLength();
+ for (int i = 0; i < length;) {
+ byte b = entry.byteAt(i++);
+ if (b == '*') {
+ sb.append("*-");
+ } else if (b >= 0) {
+ sb.append((char) b);
+ } else { // end of subtag
+ sb.append((char) (b & 0x7f)).append('-');
+ }
+ }
+ assert sb.length() > 0 && sb.charAt(sb.length() - 1) == '-';
+ sb.setLength(sb.length() - 1);
+ map.put(sb.toString(), lsrs[entry.value]);
+ }
+ return map;
+ }
+
+ @Override
+ public String toString() {
+ return getTable().toString();
+ }
+}