diff options
Diffstat (limited to 'android/text/Hyphenator.java')
-rw-r--r-- | android/text/Hyphenator.java | 251 |
1 files changed, 247 insertions, 4 deletions
package android.text;

import android.annotation.IntRange;
import android.annotation.NonNull;
import android.annotation.Nullable;
import android.system.ErrnoException;
import android.system.Os;
import android.system.OsConstants;
import android.util.Log;

import com.android.internal.annotations.GuardedBy;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.HashMap;
import java.util.Locale;

/**
 * Hyphenator is a wrapper class for a native implementation of automatic hyphenation,
 * in essence finding valid hyphenation opportunities in a word.
 *
 * <p>Instances are kept in a static, locale-keyed map that is populated by {@link #init()} and
 * extended lazily by {@link #get(Locale)} whenever a locale has to be resolved through one of
 * the fallback rules. All access to that map is serialized on a single private lock.
 *
 * @hide
 */
public class Hyphenator {
    private static final String TAG = "Hyphenator";

    private static final Object sLock = new Object();

    @GuardedBy("sLock")
    static final HashMap<Locale, Hyphenator> sMap = new HashMap<>();

    private final long mNativePtr;
    private final HyphenationData mData;

    private Hyphenator(long nativePtr, HyphenationData data) {
        mNativePtr = nativePtr;
        mData = data;
    }

    /**
     * Returns the handle produced by the native builder when this instance was created.
     */
    public long getNativePtr() {
        return mNativePtr;
    }

    /**
     * Looks up the hyphenator for {@code locale}, registering an alias on a cache miss.
     *
     * <p>Lookup order: exact match, then language+variant (only when the locale has a variant),
     * then language only, then script only (as {@code und-<script>}, only when the locale has a
     * script). The first hit is stored under {@code locale} so later lookups match directly.
     * If nothing matches, an alias backed by the empty pattern data is stored and returned.
     *
     * @param locale the locale to resolve; {@code null} is treated as {@link Locale#ROOT}
     * @return the hyphenator registered for the locale
     */
    public static Hyphenator get(@Nullable Locale locale) {
        // The parameter is declared @Nullable, but the fallback chain below dereferences it.
        // Normalize up front so a null locale resolves like the root locale instead of throwing.
        if (locale == null) {
            locale = Locale.ROOT;
        }

        synchronized (sLock) {
            Hyphenator result = sMap.get(locale);
            if (result != null) {
                return result;
            }

            // If there's a variant, fall back to language+variant only, if available
            final String variant = locale.getVariant();
            if (!variant.isEmpty()) {
                final Locale languageAndVariantOnlyLocale =
                        new Locale(locale.getLanguage(), "", variant);
                result = sMap.get(languageAndVariantOnlyLocale);
                if (result != null) {
                    return putAlias(locale, result);
                }
            }

            // Fall back to language-only, if available
            final Locale languageOnlyLocale = new Locale(locale.getLanguage());
            result = sMap.get(languageOnlyLocale);
            if (result != null) {
                return putAlias(locale, result);
            }

            // Fall back to script-only, if available
            final String script = locale.getScript();
            if (!script.isEmpty()) {
                final Locale scriptOnlyLocale = new Locale.Builder()
                        .setLanguage("und")
                        .setScript(script)
                        .build();
                result = sMap.get(scriptOnlyLocale);
                if (result != null) {
                    return putAlias(locale, result);
                }
            }

            return putEmptyAlias(locale);
        }
    }

    /**
     * Pattern data for one language: the minimum prefix/suffix lengths plus the address of the
     * memory-mapped pattern file, or 0 when no pattern file could be mapped.
     */
    private static class HyphenationData {
        private static final String SYSTEM_HYPHENATOR_LOCATION = "/system/usr/hyphen-data";

        public final int mMinPrefix, mMinSuffix;
        public final long mDataAddress;

        // Reasonable enough values for cases where we have no hyphenation patterns but may be
        // able to do some automatic hyphenation based on characters. These values would be used
        // very rarely.
        private static final int DEFAULT_MIN_PREFIX = 2;
        private static final int DEFAULT_MIN_SUFFIX = 2;

        public static final HyphenationData sEmptyData =
                new HyphenationData(DEFAULT_MIN_PREFIX, DEFAULT_MIN_SUFFIX);

        // Create empty HyphenationData.
        private HyphenationData(int minPrefix, int minSuffix) {
            mMinPrefix = minPrefix;
            mMinSuffix = minSuffix;
            mDataAddress = 0;
        }

        /**
         * Creates data for {@code languageTag} by mapping {@code hyph-<lowercased tag>.hyb} from
         * the system hyphenation directory. A missing, unreadable or unmappable file is logged
         * and leaves the data address at 0.
         */
        HyphenationData(String languageTag, int minPrefix, int minSuffix) {
            mMinPrefix = minPrefix;
            mMinSuffix = minSuffix;

            final String patternFilename = "hyph-" + languageTag.toLowerCase(Locale.US) + ".hyb";
            final File patternFile = new File(SYSTEM_HYPHENATOR_LOCATION, patternFilename);
            if (!patternFile.canRead()) {
                Log.e(TAG, "hyphenation patterns for " + patternFile + " not found or unreadable");
                mDataAddress = 0;
            } else {
                mDataAddress = mapPatternFile(patternFile);
            }
        }

        // Maps the whole file read-only and returns the mapping address, or 0 on failure.
        private static long mapPatternFile(File patternFile) {
            try (RandomAccessFile f = new RandomAccessFile(patternFile, "r")) {
                return Os.mmap(0, f.length(), OsConstants.PROT_READ,
                        OsConstants.MAP_SHARED, f.getFD(), 0 /* offset */);
            } catch (IOException | ErrnoException e) {
                Log.e(TAG, "error loading hyphenation " + patternFile, e);
                return 0;
            }
        }
    }

    // Caller must acquire sLock before calling this method.
    // Builds a native hyphenator for the locale from the given data and registers it in sMap.
    private static Hyphenator putNewHyphenator(Locale loc, HyphenationData data) {
        final long nativePtr = nBuildHyphenator(
                data.mDataAddress, loc.getLanguage(), data.mMinPrefix, data.mMinSuffix);
        final Hyphenator hyphenator = new Hyphenator(nativePtr, data);
        sMap.put(loc, hyphenator);
        return hyphenator;
    }

    // Do not call this method outside of init method.
    // Caller must acquire sLock before calling this method.
    private static void loadData(String langTag, int minPrefix, int minSuffix) {
        final HyphenationData data = new HyphenationData(langTag, minPrefix, minSuffix);
        putNewHyphenator(Locale.forLanguageTag(langTag), data);
    }

    // Caller must acquire sLock before calling this method.
    // The Hyphenator for the baseLangTag must exist.
    private static Hyphenator addAliasByTag(String langTag, String baseLangTag) {
        return putAlias(Locale.forLanguageTag(langTag),
                sMap.get(Locale.forLanguageTag(baseLangTag)));
    }

    // Caller must acquire sLock before calling this method.
    private static Hyphenator putAlias(Locale locale, Hyphenator base) {
        return putNewHyphenator(locale, base.mData);
    }

    // Caller must acquire sLock before calling this method.
    private static Hyphenator putEmptyAlias(Locale locale) {
        return putNewHyphenator(locale, HyphenationData.sEmptyData);
    }

    // TODO: Confirm that these are the best values. Various sources suggest (1, 1), but
    // that appears too small.
    private static final int INDIC_MIN_PREFIX = 2;
    private static final int INDIC_MIN_SUFFIX = 2;

    /**
     * Load hyphenation patterns at initialization time. We want to have patterns
     * for all locales loaded and ready to use so we don't have to do any file IO
     * on the UI thread when drawing text in different locales.
     *
     * @hide
     */
    public static void init() {
        synchronized (sLock) {
            // Retained from the original. get() normalizes a null locale before it touches the
            // map, so it never reads this entry.
            sMap.put(null, null);

            loadData("as", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Assamese
            loadData("bg", 2, 2); // Bulgarian
            loadData("bn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Bengali
            loadData("cu", 1, 2); // Church Slavonic
            loadData("cy", 2, 3); // Welsh
            loadData("da", 2, 2); // Danish
            loadData("de-1901", 2, 2); // German 1901 orthography
            loadData("de-1996", 2, 2); // German 1996 orthography
            loadData("de-CH-1901", 2, 2); // Swiss High German 1901 orthography
            loadData("en-GB", 2, 3); // British English
            loadData("en-US", 2, 3); // American English
            loadData("es", 2, 2); // Spanish
            loadData("et", 2, 3); // Estonian
            loadData("eu", 2, 2); // Basque
            loadData("fr", 2, 3); // French
            loadData("ga", 2, 3); // Irish
            loadData("gu", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Gujarati
            loadData("hi", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Hindi
            loadData("hr", 2, 2); // Croatian
            loadData("hu", 2, 2); // Hungarian
            // texhyphen sources say Armenian may be (1, 2); but that it needs confirmation.
            // Going with a more conservative value of (2, 2) for now.
            loadData("hy", 2, 2); // Armenian
            loadData("kn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Kannada
            loadData("ml", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Malayalam
            loadData("mn-Cyrl", 2, 2); // Mongolian in Cyrillic script
            loadData("mr", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Marathi
            loadData("nb", 2, 2); // Norwegian Bokmål
            loadData("nn", 2, 2); // Norwegian Nynorsk
            loadData("or", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Oriya
            loadData("pa", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Punjabi
            loadData("pt", 2, 3); // Portuguese
            loadData("sl", 2, 2); // Slovenian
            loadData("ta", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Tamil
            loadData("te", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Telugu
            loadData("tk", 2, 2); // Turkmen
            loadData("und-Ethi", 1, 1); // Any language in Ethiopic script

            // English locales that fall back to en-US. The data is
            // from CLDR. It's all English locales, minus the locales whose
            // parent is en-001 (from supplementalData.xml, under <parentLocales>).
            // TODO: Figure out how to get this from ICU.
            addAliasByTag("en-AS", "en-US"); // English (American Samoa)
            addAliasByTag("en-GU", "en-US"); // English (Guam)
            addAliasByTag("en-MH", "en-US"); // English (Marshall Islands)
            addAliasByTag("en-MP", "en-US"); // English (Northern Mariana Islands)
            addAliasByTag("en-PR", "en-US"); // English (Puerto Rico)
            addAliasByTag("en-UM", "en-US"); // English (United States Minor Outlying Islands)
            addAliasByTag("en-VI", "en-US"); // English (Virgin Islands)

            // All English locales other than those falling back to en-US are mapped to en-GB.
            addAliasByTag("en", "en-GB");

            // For German, we're assuming the 1996 (and later) orthography by default.
            addAliasByTag("de", "de-1996");
            // Liechtenstein uses the Swiss hyphenation rules for the 1901 orthography.
            addAliasByTag("de-LI-1901", "de-CH-1901");

            // Norwegian is very probably Norwegian Bokmål.
            addAliasByTag("no", "nb");

            // Use mn-Cyrl. According to CLDR's likelySubtags.xml, mn is most likely to be mn-Cyrl.
            addAliasByTag("mn", "mn-Cyrl"); // Mongolian

            // Fall back to Ethiopic script for languages likely to be written in Ethiopic.
            // Data is from CLDR's likelySubtags.xml.
            // TODO: Convert this to a mechanism using ICU4J's ULocale#addLikelySubtags().
            addAliasByTag("am", "und-Ethi"); // Amharic
            addAliasByTag("byn", "und-Ethi"); // Blin
            addAliasByTag("gez", "und-Ethi"); // Geʻez
            addAliasByTag("ti", "und-Ethi"); // Tigrinya
            addAliasByTag("wal", "und-Ethi"); // Wolaytta
        }
    }

    private static native long nBuildHyphenator(/* non-zero */ long dataAddress,
            @NonNull String langTag, @IntRange(from = 1) int minPrefix,
            @IntRange(from = 1) int minSuffix);
}