summaryrefslogtreecommitdiff
path: root/android/text/Hyphenator.java
diff options
context:
space:
mode:
Diffstat (limited to 'android/text/Hyphenator.java')
-rw-r--r--android/text/Hyphenator.java251
1 files changed, 247 insertions, 4 deletions
diff --git a/android/text/Hyphenator.java b/android/text/Hyphenator.java
index 4f1488e1..ad26f23a 100644
--- a/android/text/Hyphenator.java
+++ b/android/text/Hyphenator.java
@@ -16,15 +16,258 @@
package android.text;
+import android.annotation.IntRange;
+import android.annotation.NonNull;
+import android.annotation.Nullable;
+import android.system.ErrnoException;
+import android.system.Os;
+import android.system.OsConstants;
+import android.util.Log;
+
+import com.android.internal.annotations.GuardedBy;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.util.HashMap;
+import java.util.Locale;
+
/**
- * Hyphenator just initializes the native implementation of automatic hyphenation,
+ * Hyphenator is a wrapper class for a native implementation of automatic hyphenation,
* in essence finding valid hyphenation opportunities in a word.
*
* @hide
*/
public class Hyphenator {
- public static void init() {
- nInit();
+ private static String TAG = "Hyphenator";
+
+ private final static Object sLock = new Object();
+
+ @GuardedBy("sLock")
+ final static HashMap<Locale, Hyphenator> sMap = new HashMap<Locale, Hyphenator>();
+
+ private final long mNativePtr;
+ private final HyphenationData mData;
+
+ private Hyphenator(long nativePtr, HyphenationData data) {
+ mNativePtr = nativePtr;
+ mData = data;
+ }
+
+ public long getNativePtr() {
+ return mNativePtr;
}
- private static native void nInit();
+
+ public static Hyphenator get(@Nullable Locale locale) {
+ synchronized (sLock) {
+ Hyphenator result = sMap.get(locale);
+ if (result != null) {
+ return result;
+ }
+
+ // If there's a variant, fall back to language+variant only, if available
+ final String variant = locale.getVariant();
+ if (!variant.isEmpty()) {
+ final Locale languageAndVariantOnlyLocale =
+ new Locale(locale.getLanguage(), "", variant);
+ result = sMap.get(languageAndVariantOnlyLocale);
+ if (result != null) {
+ return putAlias(locale, result);
+ }
+ }
+
+ // Fall back to language-only, if available
+ final Locale languageOnlyLocale = new Locale(locale.getLanguage());
+ result = sMap.get(languageOnlyLocale);
+ if (result != null) {
+ return putAlias(locale, result);
+ }
+
+ // Fall back to script-only, if available
+ final String script = locale.getScript();
+ if (!script.equals("")) {
+ final Locale scriptOnlyLocale = new Locale.Builder()
+ .setLanguage("und")
+ .setScript(script)
+ .build();
+ result = sMap.get(scriptOnlyLocale);
+ if (result != null) {
+ return putAlias(locale, result);
+ }
+ }
+
+ return putEmptyAlias(locale);
+ }
+ }
+
+ private static class HyphenationData {
+ private static final String SYSTEM_HYPHENATOR_LOCATION = "/system/usr/hyphen-data";
+
+ public final int mMinPrefix, mMinSuffix;
+ public final long mDataAddress;
+
+ // Reasonable enough values for cases where we have no hyphenation patterns but may be able
+ // to do some automatic hyphenation based on characters. These values would be used very
+ // rarely.
+ private static final int DEFAULT_MIN_PREFIX = 2;
+ private static final int DEFAULT_MIN_SUFFIX = 2;
+
+ public static final HyphenationData sEmptyData =
+ new HyphenationData(DEFAULT_MIN_PREFIX, DEFAULT_MIN_SUFFIX);
+
+ // Create empty HyphenationData.
+ private HyphenationData(int minPrefix, int minSuffix) {
+ mMinPrefix = minPrefix;
+ mMinSuffix = minSuffix;
+ mDataAddress = 0;
+ }
+
+ HyphenationData(String languageTag, int minPrefix, int minSuffix) {
+ mMinPrefix = minPrefix;
+ mMinSuffix = minSuffix;
+
+ final String patternFilename = "hyph-" + languageTag.toLowerCase(Locale.US) + ".hyb";
+ final File patternFile = new File(SYSTEM_HYPHENATOR_LOCATION, patternFilename);
+ if (!patternFile.canRead()) {
+ Log.e(TAG, "hyphenation patterns for " + patternFile + " not found or unreadable");
+ mDataAddress = 0;
+ } else {
+ long address;
+ try (RandomAccessFile f = new RandomAccessFile(patternFile, "r")) {
+ address = Os.mmap(0, f.length(), OsConstants.PROT_READ,
+ OsConstants.MAP_SHARED, f.getFD(), 0 /* offset */);
+ } catch (IOException | ErrnoException e) {
+ Log.e(TAG, "error loading hyphenation " + patternFile, e);
+ address = 0;
+ }
+ mDataAddress = address;
+ }
+ }
+ }
+
+ // Do not call this method outside of init method.
+ private static Hyphenator putNewHyphenator(Locale loc, HyphenationData data) {
+ final Hyphenator hyphenator = new Hyphenator(nBuildHyphenator(
+ data.mDataAddress, loc.getLanguage(), data.mMinPrefix, data.mMinSuffix), data);
+ sMap.put(loc, hyphenator);
+ return hyphenator;
+ }
+
+ // Do not call this method outside of init method.
+ private static void loadData(String langTag, int minPrefix, int maxPrefix) {
+ final HyphenationData data = new HyphenationData(langTag, minPrefix, maxPrefix);
+ putNewHyphenator(Locale.forLanguageTag(langTag), data);
+ }
+
+ // Caller must acquire sLock before calling this method.
+ // The Hyphenator for the baseLangTag must exists.
+ private static Hyphenator addAliasByTag(String langTag, String baseLangTag) {
+ return putAlias(Locale.forLanguageTag(langTag),
+ sMap.get(Locale.forLanguageTag(baseLangTag)));
+ }
+
+ // Caller must acquire sLock before calling this method.
+ private static Hyphenator putAlias(Locale locale, Hyphenator base) {
+ return putNewHyphenator(locale, base.mData);
+ }
+
+ // Caller must acquire sLock before calling this method.
+ private static Hyphenator putEmptyAlias(Locale locale) {
+ return putNewHyphenator(locale, HyphenationData.sEmptyData);
+ }
+
+ // TODO: Confirm that these are the best values. Various sources suggest (1, 1), but
+ // that appears too small.
+ private static final int INDIC_MIN_PREFIX = 2;
+ private static final int INDIC_MIN_SUFFIX = 2;
+
+ /**
+ * Load hyphenation patterns at initialization time. We want to have patterns
+ * for all locales loaded and ready to use so we don't have to do any file IO
+ * on the UI thread when drawing text in different locales.
+ *
+ * @hide
+ */
+ public static void init() {
+ synchronized (sLock) {
+ sMap.put(null, null);
+
+ loadData("as", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Assamese
+ loadData("bg", 2, 2); // Bulgarian
+ loadData("bn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Bengali
+ loadData("cu", 1, 2); // Church Slavonic
+ loadData("cy", 2, 3); // Welsh
+ loadData("da", 2, 2); // Danish
+ loadData("de-1901", 2, 2); // German 1901 orthography
+ loadData("de-1996", 2, 2); // German 1996 orthography
+ loadData("de-CH-1901", 2, 2); // Swiss High German 1901 orthography
+ loadData("en-GB", 2, 3); // British English
+ loadData("en-US", 2, 3); // American English
+ loadData("es", 2, 2); // Spanish
+ loadData("et", 2, 3); // Estonian
+ loadData("eu", 2, 2); // Basque
+ loadData("fr", 2, 3); // French
+ loadData("ga", 2, 3); // Irish
+ loadData("gu", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Gujarati
+ loadData("hi", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Hindi
+ loadData("hr", 2, 2); // Croatian
+ loadData("hu", 2, 2); // Hungarian
+ // texhyphen sources say Armenian may be (1, 2); but that it needs confirmation.
+ // Going with a more conservative value of (2, 2) for now.
+ loadData("hy", 2, 2); // Armenian
+ loadData("kn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Kannada
+ loadData("ml", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Malayalam
+ loadData("mn-Cyrl", 2, 2); // Mongolian in Cyrillic script
+ loadData("mr", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Marathi
+ loadData("nb", 2, 2); // Norwegian Bokmål
+ loadData("nn", 2, 2); // Norwegian Nynorsk
+ loadData("or", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Oriya
+ loadData("pa", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Punjabi
+ loadData("pt", 2, 3); // Portuguese
+ loadData("sl", 2, 2); // Slovenian
+ loadData("ta", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Tamil
+ loadData("te", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Telugu
+ loadData("tk", 2, 2); // Turkmen
+ loadData("und-Ethi", 1, 1); // Any language in Ethiopic script
+
+ // English locales that fall back to en-US. The data is
+ // from CLDR. It's all English locales, minus the locales whose
+ // parent is en-001 (from supplementalData.xml, under <parentLocales>).
+ // TODO: Figure out how to get this from ICU.
+ addAliasByTag("en-AS", "en-US"); // English (American Samoa)
+ addAliasByTag("en-GU", "en-US"); // English (Guam)
+ addAliasByTag("en-MH", "en-US"); // English (Marshall Islands)
+ addAliasByTag("en-MP", "en-US"); // English (Northern Mariana Islands)
+ addAliasByTag("en-PR", "en-US"); // English (Puerto Rico)
+ addAliasByTag("en-UM", "en-US"); // English (United States Minor Outlying Islands)
+ addAliasByTag("en-VI", "en-US"); // English (Virgin Islands)
+
+ // All English locales other than those falling back to en-US are mapped to en-GB.
+ addAliasByTag("en", "en-GB");
+
+ // For German, we're assuming the 1996 (and later) orthography by default.
+ addAliasByTag("de", "de-1996");
+ // Liechtenstein uses the Swiss hyphenation rules for the 1901 orthography.
+ addAliasByTag("de-LI-1901", "de-CH-1901");
+
+ // Norwegian is very probably Norwegian Bokmål.
+ addAliasByTag("no", "nb");
+
+ // Use mn-Cyrl. According to CLDR's likelySubtags.xml, mn is most likely to be mn-Cyrl.
+ addAliasByTag("mn", "mn-Cyrl"); // Mongolian
+
+ // Fall back to Ethiopic script for languages likely to be written in Ethiopic.
+ // Data is from CLDR's likelySubtags.xml.
+ // TODO: Convert this to a mechanism using ICU4J's ULocale#addLikelySubtags().
+ addAliasByTag("am", "und-Ethi"); // Amharic
+ addAliasByTag("byn", "und-Ethi"); // Blin
+ addAliasByTag("gez", "und-Ethi"); // Geʻez
+ addAliasByTag("ti", "und-Ethi"); // Tigrinya
+ addAliasByTag("wal", "und-Ethi"); // Wolaytta
+ }
+ };
+
+ private static native long nBuildHyphenator(/* non-zero */ long dataAddress,
+ @NonNull String langTag, @IntRange(from = 1) int minPrefix,
+ @IntRange(from = 1) int minSuffix);
}