Snap for 10117275 from a0c400f4c7f34b2e92b61c672fa3c483387fcb4a to udc-d1-releaseandroid-14.0.0_r9 android-14.0.0_r8 android-14.0.0_r7 android-14.0.0_r6 android-14.0.0_r5 android-14.0.0_r4 android-14.0.0_r3 android-14.0.0_r12 android-14.0.0_r11 android-14.0.0_r10 android14-d1-s7-release android14-d1-s6-release android14-d1-s5-release android14-d1-s4-release android14-d1-s3-release android14-d1-s2-release android14-d1-s1-release android14-d1-release

Change-Id: Ibe21e9b4bc0837b0a1a991f3763143b04440035a
author: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> 2023-05-11 23:23:03 +0000
committer: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> 2023-05-11 23:23:03 +0000
commit: b44defbff537fa99314e89d609276b849dd55a38 (patch)
tree: c2bbaabcdf91cd5359247e70a9fd8af5c355507d
parent: 137290a2b6df7c6223b29d1a0afb0f4ba8026d70 (diff)
parent: a0c400f4c7f34b2e92b61c672fa3c483387fcb4a (diff)
download: minikin-android14-d1-s2-release.tar.gz
3 files changed, 26 insertions, 57 deletions
diff --git a/include/minikin/Hyphenator.h b/include/minikin/Hyphenator.h
index 47c1648..da265f0 100644
--- a/include/minikin/Hyphenator.h
+++ b/include/minikin/Hyphenator.h
@@ -23,7 +23,6 @@
 
 #include <string>
 #include <vector>
-#include <unicode/uscript.h>
 
 #include "minikin/Characters.h"
 #include "minikin/U16StringPiece.h"
@@ -64,12 +63,7 @@ enum class HyphenationType : uint8_t {
     // Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line.
     // This is used in Arabic script, mostly for writing systems of Central Asia. It's our default
     // behavior when a soft hyphen is used in Arabic script.
-    BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8,
-    // Break the line, and insert hyphen at current line and the beginning of the next line. It's
-    // very similar to BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE. BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE
-    // is for the words which included hyphenator but this is used to actively hyphenate words
-    // based on the pattern.
-    BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE = 9
+    BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8
 };
 
 // The hyphen edit represents an edit to the string when a word is hyphenated.
@@ -236,11 +230,6 @@ private:
     void hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue,
                             HyphenationType* out) const;
 
-    HyphenationType hyphenationTypeBasedOnScriptAndLocale(uint32_t codePoint) const;
-
-    bool isRepeatHyphen(UScriptCode script,
-                        HyphenationLocale locale) const;
-
     // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
     // that temporary buffers can be stack-allocated without waste, which is a slightly
     // different use case. It measures UTF-16 code units.
diff --git a/libs/minikin/Hyphenator.cpp b/libs/minikin/Hyphenator.cpp
index 098e6e8..c755a87 100644
--- a/libs/minikin/Hyphenator.cpp
+++ b/libs/minikin/Hyphenator.cpp
@@ -158,7 +158,6 @@ bool Hyphenator::isLineBreakingHyphen(uint32_t c) {
 EndHyphenEdit editForThisLine(HyphenationType type) {
     switch (type) {
         case HyphenationType::BREAK_AND_INSERT_HYPHEN:
-        case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE:
             return EndHyphenEdit::INSERT_HYPHEN;
         case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
             return EndHyphenEdit::INSERT_ARMENIAN_HYPHEN;
@@ -179,7 +178,6 @@ EndHyphenEdit editForThisLine(HyphenationType type) {
 StartHyphenEdit editForNextLine(HyphenationType type) {
     switch (type) {
         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
-        case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE:
             return StartHyphenEdit::INSERT_HYPHEN;
         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
             return StartHyphenEdit::INSERT_ZWJ;
@@ -199,6 +197,25 @@ static UScriptCode getScript(uint32_t codePoint) {
     }
 }
 
+static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
+    // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
+    // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
+    // for now to be safe.  BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
+    // to maqaf for Hebrew, we can simply add a condition here.
+    const UScriptCode script = getScript(codePoint);
+    if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL ||
+        script == USCRIPT_TELUGU) {
+        // Grantha is not included, since we don't support non-BMP hyphenation yet.
+        return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
+    } else if (script == USCRIPT_ARMENIAN) {
+        return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
+    } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
+        return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
+    } else {
+        return HyphenationType::BREAK_AND_INSERT_HYPHEN;
+    }
+}
+
 static inline int32_t getJoiningType(UChar32 codepoint) {
     return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
 }
@@ -227,27 +244,6 @@ static inline HyphenationType getHyphTypeForArabic(const U16StringPiece& word, s
     return HyphenationType::BREAK_AND_INSERT_HYPHEN;
 }
 
-HyphenationType Hyphenator::hyphenationTypeBasedOnScriptAndLocale(uint32_t codePoint) const {
-    // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
-    // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
-    // for now to be safe.  BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
-    // to maqaf for Hebrew, we can simply add a condition here.
-    const UScriptCode script = getScript(codePoint);
-    if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL ||
-        script == USCRIPT_TELUGU) {
-        // Grantha is not included, since we don't support non-BMP hyphenation yet.
-        return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
-    } else if (script == USCRIPT_ARMENIAN) {
-        return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
-    } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
-        return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
-    } else if (isRepeatHyphen(script, mHyphenationLocale)) {
-        return HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE;
-    } else {
-        return HyphenationType::BREAK_AND_INSERT_HYPHEN;
-    }
-}
-
 // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words
 // that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections
 // 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen).
@@ -259,7 +255,9 @@ void Hyphenator::hyphenateWithNoPatterns(const U16StringPiece& word, Hyphenation
             // Break after hyphens, but only if they don't start the word.
 
             if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) &&
-                isRepeatHyphen(getScript(word[i]), mHyphenationLocale)) {
+                (mHyphenationLocale == HyphenationLocale::POLISH ||
+                 mHyphenationLocale == HyphenationLocale::SLOVENIAN) &&
+                getScript(word[i]) == USCRIPT_LATIN) {
                 // In Polish and Slovenian, hyphens get repeated at the next line. To be safe,
                 // we will do this only if the next character is Latin.
                 out[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
@@ -275,7 +273,7 @@ void Hyphenator::hyphenateWithNoPatterns(const U16StringPiece& word, Hyphenation
                 // actually join. If they don't, we'll just insert a normal hyphen.
                 out[i] = getHyphTypeForArabic(word, i);
             } else {
-                out[i] = hyphenationTypeBasedOnScriptAndLocale(word[i]);
+                out[i] = hyphenationTypeBasedOnScript(word[i]);
             }
         } else if (prevChar == CHAR_MIDDLE_DOT && mMinPrefix < i && i <= word.size() - mMinSuffix &&
                    ((word[i - 2] == 'l' && word[i] == 'l') ||
@@ -311,7 +309,7 @@ HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes,
                 return HyphenationType::DONT_BREAK;
             }
             if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
-                result = hyphenationTypeBasedOnScriptAndLocale(c);
+                result = hyphenationTypeBasedOnScript(c);
             }
             alpha_codes[i + 1] = code;
         }
@@ -334,7 +332,7 @@ HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes,
                 return HyphenationType::DONT_BREAK;
             }
             if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
-                result = hyphenationTypeBasedOnScriptAndLocale(c);
+                result = hyphenationTypeBasedOnScript(c);
             }
             alpha_codes[i + 1] = AlphabetTable1::value(entry);
         }
@@ -400,10 +398,4 @@ void Hyphenator::hyphenateFromCodes(const uint16_t* codes, size_t len, Hyphenati
     }
 }
 
-bool Hyphenator::isRepeatHyphen(UScriptCode script,
-                                HyphenationLocale locale) const {
-    return script == USCRIPT_LATIN &&
-            (locale == HyphenationLocale::POLISH ||locale == HyphenationLocale::SLOVENIAN);
-}
-
 }  // namespace minikin
diff --git a/tests/unittest/HyphenatorTest.cpp b/tests/unittest/HyphenatorTest.cpp
index 5936146..14c3ea3 100644
--- a/tests/unittest/HyphenatorTest.cpp
+++ b/tests/unittest/HyphenatorTest.cpp
@@ -121,18 +121,6 @@ TEST(HyphenatorTest, polishEnDash) {
     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
 }
 
-// If there is a soft hyphen in Polish, the hyphen should be repeated on the next line.
-TEST(HyphenatorTest, polishSoftHyphen) {
-    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "pl");
-    const uint16_t word[] = {'x', SOFT_HYPHEN, 'y'};
-    std::vector<HyphenationType> result;
-    hyphenator->hyphenate(word, &result);
-    EXPECT_EQ((size_t)3, result.size());
-    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
-    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
-    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE, result[2]);
-}
-
 // If we break on a hyphen in Slovenian, the hyphen should be repeated on the next line. (Same as
 // Polish.)
 TEST(HyphenatorTest, slovenianHyphen) {
author	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	2023-05-11 23:23:03 +0000
committer	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	2023-05-11 23:23:03 +0000
commit	b44defbff537fa99314e89d609276b849dd55a38 (patch)
tree	c2bbaabcdf91cd5359247e70a9fd8af5c355507d
parent	137290a2b6df7c6223b29d1a0afb0f4ba8026d70 (diff)
parent	a0c400f4c7f34b2e92b61c672fa3c483387fcb4a (diff)
download	minikin-android14-d1-s2-release.tar.gz