From 4920747bfd33712251b7a6cda4e6c2c827aeea97 Mon Sep 17 00:00:00 2001
From: Calvin Pan <calvinpan@google.com>
Date: Tue, 2 May 2023 08:01:08 +0000
Subject: Revert "Support pattern in non-standard hyphenation(Polish)"

This reverts commit d70a89113cc104c1d3866f775df90c8373c05463.

Reason for revert: In the past, we followed the linguist's suggestion to repeat the hyphen(b/238164187), but this is not true in Polish.

Bug: 278262732
Change-Id: I7eae75f9461daebeabf1f78fb2905412c108bc60
---
 include/minikin/Hyphenator.h      | 13 +--------
 libs/minikin/Hyphenator.cpp       | 58 +++++++++++++++++----------------------
 tests/unittest/HyphenatorTest.cpp | 12 --------
 3 files changed, 26 insertions(+), 57 deletions(-)
diff --git a/include/minikin/Hyphenator.h b/include/minikin/Hyphenator.h
index 47c1648..da265f0 100644
--- a/include/minikin/Hyphenator.h
+++ b/include/minikin/Hyphenator.h
@@ -23,7 +23,6 @@
 
 #include <string>
 #include <vector>
-#include <unicode/uscript.h>
 
 #include "minikin/Characters.h"
 #include "minikin/U16StringPiece.h"
@@ -64,12 +63,7 @@ enum class HyphenationType : uint8_t {
     // Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line.
     // This is used in Arabic script, mostly for writing systems of Central Asia. It's our default
     // behavior when a soft hyphen is used in Arabic script.
-    BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8,
-    // Break the line, and insert hyphen at current line and the beginning of the next line. It's
-    // very similar to BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE. BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE
-    // is for the words which included hyphenator but this is used to actively hyphenate words
-    // based on the pattern.
-    BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE = 9
+    BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8
 };
 
 // The hyphen edit represents an edit to the string when a word is hyphenated.
@@ -236,11 +230,6 @@ private:
     void hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue,
                             HyphenationType* out) const;
 
-    HyphenationType hyphenationTypeBasedOnScriptAndLocale(uint32_t codePoint) const;
-
-    bool isRepeatHyphen(UScriptCode script,
-                        HyphenationLocale locale) const;
-
     // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
     // that temporary buffers can be stack-allocated without waste, which is a slightly
     // different use case. It measures UTF-16 code units.
diff --git a/libs/minikin/Hyphenator.cpp b/libs/minikin/Hyphenator.cpp
index 098e6e8..c755a87 100644
--- a/libs/minikin/Hyphenator.cpp
+++ b/libs/minikin/Hyphenator.cpp
@@ -158,7 +158,6 @@ bool Hyphenator::isLineBreakingHyphen(uint32_t c) {
 EndHyphenEdit editForThisLine(HyphenationType type) {
     switch (type) {
         case HyphenationType::BREAK_AND_INSERT_HYPHEN:
-        case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE:
             return EndHyphenEdit::INSERT_HYPHEN;
         case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
             return EndHyphenEdit::INSERT_ARMENIAN_HYPHEN;
@@ -179,7 +178,6 @@ EndHyphenEdit editForThisLine(HyphenationType type) {
 StartHyphenEdit editForNextLine(HyphenationType type) {
     switch (type) {
         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
-        case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE:
             return StartHyphenEdit::INSERT_HYPHEN;
         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
             return StartHyphenEdit::INSERT_ZWJ;
@@ -199,6 +197,25 @@ static UScriptCode getScript(uint32_t codePoint) {
     }
 }
 
+static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
+    // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
+    // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
+    // for now to be safe.  BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
+    // to maqaf for Hebrew, we can simply add a condition here.
+    const UScriptCode script = getScript(codePoint);
+    if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL ||
+        script == USCRIPT_TELUGU) {
+        // Grantha is not included, since we don't support non-BMP hyphenation yet.
+        return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
+    } else if (script == USCRIPT_ARMENIAN) {
+        return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
+    } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
+        return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
+    } else {
+        return HyphenationType::BREAK_AND_INSERT_HYPHEN;
+    }
+}
+
 static inline int32_t getJoiningType(UChar32 codepoint) {
     return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
 }
@@ -227,27 +244,6 @@ static inline HyphenationType getHyphTypeForArabic(const U16StringPiece& word, s
     return HyphenationType::BREAK_AND_INSERT_HYPHEN;
 }
 
-HyphenationType Hyphenator::hyphenationTypeBasedOnScriptAndLocale(uint32_t codePoint) const {
-    // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
-    // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
-    // for now to be safe.  BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
-    // to maqaf for Hebrew, we can simply add a condition here.
-    const UScriptCode script = getScript(codePoint);
-    if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL ||
-        script == USCRIPT_TELUGU) {
-        // Grantha is not included, since we don't support non-BMP hyphenation yet.
-        return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
-    } else if (script == USCRIPT_ARMENIAN) {
-        return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
-    } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
-        return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
-    } else if (isRepeatHyphen(script, mHyphenationLocale)) {
-        return HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE;
-    } else {
-        return HyphenationType::BREAK_AND_INSERT_HYPHEN;
-    }
-}
-
 // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words
 // that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections
 // 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen).
@@ -259,7 +255,9 @@ void Hyphenator::hyphenateWithNoPatterns(const U16StringPiece& word, Hyphenation
             // Break after hyphens, but only if they don't start the word.
 
             if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) &&
-                isRepeatHyphen(getScript(word[i]), mHyphenationLocale)) {
+                (mHyphenationLocale == HyphenationLocale::POLISH ||
+                 mHyphenationLocale == HyphenationLocale::SLOVENIAN) &&
+                getScript(word[i]) == USCRIPT_LATIN) {
                 // In Polish and Slovenian, hyphens get repeated at the next line. To be safe,
                 // we will do this only if the next character is Latin.
                 out[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
@@ -275,7 +273,7 @@ void Hyphenator::hyphenateWithNoPatterns(const U16StringPiece& word, Hyphenation
                 // actually join. If they don't, we'll just insert a normal hyphen.
                 out[i] = getHyphTypeForArabic(word, i);
             } else {
-                out[i] = hyphenationTypeBasedOnScriptAndLocale(word[i]);
+                out[i] = hyphenationTypeBasedOnScript(word[i]);
             }
         } else if (prevChar == CHAR_MIDDLE_DOT && mMinPrefix < i && i <= word.size() - mMinSuffix &&
                    ((word[i - 2] == 'l' && word[i] == 'l') ||
@@ -311,7 +309,7 @@ HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes,
                 return HyphenationType::DONT_BREAK;
             }
             if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
-                result = hyphenationTypeBasedOnScriptAndLocale(c);
+                result = hyphenationTypeBasedOnScript(c);
             }
             alpha_codes[i + 1] = code;
         }
@@ -334,7 +332,7 @@ HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes,
                 return HyphenationType::DONT_BREAK;
             }
             if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
-                result = hyphenationTypeBasedOnScriptAndLocale(c);
+                result = hyphenationTypeBasedOnScript(c);
             }
             alpha_codes[i + 1] = AlphabetTable1::value(entry);
         }
@@ -400,10 +398,4 @@ void Hyphenator::hyphenateFromCodes(const uint16_t* codes, size_t len, Hyphenati
     }
 }
 
-bool Hyphenator::isRepeatHyphen(UScriptCode script,
-                                HyphenationLocale locale) const {
-    return script == USCRIPT_LATIN &&
-            (locale == HyphenationLocale::POLISH ||locale == HyphenationLocale::SLOVENIAN);
-}
-
 }  // namespace minikin
diff --git a/tests/unittest/HyphenatorTest.cpp b/tests/unittest/HyphenatorTest.cpp
index 5936146..14c3ea3 100644
--- a/tests/unittest/HyphenatorTest.cpp
+++ b/tests/unittest/HyphenatorTest.cpp
@@ -121,18 +121,6 @@ TEST(HyphenatorTest, polishEnDash) {
     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
 }
 
-// If there is a soft hyphen in Polish, the hyphen should be repeated on the next line.
-TEST(HyphenatorTest, polishSoftHyphen) {
-    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "pl");
-    const uint16_t word[] = {'x', SOFT_HYPHEN, 'y'};
-    std::vector<HyphenationType> result;
-    hyphenator->hyphenate(word, &result);
-    EXPECT_EQ((size_t)3, result.size());
-    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
-    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
-    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE, result[2]);
-}
-
 // If we break on a hyphen in Slovenian, the hyphen should be repeated on the next line. (Same as
 // Polish.)
 TEST(HyphenatorTest, slovenianHyphen) {
-- 
cgit v1.2.3