From 4920747bfd33712251b7a6cda4e6c2c827aeea97 Mon Sep 17 00:00:00 2001 From: Calvin Pan Date: Tue, 2 May 2023 08:01:08 +0000 Subject: Revert "Support pattern in non-standard hyphenation(Polish)" This reverts commit d70a89113cc104c1d3866f775df90c8373c05463. Reason for revert: In the past, we followed the linguist's suggestion to repeat the hyphen(b/238164187), but this is not true in Polish. Bug: 278262732 Change-Id: I7eae75f9461daebeabf1f78fb2905412c108bc60 --- include/minikin/Hyphenator.h | 13 +-------- libs/minikin/Hyphenator.cpp | 58 +++++++++++++++++---------------------- tests/unittest/HyphenatorTest.cpp | 12 -------- 3 files changed, 26 insertions(+), 57 deletions(-) diff --git a/include/minikin/Hyphenator.h b/include/minikin/Hyphenator.h index 47c1648..da265f0 100644 --- a/include/minikin/Hyphenator.h +++ b/include/minikin/Hyphenator.h @@ -23,7 +23,6 @@ #include #include -#include #include "minikin/Characters.h" #include "minikin/U16StringPiece.h" @@ -64,12 +63,7 @@ enum class HyphenationType : uint8_t { // Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line. // This is used in Arabic script, mostly for writing systems of Central Asia. It's our default // behavior when a soft hyphen is used in Arabic script. - BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8, - // Break the line, and insert hyphen at current line and the beginning of the next line. It's - // very similar to BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE. BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE - // is for the words which included hyphenator but this is used to actively hyphenate words - // based on the pattern. - BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE = 9 + BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8 }; // The hyphen edit represents an edit to the string when a word is hyphenated. @@ -236,11 +230,6 @@ private: void hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue, HyphenationType* out) const; - HyphenationType hyphenationTypeBasedOnScriptAndLocale(uint32_t codePoint) const; - - bool isRepeatHyphen(UScriptCode script, - HyphenationLocale locale) const; - // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so // that temporary buffers can be stack-allocated without waste, which is a slightly // different use case. It measures UTF-16 code units. diff --git a/libs/minikin/Hyphenator.cpp b/libs/minikin/Hyphenator.cpp index 098e6e8..c755a87 100644 --- a/libs/minikin/Hyphenator.cpp +++ b/libs/minikin/Hyphenator.cpp @@ -158,7 +158,6 @@ bool Hyphenator::isLineBreakingHyphen(uint32_t c) { EndHyphenEdit editForThisLine(HyphenationType type) { switch (type) { case HyphenationType::BREAK_AND_INSERT_HYPHEN: - case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE: return EndHyphenEdit::INSERT_HYPHEN; case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN: return EndHyphenEdit::INSERT_ARMENIAN_HYPHEN; @@ -179,7 +178,6 @@ EndHyphenEdit editForThisLine(HyphenationType type) { StartHyphenEdit editForNextLine(HyphenationType type) { switch (type) { case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE: - case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE: return StartHyphenEdit::INSERT_HYPHEN; case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ: return StartHyphenEdit::INSERT_ZWJ; @@ -199,6 +197,25 @@ static UScriptCode getScript(uint32_t codePoint) { } } +static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) { + // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen + // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens + // for now to be safe. BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch + // to maqaf for Hebrew, we can simply add a condition here. + const UScriptCode script = getScript(codePoint); + if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL || + script == USCRIPT_TELUGU) { + // Grantha is not included, since we don't support non-BMP hyphenation yet. + return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; + } else if (script == USCRIPT_ARMENIAN) { + return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN; + } else if (script == USCRIPT_CANADIAN_ABORIGINAL) { + return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN; + } else { + return HyphenationType::BREAK_AND_INSERT_HYPHEN; + } +} + static inline int32_t getJoiningType(UChar32 codepoint) { return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE); } @@ -227,27 +244,6 @@ static inline HyphenationType getHyphTypeForArabic(const U16StringPiece& word, s return HyphenationType::BREAK_AND_INSERT_HYPHEN; } -HyphenationType Hyphenator::hyphenationTypeBasedOnScriptAndLocale(uint32_t codePoint) const { - // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen - // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens - // for now to be safe. BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch - // to maqaf for Hebrew, we can simply add a condition here. - const UScriptCode script = getScript(codePoint); - if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL || - script == USCRIPT_TELUGU) { - // Grantha is not included, since we don't support non-BMP hyphenation yet. - return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; - } else if (script == USCRIPT_ARMENIAN) { - return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN; - } else if (script == USCRIPT_CANADIAN_ABORIGINAL) { - return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN; - } else if (isRepeatHyphen(script, mHyphenationLocale)) { - return HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE; - } else { - return HyphenationType::BREAK_AND_INSERT_HYPHEN; - } -} - // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words // that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections // 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen). @@ -259,7 +255,9 @@ void Hyphenator::hyphenateWithNoPatterns(const U16StringPiece& word, Hyphenation // Break after hyphens, but only if they don't start the word. if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) && - isRepeatHyphen(getScript(word[i]), mHyphenationLocale)) { + (mHyphenationLocale == HyphenationLocale::POLISH || + mHyphenationLocale == HyphenationLocale::SLOVENIAN) && + getScript(word[i]) == USCRIPT_LATIN) { // In Polish and Slovenian, hyphens get repeated at the next line. To be safe, // we will do this only if the next character is Latin. out[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE; @@ -275,7 +273,7 @@ void Hyphenator::hyphenateWithNoPatterns(const U16StringPiece& word, Hyphenation // actually join. If they don't, we'll just insert a normal hyphen. out[i] = getHyphTypeForArabic(word, i); } else { - out[i] = hyphenationTypeBasedOnScriptAndLocale(word[i]); + out[i] = hyphenationTypeBasedOnScript(word[i]); } } else if (prevChar == CHAR_MIDDLE_DOT && mMinPrefix < i && i <= word.size() - mMinSuffix && ((word[i - 2] == 'l' && word[i] == 'l') || @@ -311,7 +309,7 @@ HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes, return HyphenationType::DONT_BREAK; } if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) { - result = hyphenationTypeBasedOnScriptAndLocale(c); + result = hyphenationTypeBasedOnScript(c); } alpha_codes[i + 1] = code; } @@ -334,7 +332,7 @@ HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes, return HyphenationType::DONT_BREAK; } if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) { - result = hyphenationTypeBasedOnScriptAndLocale(c); + result = hyphenationTypeBasedOnScript(c); } alpha_codes[i + 1] = AlphabetTable1::value(entry); } @@ -400,10 +398,4 @@ void Hyphenator::hyphenateFromCodes(const uint16_t* codes, size_t len, Hyphenati } } -bool Hyphenator::isRepeatHyphen(UScriptCode script, - HyphenationLocale locale) const { - return script == USCRIPT_LATIN && - (locale == HyphenationLocale::POLISH ||locale == HyphenationLocale::SLOVENIAN); -} - } // namespace minikin diff --git a/tests/unittest/HyphenatorTest.cpp b/tests/unittest/HyphenatorTest.cpp index 5936146..14c3ea3 100644 --- a/tests/unittest/HyphenatorTest.cpp +++ b/tests/unittest/HyphenatorTest.cpp @@ -121,18 +121,6 @@ TEST(HyphenatorTest, polishEnDash) { EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]); } -// If there is a soft hyphen in Polish, the hyphen should be repeated on the next line. -TEST(HyphenatorTest, polishSoftHyphen) { - Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "pl"); - const uint16_t word[] = {'x', SOFT_HYPHEN, 'y'}; - std::vector result; - hyphenator->hyphenate(word, &result); - EXPECT_EQ((size_t)3, result.size()); - EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); - EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); - EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE, result[2]); -} - // If we break on a hyphen in Slovenian, the hyphen should be repeated on the next line. (Same as // Polish.) TEST(HyphenatorTest, slovenianHyphen) { -- cgit v1.2.3