summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndroid Build Coastguard Worker <android-build-coastguard-worker@google.com>2023-05-11 23:23:03 +0000
committerAndroid Build Coastguard Worker <android-build-coastguard-worker@google.com>2023-05-11 23:23:03 +0000
commitb44defbff537fa99314e89d609276b849dd55a38 (patch)
treec2bbaabcdf91cd5359247e70a9fd8af5c355507d
parent137290a2b6df7c6223b29d1a0afb0f4ba8026d70 (diff)
parenta0c400f4c7f34b2e92b61c672fa3c483387fcb4a (diff)
downloadminikin-android14-d1-s2-release.tar.gz
Change-Id: Ibe21e9b4bc0837b0a1a991f3763143b04440035a
-rw-r--r--include/minikin/Hyphenator.h13
-rw-r--r--libs/minikin/Hyphenator.cpp58
-rw-r--r--tests/unittest/HyphenatorTest.cpp12
3 files changed, 26 insertions, 57 deletions
diff --git a/include/minikin/Hyphenator.h b/include/minikin/Hyphenator.h
index 47c1648..da265f0 100644
--- a/include/minikin/Hyphenator.h
+++ b/include/minikin/Hyphenator.h
@@ -23,7 +23,6 @@
#include <string>
#include <vector>
-#include <unicode/uscript.h>
#include "minikin/Characters.h"
#include "minikin/U16StringPiece.h"
@@ -64,12 +63,7 @@ enum class HyphenationType : uint8_t {
// Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line.
// This is used in Arabic script, mostly for writing systems of Central Asia. It's our default
// behavior when a soft hyphen is used in Arabic script.
- BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8,
- // Break the line, and insert hyphen at current line and the beginning of the next line. It's
- // very similar to BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE. BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE
- // is for the words which included hyphenator but this is used to actively hyphenate words
- // based on the pattern.
- BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE = 9
+ BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8
};
// The hyphen edit represents an edit to the string when a word is hyphenated.
@@ -236,11 +230,6 @@ private:
void hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue,
HyphenationType* out) const;
- HyphenationType hyphenationTypeBasedOnScriptAndLocale(uint32_t codePoint) const;
-
- bool isRepeatHyphen(UScriptCode script,
- HyphenationLocale locale) const;
-
// See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
// that temporary buffers can be stack-allocated without waste, which is a slightly
// different use case. It measures UTF-16 code units.
diff --git a/libs/minikin/Hyphenator.cpp b/libs/minikin/Hyphenator.cpp
index 098e6e8..c755a87 100644
--- a/libs/minikin/Hyphenator.cpp
+++ b/libs/minikin/Hyphenator.cpp
@@ -158,7 +158,6 @@ bool Hyphenator::isLineBreakingHyphen(uint32_t c) {
EndHyphenEdit editForThisLine(HyphenationType type) {
switch (type) {
case HyphenationType::BREAK_AND_INSERT_HYPHEN:
- case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE:
return EndHyphenEdit::INSERT_HYPHEN;
case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
return EndHyphenEdit::INSERT_ARMENIAN_HYPHEN;
@@ -179,7 +178,6 @@ EndHyphenEdit editForThisLine(HyphenationType type) {
StartHyphenEdit editForNextLine(HyphenationType type) {
switch (type) {
case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
- case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE:
return StartHyphenEdit::INSERT_HYPHEN;
case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
return StartHyphenEdit::INSERT_ZWJ;
@@ -199,6 +197,25 @@ static UScriptCode getScript(uint32_t codePoint) {
}
}
+static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
+ // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
+ // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
+ // for now to be safe. BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
+ // to maqaf for Hebrew, we can simply add a condition here.
+ const UScriptCode script = getScript(codePoint);
+ if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL ||
+ script == USCRIPT_TELUGU) {
+ // Grantha is not included, since we don't support non-BMP hyphenation yet.
+ return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
+ } else if (script == USCRIPT_ARMENIAN) {
+ return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
+ } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
+ return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
+ } else {
+ return HyphenationType::BREAK_AND_INSERT_HYPHEN;
+ }
+}
+
static inline int32_t getJoiningType(UChar32 codepoint) {
return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
}
@@ -227,27 +244,6 @@ static inline HyphenationType getHyphTypeForArabic(const U16StringPiece& word, s
return HyphenationType::BREAK_AND_INSERT_HYPHEN;
}
-HyphenationType Hyphenator::hyphenationTypeBasedOnScriptAndLocale(uint32_t codePoint) const {
- // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
- // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
- // for now to be safe. BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
- // to maqaf for Hebrew, we can simply add a condition here.
- const UScriptCode script = getScript(codePoint);
- if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL ||
- script == USCRIPT_TELUGU) {
- // Grantha is not included, since we don't support non-BMP hyphenation yet.
- return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
- } else if (script == USCRIPT_ARMENIAN) {
- return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
- } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
- return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
- } else if (isRepeatHyphen(script, mHyphenationLocale)) {
- return HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE;
- } else {
- return HyphenationType::BREAK_AND_INSERT_HYPHEN;
- }
-}
-
// Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words
// that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections
// 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen).
@@ -259,7 +255,9 @@ void Hyphenator::hyphenateWithNoPatterns(const U16StringPiece& word, Hyphenation
// Break after hyphens, but only if they don't start the word.
if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) &&
- isRepeatHyphen(getScript(word[i]), mHyphenationLocale)) {
+ (mHyphenationLocale == HyphenationLocale::POLISH ||
+ mHyphenationLocale == HyphenationLocale::SLOVENIAN) &&
+ getScript(word[i]) == USCRIPT_LATIN) {
// In Polish and Slovenian, hyphens get repeated at the next line. To be safe,
// we will do this only if the next character is Latin.
out[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
@@ -275,7 +273,7 @@ void Hyphenator::hyphenateWithNoPatterns(const U16StringPiece& word, Hyphenation
// actually join. If they don't, we'll just insert a normal hyphen.
out[i] = getHyphTypeForArabic(word, i);
} else {
- out[i] = hyphenationTypeBasedOnScriptAndLocale(word[i]);
+ out[i] = hyphenationTypeBasedOnScript(word[i]);
}
} else if (prevChar == CHAR_MIDDLE_DOT && mMinPrefix < i && i <= word.size() - mMinSuffix &&
((word[i - 2] == 'l' && word[i] == 'l') ||
@@ -311,7 +309,7 @@ HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes,
return HyphenationType::DONT_BREAK;
}
if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
- result = hyphenationTypeBasedOnScriptAndLocale(c);
+ result = hyphenationTypeBasedOnScript(c);
}
alpha_codes[i + 1] = code;
}
@@ -334,7 +332,7 @@ HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes,
return HyphenationType::DONT_BREAK;
}
if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
- result = hyphenationTypeBasedOnScriptAndLocale(c);
+ result = hyphenationTypeBasedOnScript(c);
}
alpha_codes[i + 1] = AlphabetTable1::value(entry);
}
@@ -400,10 +398,4 @@ void Hyphenator::hyphenateFromCodes(const uint16_t* codes, size_t len, Hyphenati
}
}
-bool Hyphenator::isRepeatHyphen(UScriptCode script,
- HyphenationLocale locale) const {
- return script == USCRIPT_LATIN &&
- (locale == HyphenationLocale::POLISH ||locale == HyphenationLocale::SLOVENIAN);
-}
-
} // namespace minikin
diff --git a/tests/unittest/HyphenatorTest.cpp b/tests/unittest/HyphenatorTest.cpp
index 5936146..14c3ea3 100644
--- a/tests/unittest/HyphenatorTest.cpp
+++ b/tests/unittest/HyphenatorTest.cpp
@@ -121,18 +121,6 @@ TEST(HyphenatorTest, polishEnDash) {
EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
}
-// If there is a soft hyphen in Polish, the hyphen should be repeated on the next line.
-TEST(HyphenatorTest, polishSoftHyphen) {
- Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "pl");
- const uint16_t word[] = {'x', SOFT_HYPHEN, 'y'};
- std::vector<HyphenationType> result;
- hyphenator->hyphenate(word, &result);
- EXPECT_EQ((size_t)3, result.size());
- EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
- EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
- EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE, result[2]);
-}
-
// If we break on a hyphen in Slovenian, the hyphen should be repeated on the next line. (Same as
// Polish.)
TEST(HyphenatorTest, slovenianHyphen) {