diff options
author | Seigo Nonaka <nona@google.com> | 2023-11-30 07:02:28 +0000 |
---|---|---|
committer | Android (Google) Code Review <android-gerrit@google.com> | 2023-11-30 07:02:28 +0000 |
commit | e68bf31aedbee180f3a4ce0073937cfc1fdf0b4b (patch) | |
tree | 2c9fc9ee3774b34054a3aa2a7d262ab55ffe6f16 | |
parent | b86a47f7aaae91efd5176e79367bc926301bb388 (diff) | |
parent | a78455bfe28949ea689ad46517d5f865f8ed43b6 (diff) | |
download | minikin-e68bf31aedbee180f3a4ce0073937cfc1fdf0b4b.tar.gz |
Merge "Refactoring: pull out script splitter with unit test" into main
-rw-r--r-- | include/minikin/Characters.h | 1 | ||||
-rw-r--r-- | libs/minikin/Android.bp | 1 | ||||
-rw-r--r-- | libs/minikin/LayoutCore.cpp | 49 | ||||
-rw-r--r-- | libs/minikin/ScriptUtils.cpp | 88 | ||||
-rw-r--r-- | libs/minikin/ScriptUtils.h | 89 | ||||
-rw-r--r-- | tests/unittest/Android.bp | 1 | ||||
-rw-r--r-- | tests/unittest/ScriptUtilsTest.cpp | 162 |
7 files changed, 347 insertions, 44 deletions
diff --git a/include/minikin/Characters.h b/include/minikin/Characters.h index 074d134..d298d07 100644 --- a/include/minikin/Characters.h +++ b/include/minikin/Characters.h @@ -34,6 +34,7 @@ constexpr uint32_t CHAR_MAQAF = 0x05BE; constexpr uint32_t CHAR_UCAS_HYPHEN = 0x1400; constexpr uint32_t CHAR_ZWJ = 0x200D; constexpr uint32_t CHAR_HYPHEN = 0x2010; +constexpr uint32_t CHAR_REPLACEMENT_CHARACTER = 0xFFFD; } // namespace minikin diff --git a/libs/minikin/Android.bp b/libs/minikin/Android.bp index 5fb524a..d23f053 100644 --- a/libs/minikin/Android.bp +++ b/libs/minikin/Android.bp @@ -52,6 +52,7 @@ cc_library { "MinikinFontFactory.cpp", "MinikinInternal.cpp", "OptimalLineBreaker.cpp", + "ScriptUtils.cpp", "SparseBitSet.cpp", "SystemFonts.cpp", "WordBreaker.cpp", diff --git a/libs/minikin/LayoutCore.cpp b/libs/minikin/LayoutCore.cpp index 10c932a..b079e85 100644 --- a/libs/minikin/LayoutCore.cpp +++ b/libs/minikin/LayoutCore.cpp @@ -37,6 +37,7 @@ #include "LayoutUtils.h" #include "LocaleListCache.h" #include "MinikinInternal.h" +#include "ScriptUtils.h" #include "minikin/Emoji.h" #include "minikin/FontFeature.h" #include "minikin/HbUtils.h" @@ -138,45 +139,6 @@ static bool isColorBitmapFont(const HbFontUniquePtr& font) { return cbdt; } -static hb_codepoint_t decodeUtf16(const uint16_t* chars, size_t len, ssize_t* iter) { - UChar32 result; - U16_NEXT(chars, *iter, (ssize_t)len, result); - if (U_IS_SURROGATE(result)) { // isolated surrogate - result = 0xFFFDu; // U+FFFD REPLACEMENT CHARACTER - } - return (hb_codepoint_t)result; -} - -static hb_script_t getScriptRun(const uint16_t* chars, size_t len, ssize_t* iter) { - if (size_t(*iter) == len) { - return HB_SCRIPT_UNKNOWN; - } - uint32_t cp = decodeUtf16(chars, len, iter); - hb_unicode_funcs_t* unicode_func = hb_unicode_funcs_get_default(); - hb_script_t current_script = hb_unicode_script(unicode_func, cp); - for (;;) { - if (size_t(*iter) == len) break; - const ssize_t prev_iter = *iter; - cp = decodeUtf16(chars, 
len, iter); - const hb_script_t script = hb_unicode_script(unicode_func, cp); - if (script != current_script) { - if (current_script == HB_SCRIPT_INHERITED || current_script == HB_SCRIPT_COMMON) { - current_script = script; - } else if (script == HB_SCRIPT_INHERITED || script == HB_SCRIPT_COMMON) { - continue; - } else { - *iter = prev_iter; - break; - } - } - } - if (current_script == HB_SCRIPT_INHERITED) { - current_script = HB_SCRIPT_COMMON; - } - - return current_script; -} - /** * Disable certain scripts (mostly those with cursive connection) from having letterspacing * applied. See https://github.com/behdad/harfbuzz/issues/64 for more details. @@ -403,11 +365,10 @@ LayoutPiece::LayoutPiece(const U16StringPiece& textBuf, const Range& range, bool // Note: scriptRunStart and scriptRunEnd, as well as run.start and run.end, run between 0 // and count. - ssize_t scriptRunEnd; - for (ssize_t scriptRunStart = run.start; scriptRunStart < run.end; - scriptRunStart = scriptRunEnd) { - scriptRunEnd = scriptRunStart; - hb_script_t script = getScriptRun(buf + start, run.end, &scriptRunEnd /* iterator */); + for (const auto [range, script] : ScriptText(textBuf, run.start, run.end)) { + ssize_t scriptRunStart = range.getStart(); + ssize_t scriptRunEnd = range.getEnd(); + // After the last line, scriptRunEnd is guaranteed to have increased, since the only // time getScriptRun does not increase its iterator is when it has already reached the // end of the buffer. But that can't happen, since if we have already reached the end diff --git a/libs/minikin/ScriptUtils.cpp b/libs/minikin/ScriptUtils.cpp new file mode 100644 index 0000000..90bd5de --- /dev/null +++ b/libs/minikin/ScriptUtils.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2023 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "Minikin" + +#include "ScriptUtils.h" + +#include <unicode/ubidi.h> +#include <unicode/uscript.h> +#include <unicode/utf16.h> +#include <unicode/utypes.h> + +#include <algorithm> + +#include "MinikinInternal.h" +#include "minikin/Emoji.h" + +namespace minikin { + +static hb_codepoint_t decodeUtf16(U16StringPiece text, Range range, uint32_t pos) { + uint32_t result; + U16_NEXT(text.data(), pos, range.getEnd(), result); + if (U_IS_SURROGATE(result)) { // isolated surrogate + result = CHAR_REPLACEMENT_CHARACTER; + } + return static_cast<hb_codepoint_t>(result); +} + +static UScriptCode getICUScript(uint32_t cp) { + UErrorCode status = U_ZERO_ERROR; + UScriptCode scriptCode = uscript_getScript(cp, &status); + if (U_FAILURE(status)) [[unlikely]] { + return USCRIPT_INVALID_CODE; + } + return scriptCode; +} + +static hb_script_t getHbScript(uint32_t cp) { + hb_unicode_funcs_t* unicode_func = hb_unicode_funcs_get_default(); + return hb_unicode_script(unicode_func, cp); +} + +// static +std::pair<uint32_t, hb_script_t> ScriptText::getScriptRun(U16StringPiece text, Range range, + uint32_t pos) { + if (!range.contains(pos)) { + return std::make_pair(range.getEnd(), HB_SCRIPT_UNKNOWN); + } + + uint32_t cp = decodeUtf16(text, range, pos); + UScriptCode current_script = getICUScript(cp); + hb_script_t current_hb_script = getHbScript(cp); + uint32_t i; + for (i = pos + U16_LENGTH(cp); i < range.getEnd(); i += U16_LENGTH(cp)) { + cp = decodeUtf16(text, range, i); + UScriptCode next_script = getICUScript(cp); + if (current_script != 
next_script) { + if (current_script == USCRIPT_INHERITED || current_script == USCRIPT_COMMON) { + current_script = next_script; + current_hb_script = getHbScript(cp); + } else if (next_script == USCRIPT_INHERITED || next_script == USCRIPT_COMMON) { + continue; + } else { + break; + } + } + } + if (current_script == USCRIPT_INHERITED) { + return std::make_pair(i, HB_SCRIPT_COMMON); + } else { + return std::make_pair(i, current_hb_script); + } +} + +} // namespace minikin diff --git a/libs/minikin/ScriptUtils.h b/libs/minikin/ScriptUtils.h new file mode 100644 index 0000000..7bf575f --- /dev/null +++ b/libs/minikin/ScriptUtils.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2023 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINIKIN_SCRIPT_UTILS_H +#define MINIKIN_SCRIPT_UTILS_H + +#define LOG_TAG "Minikin" + +#include <unicode/ubidi.h> + +#include <memory> + +#include "minikin/Layout.h" +#include "minikin/Macros.h" +#include "minikin/U16StringPiece.h" + +namespace minikin { + +// A helper class for iterating the script run transitions. 
+class ScriptText { +public: + struct RunInfo { + Range range; + hb_script_t script; + }; + + ScriptText(const U16StringPiece& textBuf, uint32_t start, uint32_t end) + : mTextBuf(textBuf), mRange(start, end) {} + + class iterator { + public: + inline bool operator==(const iterator& o) const { + return mStart == o.mStart && mParent == o.mParent; + } + + inline bool operator!=(const iterator& o) const { return !(*this == o); } + + inline std::pair<Range, hb_script_t> operator*() const { + return std::make_pair(Range(mStart, mEnd), mScript); + } + + inline iterator& operator++() { + mStart = mEnd; + std::tie(mEnd, mScript) = getScriptRun(mParent->mTextBuf, mParent->mRange, mStart); + return *this; + } + + private: + friend class ScriptText; + + iterator(const ScriptText* parent, uint32_t start) : mParent(parent), mStart(start) { + std::tie(mEnd, mScript) = getScriptRun(mParent->mTextBuf, mParent->mRange, mStart); + } + + const ScriptText* mParent; + uint32_t mStart; + uint32_t mEnd; + hb_script_t mScript; + }; + + inline iterator begin() const { return iterator(this, mRange.getStart()); } + inline iterator end() const { return iterator(this, mRange.getEnd()); } + +private: + U16StringPiece mTextBuf; + Range mRange; + + static std::pair<uint32_t, hb_script_t> getScriptRun(U16StringPiece text, Range range, + uint32_t pos); + + MINIKIN_PREVENT_COPY_AND_ASSIGN(ScriptText); +}; + +} // namespace minikin + +#endif // MINIKIN_SCRIPT_UTILS_H diff --git a/tests/unittest/Android.bp b/tests/unittest/Android.bp index c2f7d6c..5a822cd 100644 --- a/tests/unittest/Android.bp +++ b/tests/unittest/Android.bp @@ -75,6 +75,7 @@ cc_test { "MeasuredTextTest.cpp", "MeasurementTests.cpp", "OptimalLineBreakerTest.cpp", + "ScriptUtilsTest.cpp", "SparseBitSetTest.cpp", "StringPieceTest.cpp", "SystemFontsTest.cpp", diff --git a/tests/unittest/ScriptUtilsTest.cpp b/tests/unittest/ScriptUtilsTest.cpp new file mode 100644 index 0000000..cbd9a56 --- /dev/null +++ 
b/tests/unittest/ScriptUtilsTest.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2023 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <gtest/gtest.h> + +#include "ScriptUtils.h" +#include "UnicodeUtils.h" + +namespace minikin { +namespace { + +struct Result { + Result(int start, int end, hb_script_t script) : start(start), end(end), script(script) {} + int start; + int end; + hb_script_t script; +}; + +bool operator==(const Result& l, const Result& r) { + return l.start == r.start && l.end == r.end && l.script == r.script; +} + +std::ostream& operator<<(std::ostream& os, const Result& r) { + char buf[5] = {}; + buf[0] = static_cast<char>((r.script >> 24) & 0xFF); + buf[1] = static_cast<char>((r.script >> 16) & 0xFF); + buf[2] = static_cast<char>((r.script >> 8) & 0xFF); + buf[3] = static_cast<char>((r.script) & 0xFF); + return os << "(" << r.start << "," << r.end << "): " << buf; +} + +std::vector<Result> splitByScript(const std::vector<uint16_t>& text, uint32_t start, uint32_t end) { + std::vector<Result> result; + for (const auto [range, script] : ScriptText(text, start, end)) { + result.emplace_back(range.getStart(), range.getEnd(), script); + } + return result; +} + +std::vector<Result> splitByScript(const std::string& text, uint32_t start, uint32_t end) { + std::vector<uint16_t> utf16 = utf8ToUtf16(text); + return splitByScript(utf16, start, end); +} + +TEST(ScriptUtilsTest, Latin) { + auto result = 
splitByScript("abcde", 0, 5); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(0, 5, HB_SCRIPT_LATIN), result[0]); + + result = splitByScript("abcde", 0, 3); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(0, 3, HB_SCRIPT_LATIN), result[0]); + + result = splitByScript("abcde", 2, 5); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(2, 5, HB_SCRIPT_LATIN), result[0]); + + result = splitByScript("abcde", 2, 3); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(2, 3, HB_SCRIPT_LATIN), result[0]); +} + +TEST(ScriptUtilsTest, Arabic) { + auto result = splitByScript("\u0645\u0631\u062D\u0628\u064B\u0627", 0, 6); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(0, 6, HB_SCRIPT_ARABIC), result[0]); + + result = splitByScript("\u0645\u0631\u062D\u0628\u064B\u0627", 0, 3); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(0, 3, HB_SCRIPT_ARABIC), result[0]); + + result = splitByScript("\u0645\u0631\u062D\u0628\u064B\u0627", 2, 5); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(2, 5, HB_SCRIPT_ARABIC), result[0]); + + result = splitByScript("\u0645\u0631\u062D\u0628\u064B\u0627", 2, 3); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(2, 3, HB_SCRIPT_ARABIC), result[0]); +} + +TEST(ScriptUtilsTest, Common) { + auto result = splitByScript(" ", 0, 5); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(0, 5, HB_SCRIPT_COMMON), result[0]); + + result = splitByScript(" ", 0, 3); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(0, 3, HB_SCRIPT_COMMON), result[0]); + + result = splitByScript(" ", 2, 5); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(2, 5, HB_SCRIPT_COMMON), result[0]); + + result = splitByScript(" ", 2, 3); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(2, 3, HB_SCRIPT_COMMON), result[0]); +} + +TEST(ScriptUtilsTest, InheritOrCommon) { + // Parentheses and brackets are script-neutral (Common/Inherited), so they join the run of the + // preceding character. If there is no preceding character, they take the script of the next + // character that has a specific script. 
+ auto result = splitByScript("(abc)", 0, 5); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(0, 5, HB_SCRIPT_LATIN), result[0]); + + result = splitByScript("[(b)]", 0, 5); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(0, 5, HB_SCRIPT_LATIN), result[0]); + + result = splitByScript("[(b)]", 0, 2); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(0, 2, HB_SCRIPT_COMMON), result[0]); +} + +TEST(ScriptUtilsTest, MultiScript_InheritOrCommon) { + auto result = splitByScript("a(\u0645)e", 0, 5); + EXPECT_EQ(Result(0, 2, HB_SCRIPT_LATIN), result[0]); + EXPECT_EQ(Result(2, 4, HB_SCRIPT_ARABIC), result[1]); + EXPECT_EQ(Result(4, 5, HB_SCRIPT_LATIN), result[2]); +} + +TEST(ScriptUtilsTest, MultiScript_NoInheritOrCommon) { + auto result = splitByScript("a\u0645b\u0631c", 0, 5); + EXPECT_EQ(Result(0, 1, HB_SCRIPT_LATIN), result[0]); + EXPECT_EQ(Result(1, 2, HB_SCRIPT_ARABIC), result[1]); + EXPECT_EQ(Result(2, 3, HB_SCRIPT_LATIN), result[2]); + EXPECT_EQ(Result(3, 4, HB_SCRIPT_ARABIC), result[3]); + EXPECT_EQ(Result(4, 5, HB_SCRIPT_LATIN), result[4]); +} + +TEST(ScriptUtilsTest, SurrogatePair) { + auto result = splitByScript(std::vector<uint16_t>({0xD83C, 0xDFF3, 0xD83C, 0xDFF3}), 0, 4); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(0, 4, HB_SCRIPT_COMMON), result[0]); + + result = splitByScript(std::vector<uint16_t>({0xD83C, 0xDFF3, 0xD83C, 0xDFF3}), 0, 3); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(0, 3, HB_SCRIPT_COMMON), result[0]); + + result = splitByScript(std::vector<uint16_t>({0xD83C, 0xDFF3, 0xD83C, 0xDFF3}), 1, 4); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(1, 4, HB_SCRIPT_COMMON), result[0]); + + result = splitByScript(std::vector<uint16_t>({0xD83C, 0xDFF3, 0xD83C, 0xDFF3}), 1, 3); + ASSERT_EQ(1u, result.size()); + EXPECT_EQ(Result(1, 3, HB_SCRIPT_COMMON), result[0]); +} +} // namespace +} // namespace minikin |