summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Seigo Nonaka <nona@google.com> 2023-11-04 15:33:59 +0900
committer Seigo Nonaka <nona@google.com> 2023-11-07 15:02:15 +0900
commit a78455bfe28949ea689ad46517d5f865f8ed43b6 (patch)
tree 979e1516df4a72f6b385108bb0218cfae7c058f1
parent d27148d9ed3d653b6e8d95a769de0150be1104e9 (diff)
download minikin-a78455bfe28949ea689ad46517d5f865f8ed43b6.tar.gz
Refactoring: pull out script splitter with unit test
The script transition point will be used for other places, so pull out the script splitter from LayoutCore.cpp with unit tests.

Bug: 283193133
Test: minikin_tests
Change-Id: Icd9679d334815a6c864f27d0e0222186d514ff81
-rw-r--r-- include/minikin/Characters.h 1
-rw-r--r-- libs/minikin/Android.bp 1
-rw-r--r-- libs/minikin/LayoutCore.cpp 49
-rw-r--r-- libs/minikin/ScriptUtils.cpp 88
-rw-r--r-- libs/minikin/ScriptUtils.h 89
-rw-r--r-- tests/unittest/Android.bp 1
-rw-r--r-- tests/unittest/ScriptUtilsTest.cpp 162
7 files changed, 347 insertions, 44 deletions
diff --git a/include/minikin/Characters.h b/include/minikin/Characters.h
index 074d134..d298d07 100644
--- a/include/minikin/Characters.h
+++ b/include/minikin/Characters.h
@@ -34,6 +34,7 @@ constexpr uint32_t CHAR_MAQAF = 0x05BE;
constexpr uint32_t CHAR_UCAS_HYPHEN = 0x1400;
constexpr uint32_t CHAR_ZWJ = 0x200D;
constexpr uint32_t CHAR_HYPHEN = 0x2010;
+constexpr uint32_t CHAR_REPLACEMENT_CHARACTER = 0xFFFD;
} // namespace minikin
diff --git a/libs/minikin/Android.bp b/libs/minikin/Android.bp
index 5fb524a..d23f053 100644
--- a/libs/minikin/Android.bp
+++ b/libs/minikin/Android.bp
@@ -52,6 +52,7 @@ cc_library {
"MinikinFontFactory.cpp",
"MinikinInternal.cpp",
"OptimalLineBreaker.cpp",
+ "ScriptUtils.cpp",
"SparseBitSet.cpp",
"SystemFonts.cpp",
"WordBreaker.cpp",
diff --git a/libs/minikin/LayoutCore.cpp b/libs/minikin/LayoutCore.cpp
index b89958e..5a19f89 100644
--- a/libs/minikin/LayoutCore.cpp
+++ b/libs/minikin/LayoutCore.cpp
@@ -38,6 +38,7 @@
#include "LayoutUtils.h"
#include "LocaleListCache.h"
#include "MinikinInternal.h"
+#include "ScriptUtils.h"
#include "minikin/Emoji.h"
#include "minikin/HbUtils.h"
#include "minikin/LayoutCache.h"
@@ -138,45 +139,6 @@ static bool isColorBitmapFont(const HbFontUniquePtr& font) {
return cbdt;
}
-static hb_codepoint_t decodeUtf16(const uint16_t* chars, size_t len, ssize_t* iter) {
- UChar32 result;
- U16_NEXT(chars, *iter, (ssize_t)len, result);
- if (U_IS_SURROGATE(result)) { // isolated surrogate
- result = 0xFFFDu; // U+FFFD REPLACEMENT CHARACTER
- }
- return (hb_codepoint_t)result;
-}
-
-static hb_script_t getScriptRun(const uint16_t* chars, size_t len, ssize_t* iter) {
- if (size_t(*iter) == len) {
- return HB_SCRIPT_UNKNOWN;
- }
- uint32_t cp = decodeUtf16(chars, len, iter);
- hb_unicode_funcs_t* unicode_func = hb_unicode_funcs_get_default();
- hb_script_t current_script = hb_unicode_script(unicode_func, cp);
- for (;;) {
- if (size_t(*iter) == len) break;
- const ssize_t prev_iter = *iter;
- cp = decodeUtf16(chars, len, iter);
- const hb_script_t script = hb_unicode_script(unicode_func, cp);
- if (script != current_script) {
- if (current_script == HB_SCRIPT_INHERITED || current_script == HB_SCRIPT_COMMON) {
- current_script = script;
- } else if (script == HB_SCRIPT_INHERITED || script == HB_SCRIPT_COMMON) {
- continue;
- } else {
- *iter = prev_iter;
- break;
- }
- }
- }
- if (current_script == HB_SCRIPT_INHERITED) {
- current_script = HB_SCRIPT_COMMON;
- }
-
- return current_script;
-}
-
/**
* Disable certain scripts (mostly those with cursive connection) from having letterspacing
* applied. See https://github.com/behdad/harfbuzz/issues/64 for more details.
@@ -403,11 +365,10 @@ LayoutPiece::LayoutPiece(const U16StringPiece& textBuf, const Range& range, bool
// Note: scriptRunStart and scriptRunEnd, as well as run.start and run.end, run between 0
// and count.
- ssize_t scriptRunEnd;
- for (ssize_t scriptRunStart = run.start; scriptRunStart < run.end;
- scriptRunStart = scriptRunEnd) {
- scriptRunEnd = scriptRunStart;
- hb_script_t script = getScriptRun(buf + start, run.end, &scriptRunEnd /* iterator */);
+ for (const auto [range, script] : ScriptText(textBuf, run.start, run.end)) {
+ ssize_t scriptRunStart = range.getStart();
+ ssize_t scriptRunEnd = range.getEnd();
+
// After the last line, scriptRunEnd is guaranteed to have increased, since the only
// time getScriptRun does not increase its iterator is when it has already reached the
// end of the buffer. But that can't happen, since if we have already reached the end
diff --git a/libs/minikin/ScriptUtils.cpp b/libs/minikin/ScriptUtils.cpp
new file mode 100644
index 0000000..90bd5de
--- /dev/null
+++ b/libs/minikin/ScriptUtils.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2023 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define LOG_TAG "Minikin"
+
+#include "ScriptUtils.h"
+
+#include <unicode/ubidi.h>
+#include <unicode/uscript.h>
+#include <unicode/utf16.h>
+#include <unicode/utypes.h>
+
+#include <algorithm>
+
+#include "MinikinInternal.h"
+#include "minikin/Emoji.h"
+
+namespace minikin {
+
+// Decodes the code point that starts at UTF-16 index |pos| of |text|, reading no further than
+// range.getEnd(). If the decoded value is still a surrogate (an isolated one), it is reported as
+// CHAR_REPLACEMENT_CHARACTER instead.
+static hb_codepoint_t decodeUtf16(U16StringPiece text, Range range, uint32_t pos) {
+    uint32_t codePoint;
+    // |pos| is a by-value copy: U16_NEXT advances it, but the caller never observes that.
+    U16_NEXT(text.data(), pos, range.getEnd(), codePoint);
+    const bool isolatedSurrogate = U_IS_SURROGATE(codePoint);
+    const uint32_t decoded = isolatedSurrogate ? CHAR_REPLACEMENT_CHARACTER : codePoint;
+    return static_cast<hb_codepoint_t>(decoded);
+}
+
+// Looks up the ICU script code of |cp| with uscript_getScript. Yields USCRIPT_INVALID_CODE when
+// ICU reports a failure status.
+static UScriptCode getICUScript(uint32_t cp) {
+    UErrorCode errorCode = U_ZERO_ERROR;
+    const UScriptCode icuScript = uscript_getScript(cp, &errorCode);
+    if (U_SUCCESS(errorCode)) [[likely]] {
+        return icuScript;
+    }
+    return USCRIPT_INVALID_CODE;
+}
+
+// Returns the HarfBuzz script of |cp| as reported by HarfBuzz's default Unicode functions.
+static hb_script_t getHbScript(uint32_t cp) {
+    return hb_unicode_script(hb_unicode_funcs_get_default(), cp);
+}
+
+// static
+// Finds the script run that begins at |pos| and returns its end index together with its HarfBuzz
+// script.
+//
+// Run boundaries are decided on ICU script codes:
+//  - while the run's script is Common or Inherited, it adopts the script of the next character
+//    whose script differs;
+//  - once the run has another script, Common and Inherited characters are absorbed into it;
+//  - the run ends in front of the first character with a different, non-Common/Inherited script.
+// A run that is still Inherited at the end is reported as HB_SCRIPT_COMMON. When
+// range.contains(pos) is false the result is (range.getEnd(), HB_SCRIPT_UNKNOWN).
+std::pair<uint32_t, hb_script_t> ScriptText::getScriptRun(U16StringPiece text, Range range,
+                                                          uint32_t pos) {
+    if (!range.contains(pos)) {
+        return std::make_pair(range.getEnd(), HB_SCRIPT_UNKNOWN);
+    }
+
+    const auto isCommonOrInherited = [](UScriptCode s) {
+        return s == USCRIPT_INHERITED || s == USCRIPT_COMMON;
+    };
+
+    uint32_t codePoint = decodeUtf16(text, range, pos);
+    UScriptCode runScript = getICUScript(codePoint);
+    hb_script_t runHbScript = getHbScript(codePoint);
+
+    uint32_t cursor = pos + U16_LENGTH(codePoint);
+    while (cursor < range.getEnd()) {
+        codePoint = decodeUtf16(text, range, cursor);
+        const UScriptCode candidate = getICUScript(codePoint);
+        if (candidate != runScript) {
+            if (isCommonOrInherited(runScript)) {
+                // The run has no concrete script yet: take over the candidate's.
+                runScript = candidate;
+                runHbScript = getHbScript(codePoint);
+            } else if (!isCommonOrInherited(candidate)) {
+                // A genuinely different script starts here; leave |cursor| on its first unit.
+                break;
+            }
+        }
+        cursor += U16_LENGTH(codePoint);
+    }
+
+    const hb_script_t resolved = (runScript == USCRIPT_INHERITED) ? HB_SCRIPT_COMMON : runHbScript;
+    return std::make_pair(cursor, resolved);
+}
+
+} // namespace minikin
diff --git a/libs/minikin/ScriptUtils.h b/libs/minikin/ScriptUtils.h
new file mode 100644
index 0000000..7bf575f
--- /dev/null
+++ b/libs/minikin/ScriptUtils.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2023 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINIKIN_SCRIPT_UTILS_H
+#define MINIKIN_SCRIPT_UTILS_H
+
+#define LOG_TAG "Minikin"
+
+#include <unicode/ubidi.h>
+
+#include <memory>
+
+#include "minikin/Layout.h"
+#include "minikin/Macros.h"
+#include "minikin/U16StringPiece.h"
+
+namespace minikin {
+
+// A helper class for iterating the script run transitions of a text range.
+//
+// begin()/end() make the class usable in a range-based for loop. Dereferencing the iterator
+// yields a std::pair of the run's Range and the hb_script_t that getScriptRun() computed for it.
+class ScriptText {
+public:
+ // NOTE(review): not referenced anywhere else in this header; the iterator yields a std::pair.
+ struct RunInfo {
+ Range range;
+ hb_script_t script;
+ };
+
+ // |start| and |end| become the Range that is split into script runs; |textBuf| is the text
+ // that getScriptRun() reads from.
+ ScriptText(const U16StringPiece& textBuf, uint32_t start, uint32_t end)
+ : mTextBuf(textBuf), mRange(start, end) {}
+
+ class iterator {
+ public:
+ // Iterators are equal when they share the same parent and the same run start. mEnd and
+ // mScript are not part of the comparison.
+ inline bool operator==(const iterator& o) const {
+ return mStart == o.mStart && mParent == o.mParent;
+ }
+
+ inline bool operator!=(const iterator& o) const { return !(*this == o); }
+
+ // Returns the current run as (Range(mStart, mEnd), script).
+ inline std::pair<Range, hb_script_t> operator*() const {
+ return std::make_pair(Range(mStart, mEnd), mScript);
+ }
+
+ // Moves to the next run: it starts where the current one ended.
+ inline iterator& operator++() {
+ mStart = mEnd;
+ std::tie(mEnd, mScript) = getScriptRun(mParent->mTextBuf, mParent->mRange, mStart);
+ return *this;
+ }
+
+ private:
+ friend class ScriptText;
+
+ // Computes the run starting at |start| right away, so the iterator is dereferenceable.
+ iterator(const ScriptText* parent, uint32_t start) : mParent(parent), mStart(start) {
+ std::tie(mEnd, mScript) = getScriptRun(mParent->mTextBuf, mParent->mRange, mStart);
+ }
+
+ // Raw pointer to the ScriptText this iterator was created from.
+ // NOTE(review): presumably the iterator must not outlive it - confirm with callers.
+ const ScriptText* mParent;
+ uint32_t mStart;
+ uint32_t mEnd;
+ hb_script_t mScript;
+ };
+
+ inline iterator begin() const { return iterator(this, mRange.getStart()); }
+ inline iterator end() const { return iterator(this, mRange.getEnd()); }
+
+private:
+ U16StringPiece mTextBuf;
+ Range mRange;
+
+ // Returns the end index and the script of the run that starts at |pos|. Defined in
+ // ScriptUtils.cpp.
+ static std::pair<uint32_t, hb_script_t> getScriptRun(U16StringPiece text, Range range,
+ uint32_t pos);
+
+ MINIKIN_PREVENT_COPY_AND_ASSIGN(ScriptText);
+};
+
+} // namespace minikin
+
+#endif // MINIKIN_SCRIPT_UTILS_H
diff --git a/tests/unittest/Android.bp b/tests/unittest/Android.bp
index c2f7d6c..5a822cd 100644
--- a/tests/unittest/Android.bp
+++ b/tests/unittest/Android.bp
@@ -75,6 +75,7 @@ cc_test {
"MeasuredTextTest.cpp",
"MeasurementTests.cpp",
"OptimalLineBreakerTest.cpp",
+ "ScriptUtilsTest.cpp",
"SparseBitSetTest.cpp",
"StringPieceTest.cpp",
"SystemFontsTest.cpp",
diff --git a/tests/unittest/ScriptUtilsTest.cpp b/tests/unittest/ScriptUtilsTest.cpp
new file mode 100644
index 0000000..cbd9a56
--- /dev/null
+++ b/tests/unittest/ScriptUtilsTest.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (C) 2023 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "ScriptUtils.h"
+#include "UnicodeUtils.h"
+
+namespace minikin {
+namespace {
+
+// One script run as seen by the tests: its start and end offsets and the script assigned to it.
+struct Result {
+    int start;
+    int end;
+    hb_script_t script;
+
+    Result(int start, int end, hb_script_t script) : start(start), end(end), script(script) {}
+};
+
+// Two results are equal when both offsets and the script match.
+bool operator==(const Result& l, const Result& r) {
+    if (l.start != r.start || l.end != r.end) {
+        return false;
+    }
+    return l.script == r.script;
+}
+
+// Streams a Result as "(start,end): " followed by the four bytes of the script value, most
+// significant byte first, written as characters.
+std::ostream& operator<<(std::ostream& os, const Result& r) {
+    char tag[5] = {};  // four tag bytes plus the terminating NUL
+    int shift = 24;
+    for (int i = 0; i < 4; ++i, shift -= 8) {
+        tag[i] = static_cast<char>((r.script >> shift) & 0xFF);
+    }
+    os << "(" << r.start << "," << r.end << "): " << tag;
+    return os;
+}
+
+// Runs ScriptText over the UTF-16 offsets start..end of |text| and collects every reported run
+// as a Result, in iteration order.
+std::vector<Result> splitByScript(const std::vector<uint16_t>& text, uint32_t start, uint32_t end) {
+    const ScriptText scriptText(text, start, end);
+    const auto last = scriptText.end();
+
+    std::vector<Result> runs;
+    for (auto it = scriptText.begin(); it != last; ++it) {
+        const std::pair<Range, hb_script_t> run = *it;
+        runs.emplace_back(run.first.getStart(), run.first.getEnd(), run.second);
+    }
+    return runs;
+}
+
+// Convenience overload: converts the UTF-8 |text| to UTF-16 first. |start| and |end| are applied
+// to the converted UTF-16 text.
+std::vector<Result> splitByScript(const std::string& text, uint32_t start, uint32_t end) {
+    const std::vector<uint16_t> codeUnits = utf8ToUtf16(text);
+    return splitByScript(codeUnits, start, end);
+}
+
+// Any sub-range of an all-Latin string must come back as exactly one Latin run.
+TEST(ScriptUtilsTest, Latin) {
+    const std::string text = "abcde";
+
+    const auto whole = splitByScript(text, 0, 5);
+    ASSERT_EQ(1u, whole.size());
+    EXPECT_EQ(Result(0, 5, HB_SCRIPT_LATIN), whole[0]);
+
+    const auto head = splitByScript(text, 0, 3);
+    ASSERT_EQ(1u, head.size());
+    EXPECT_EQ(Result(0, 3, HB_SCRIPT_LATIN), head[0]);
+
+    const auto tail = splitByScript(text, 2, 5);
+    ASSERT_EQ(1u, tail.size());
+    EXPECT_EQ(Result(2, 5, HB_SCRIPT_LATIN), tail[0]);
+
+    const auto middle = splitByScript(text, 2, 3);
+    ASSERT_EQ(1u, middle.size());
+    EXPECT_EQ(Result(2, 3, HB_SCRIPT_LATIN), middle[0]);
+}
+
+// Any sub-range of an all-Arabic string must come back as exactly one Arabic run.
+TEST(ScriptUtilsTest, Arabic) {
+    const std::string text = "\u0645\u0631\u062D\u0628\u064B\u0627";
+
+    const auto whole = splitByScript(text, 0, 6);
+    ASSERT_EQ(1u, whole.size());
+    EXPECT_EQ(Result(0, 6, HB_SCRIPT_ARABIC), whole[0]);
+
+    const auto head = splitByScript(text, 0, 3);
+    ASSERT_EQ(1u, head.size());
+    EXPECT_EQ(Result(0, 3, HB_SCRIPT_ARABIC), head[0]);
+
+    const auto tail = splitByScript(text, 2, 5);
+    ASSERT_EQ(1u, tail.size());
+    EXPECT_EQ(Result(2, 5, HB_SCRIPT_ARABIC), tail[0]);
+
+    const auto middle = splitByScript(text, 2, 3);
+    ASSERT_EQ(1u, middle.size());
+    EXPECT_EQ(Result(2, 3, HB_SCRIPT_ARABIC), middle[0]);
+}
+
+// A string made only of spaces must come back as exactly one Common run for any sub-range.
+TEST(ScriptUtilsTest, Common) {
+    // Five U+0020 SPACE characters. The ranges below go up to offset 5, so the text must really
+    // be five code units long; the count constructor makes that length explicit instead of
+    // relying on the number of blanks inside a string literal.
+    const std::string text(5, ' ');
+
+    auto result = splitByScript(text, 0, 5);
+    ASSERT_EQ(1u, result.size());
+    EXPECT_EQ(Result(0, 5, HB_SCRIPT_COMMON), result[0]);
+
+    result = splitByScript(text, 0, 3);
+    ASSERT_EQ(1u, result.size());
+    EXPECT_EQ(Result(0, 3, HB_SCRIPT_COMMON), result[0]);
+
+    result = splitByScript(text, 2, 5);
+    ASSERT_EQ(1u, result.size());
+    EXPECT_EQ(Result(2, 5, HB_SCRIPT_COMMON), result[0]);
+
+    result = splitByScript(text, 2, 3);
+    ASSERT_EQ(1u, result.size());
+    EXPECT_EQ(Result(2, 3, HB_SCRIPT_COMMON), result[0]);
+}
+
+TEST(ScriptUtilsTest, InheritOrCommon) {
+ // Parentheses and brackets are Common-script characters, so they never start a new run on
+ // their own: after a character with a concrete script they stay in that character's run.
+ // When nothing with a concrete script precedes them, the run takes the script of the next
+ // character that has one; if the range contains none, the run stays Common.
+ auto result = splitByScript("(abc)", 0, 5);
+ ASSERT_EQ(1u, result.size());
+ EXPECT_EQ(Result(0, 5, HB_SCRIPT_LATIN), result[0]);
+
+ result = splitByScript("[(b)]", 0, 5);
+ ASSERT_EQ(1u, result.size());
+ EXPECT_EQ(Result(0, 5, HB_SCRIPT_LATIN), result[0]);
+
+ // Only "[(" is inside the range, so there is no Latin character to take a script from.
+ result = splitByScript("[(b)]", 0, 2);
+ ASSERT_EQ(1u, result.size());
+ EXPECT_EQ(Result(0, 2, HB_SCRIPT_COMMON), result[0]);
+}
+
+// Latin and Arabic letters separated by parentheses: each parenthesis stays in the run of the
+// letter before it, so the text splits into "a(", "\u0645)" and "e".
+TEST(ScriptUtilsTest, MultiScript_InheritOrCommon) {
+    auto result = splitByScript("a(\u0645)e", 0, 5);
+    // Fatal check: stop here rather than index past the end if fewer runs were produced.
+    ASSERT_EQ(3u, result.size());
+    EXPECT_EQ(Result(0, 2, HB_SCRIPT_LATIN), result[0]);
+    EXPECT_EQ(Result(2, 4, HB_SCRIPT_ARABIC), result[1]);
+    EXPECT_EQ(Result(4, 5, HB_SCRIPT_LATIN), result[2]);
+}
+
+// Alternating Latin and Arabic letters with nothing in between: every letter is its own run.
+TEST(ScriptUtilsTest, MultiScript_NoInheritOrCommon) {
+    auto result = splitByScript("a\u0645b\u0631c", 0, 5);
+    // Fatal check: stop here rather than index past the end if fewer runs were produced.
+    ASSERT_EQ(5u, result.size());
+    EXPECT_EQ(Result(0, 1, HB_SCRIPT_LATIN), result[0]);
+    EXPECT_EQ(Result(1, 2, HB_SCRIPT_ARABIC), result[1]);
+    EXPECT_EQ(Result(2, 3, HB_SCRIPT_LATIN), result[2]);
+    EXPECT_EQ(Result(3, 4, HB_SCRIPT_ARABIC), result[3]);
+    EXPECT_EQ(Result(4, 5, HB_SCRIPT_LATIN), result[4]);
+}
+
+// The text is the surrogate pair D83C DFF3 twice. Ranges that start or end in the middle of a
+// pair leave isolated surrogates at the edges; every range is still expected to be one Common run.
+TEST(ScriptUtilsTest, SurrogatePair) {
+    const std::vector<uint16_t> text = {0xD83C, 0xDFF3, 0xD83C, 0xDFF3};
+
+    const auto bothPairs = splitByScript(text, 0, 4);
+    ASSERT_EQ(1u, bothPairs.size());
+    EXPECT_EQ(Result(0, 4, HB_SCRIPT_COMMON), bothPairs[0]);
+
+    const auto cutAtEnd = splitByScript(text, 0, 3);
+    ASSERT_EQ(1u, cutAtEnd.size());
+    EXPECT_EQ(Result(0, 3, HB_SCRIPT_COMMON), cutAtEnd[0]);
+
+    const auto cutAtStart = splitByScript(text, 1, 4);
+    ASSERT_EQ(1u, cutAtStart.size());
+    EXPECT_EQ(Result(1, 4, HB_SCRIPT_COMMON), cutAtStart[0]);
+
+    const auto cutAtBothEnds = splitByScript(text, 1, 3);
+    ASSERT_EQ(1u, cutAtBothEnds.size());
+    EXPECT_EQ(Result(1, 3, HB_SCRIPT_COMMON), cutAtBothEnds[0]);
+}
+} // namespace
+} // namespace minikin