aboutsummaryrefslogtreecommitdiff
path: root/icing/tokenization/icu/icu-language-segmenter_test.cc
diff options
context:
space:
mode:
Diffstat (limited to 'icing/tokenization/icu/icu-language-segmenter_test.cc')
-rw-r--r--icing/tokenization/icu/icu-language-segmenter_test.cc91
1 files changed, 56 insertions, 35 deletions
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index 71e04e2..6771050 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -21,6 +21,7 @@
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/jni/jni-cache.h"
+#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/icu-i18n-test-utils.h"
@@ -118,6 +119,9 @@ class IcuLanguageSegmenterAllLocalesTest
: public testing::TestWithParam<const char*> {
protected:
void SetUp() override {
+ if (!IsIcuTokenization()) {
+ GTEST_SKIP() << "ICU tokenization not enabled!";
+ }
ICING_ASSERT_OK(
// File generated via icu_data_file rule in //icing/BUILD.
icu_data_file_helper::SetUpICUDataFile(
@@ -223,46 +227,42 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
// Word connecters
EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"),
IsOkAndHolds(ElementsAre("com.google.android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"),
- IsOkAndHolds(ElementsAre("com:google:android")));
EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"),
IsOkAndHolds(ElementsAre("com'google'android")));
EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"),
IsOkAndHolds(ElementsAre("com_google_android")));
// Word connecters can be mixed
- EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"),
- IsOkAndHolds(ElementsAre("com.google.android:icing")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android_icing"),
+ IsOkAndHolds(ElementsAre("com.google.android_icing")));
// Connectors that don't have valid terms on both sides of it are not
// considered connectors.
- EXPECT_THAT(language_segmenter->GetAllTerms(":bar:baz"),
- IsOkAndHolds(ElementsAre(":", "bar:baz")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("'bar'baz"),
+ IsOkAndHolds(ElementsAre("'", "bar'baz")));
- EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:"),
- IsOkAndHolds(ElementsAre("bar:baz", ":")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("bar.baz."),
+ IsOkAndHolds(ElementsAre("bar.baz", ".")));
// Connectors that don't have valid terms on both sides of it are not
// considered connectors.
- EXPECT_THAT(language_segmenter->GetAllTerms(" :bar:baz"),
- IsOkAndHolds(ElementsAre(" ", ":", "bar:baz")));
+ EXPECT_THAT(language_segmenter->GetAllTerms(" .bar.baz"),
+ IsOkAndHolds(ElementsAre(" ", ".", "bar.baz")));
- EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz: "),
- IsOkAndHolds(ElementsAre("bar:baz", ":", " ")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("bar'baz' "),
+ IsOkAndHolds(ElementsAre("bar'baz", "'", " ")));
// Connectors don't connect if one side is an invalid term (?)
- EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:?"),
- IsOkAndHolds(ElementsAre("bar:baz", ":", "?")));
- EXPECT_THAT(language_segmenter->GetAllTerms("?:bar:baz"),
- IsOkAndHolds(ElementsAre("?", ":", "bar:baz")));
- EXPECT_THAT(language_segmenter->GetAllTerms("3:14"),
- IsOkAndHolds(ElementsAre("3", ":", "14")));
- EXPECT_THAT(language_segmenter->GetAllTerms("私:は"),
- IsOkAndHolds(ElementsAre("私", ":", "は")));
- EXPECT_THAT(language_segmenter->GetAllTerms("我:每"),
- IsOkAndHolds(ElementsAre("我", ":", "每")));
- EXPECT_THAT(language_segmenter->GetAllTerms("เดิน:ไป"),
- IsOkAndHolds(ElementsAre("เดิน:ไป")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("bar.baz.?"),
+ IsOkAndHolds(ElementsAre("bar.baz", ".", "?")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("?'bar'baz"),
+ IsOkAndHolds(ElementsAre("?", "'", "bar'baz")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("私'は"),
+ IsOkAndHolds(ElementsAre("私", "'", "は")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("我.每"),
+ IsOkAndHolds(ElementsAre("我", ".", "每")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("เดิน'ไป"),
+ IsOkAndHolds(ElementsAre("เดิน'ไป")));
// Any heading and trailing characters are not connecters
EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."),
@@ -277,8 +277,6 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android")));
EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"),
IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"),
- IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android")));
EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"),
IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android")));
EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"),
@@ -292,6 +290,29 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
EXPECT_THAT(
language_segmenter->GetAllTerms("com\"google\"android"),
IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android")));
+
+ // In ICU 72, there were a few changes:
+ // 1. ':' stopped being a word connector
+ // 2. '@' became a word connector
+ // 3. <numeric><word-connector><numeric> such as "3'14" is now considered as
+ // a single token.
+ if (IsIcu72PlusTokenization()) {
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("com:google:android"),
+ IsOkAndHolds(ElementsAre("com", ":", "google", ":", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"),
+ IsOkAndHolds(ElementsAre("com@google@android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("3'14"),
+ IsOkAndHolds(ElementsAre("3'14")));
+ } else {
+ EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"),
+ IsOkAndHolds(ElementsAre("com:google:android")));
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("com@google@android"),
+ IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("3'14"),
+ IsOkAndHolds(ElementsAre("3", "'", "14")));
+ }
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) {
@@ -494,17 +515,17 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToStartUtf32WordConnector) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
- constexpr std::string_view kText = "com:google:android is package";
+ constexpr std::string_view kText = "com.google.android is package";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "com:google:android is package"
+ // String: "com.google.android is package"
// ^ ^^ ^^
// UTF-8 idx: 0 18 19 21 22
// UTF-32 idx: 0 18 19 21 22
auto position_or = itr->ResetToStartUtf32();
EXPECT_THAT(position_or, IsOk());
- ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
+ ASSERT_THAT(itr->GetTerm(), Eq("com.google.android"));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, NewIteratorResetToStartUtf32) {
@@ -585,11 +606,11 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32WordConnector) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
- constexpr std::string_view kText = "package com:google:android name";
+ constexpr std::string_view kText = "package com.google.android name";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "package com:google:android name"
+ // String: "package com.google.android name"
// ^ ^^ ^^
// UTF-8 idx: 0 7 8 26 27
// UTF-32 idx: 0 7 8 26 27
@@ -601,7 +622,7 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32WordConnector) {
position_or = itr->ResetToTermStartingAfterUtf32(7);
EXPECT_THAT(position_or, IsOk());
EXPECT_THAT(position_or.ValueOrDie(), Eq(8));
- ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
+ ASSERT_THAT(itr->GetTerm(), Eq("com.google.android"));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32OutOfBounds) {
@@ -961,18 +982,18 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
- constexpr std::string_view kText = "package name com:google:android!";
+ constexpr std::string_view kText = "package name com.google.android!";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "package name com:google:android!"
+ // String: "package name com.google.android!"
// ^ ^^ ^^ ^
// UTF-8 idx: 0 7 8 12 13 31
// UTF-32 idx: 0 7 8 12 13 31
auto position_or = itr->ResetToTermEndingBeforeUtf32(31);
EXPECT_THAT(position_or, IsOk());
EXPECT_THAT(position_or.ValueOrDie(), Eq(13));
- ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
+ ASSERT_THAT(itr->GetTerm(), Eq("com.google.android"));
position_or = itr->ResetToTermEndingBeforeUtf32(21);
EXPECT_THAT(position_or, IsOk());