diff options
Diffstat (limited to 'icing/tokenization/icu/icu-language-segmenter_test.cc')
-rw-r--r-- | icing/tokenization/icu/icu-language-segmenter_test.cc | 91 |
1 files changed, 56 insertions, 35 deletions
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc index 71e04e2..6771050 100644 --- a/icing/tokenization/icu/icu-language-segmenter_test.cc +++ b/icing/tokenization/icu/icu-language-segmenter_test.cc @@ -21,6 +21,7 @@ #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" #include "icing/jni/jni-cache.h" +#include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" #include "icing/testing/icu-data-file-helper.h" #include "icing/testing/icu-i18n-test-utils.h" @@ -118,6 +119,9 @@ class IcuLanguageSegmenterAllLocalesTest : public testing::TestWithParam<const char*> { protected: void SetUp() override { + if (!IsIcuTokenization()) { + GTEST_SKIP() << "ICU tokenization not enabled!"; + } ICING_ASSERT_OK( // File generated via icu_data_file rule in //icing/BUILD. icu_data_file_helper::SetUpICUDataFile( @@ -223,46 +227,42 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) { // Word connecters EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"), IsOkAndHolds(ElementsAre("com.google.android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"), - IsOkAndHolds(ElementsAre("com:google:android"))); EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"), IsOkAndHolds(ElementsAre("com'google'android"))); EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"), IsOkAndHolds(ElementsAre("com_google_android"))); // Word connecters can be mixed - EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"), - IsOkAndHolds(ElementsAre("com.google.android:icing"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android_icing"), + IsOkAndHolds(ElementsAre("com.google.android_icing"))); // Connectors that don't have valid terms on both sides of it are not // considered connectors. - EXPECT_THAT(language_segmenter->GetAllTerms(":bar:baz"), - IsOkAndHolds(ElementsAre(":", "bar:baz"))); + EXPECT_THAT(language_segmenter->GetAllTerms("'bar'baz"), + IsOkAndHolds(ElementsAre("'", "bar'baz"))); - EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:"), - IsOkAndHolds(ElementsAre("bar:baz", ":"))); + EXPECT_THAT(language_segmenter->GetAllTerms("bar.baz."), + IsOkAndHolds(ElementsAre("bar.baz", "."))); // Connectors that don't have valid terms on both sides of it are not // considered connectors. - EXPECT_THAT(language_segmenter->GetAllTerms(" :bar:baz"), - IsOkAndHolds(ElementsAre(" ", ":", "bar:baz"))); + EXPECT_THAT(language_segmenter->GetAllTerms(" .bar.baz"), + IsOkAndHolds(ElementsAre(" ", ".", "bar.baz"))); - EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz: "), - IsOkAndHolds(ElementsAre("bar:baz", ":", " "))); + EXPECT_THAT(language_segmenter->GetAllTerms("bar'baz' "), + IsOkAndHolds(ElementsAre("bar'baz", "'", " "))); // Connectors don't connect if one side is an invalid term (?) - EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:?"), - IsOkAndHolds(ElementsAre("bar:baz", ":", "?"))); - EXPECT_THAT(language_segmenter->GetAllTerms("?:bar:baz"), - IsOkAndHolds(ElementsAre("?", ":", "bar:baz"))); - EXPECT_THAT(language_segmenter->GetAllTerms("3:14"), - IsOkAndHolds(ElementsAre("3", ":", "14"))); - EXPECT_THAT(language_segmenter->GetAllTerms("私:は"), - IsOkAndHolds(ElementsAre("私", ":", "は"))); - EXPECT_THAT(language_segmenter->GetAllTerms("我:每"), - IsOkAndHolds(ElementsAre("我", ":", "每"))); - EXPECT_THAT(language_segmenter->GetAllTerms("เดิน:ไป"), - IsOkAndHolds(ElementsAre("เดิน:ไป"))); + EXPECT_THAT(language_segmenter->GetAllTerms("bar.baz.?"), + IsOkAndHolds(ElementsAre("bar.baz", ".", "?"))); + EXPECT_THAT(language_segmenter->GetAllTerms("?'bar'baz"), + IsOkAndHolds(ElementsAre("?", "'", "bar'baz"))); + EXPECT_THAT(language_segmenter->GetAllTerms("私'は"), + IsOkAndHolds(ElementsAre("私", "'", "は"))); + EXPECT_THAT(language_segmenter->GetAllTerms("我.每"), + IsOkAndHolds(ElementsAre("我", ".", "每"))); + EXPECT_THAT(language_segmenter->GetAllTerms("เดิน'ไป"), + IsOkAndHolds(ElementsAre("เดิน'ไป"))); // Any heading and trailing characters are not connecters EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."), @@ -277,8 +277,6 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) { IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android"))); EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"), IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"), - IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android"))); EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"), IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android"))); EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"), @@ -292,6 +290,29 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) { EXPECT_THAT( language_segmenter->GetAllTerms("com\"google\"android"), IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android"))); + + // In ICU 72, there were a few changes: + // 1. ':' stopped being a word connector + // 2. '@' became a word connector + // 3. <numeric><word-connector><numeric> such as "3'14" is now considered as + // a single token. + if (IsIcu72PlusTokenization()) { + EXPECT_THAT( + language_segmenter->GetAllTerms("com:google:android"), + IsOkAndHolds(ElementsAre("com", ":", "google", ":", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"), + IsOkAndHolds(ElementsAre("com@google@android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("3'14"), + IsOkAndHolds(ElementsAre("3'14"))); + } else { + EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"), + IsOkAndHolds(ElementsAre("com:google:android"))); + EXPECT_THAT( + language_segmenter->GetAllTerms("com@google@android"), + IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("3'14"), + IsOkAndHolds(ElementsAre("3", "'", "14"))); + } } TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) { @@ -494,17 +515,17 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToStartUtf32WordConnector) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); - constexpr std::string_view kText = "com:google:android is package"; + constexpr std::string_view kText = "com.google.android is package"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "com:google:android is package" + // String: "com.google.android is package" // ^ ^^ ^^ // UTF-8 idx: 0 18 19 21 22 // UTF-32 idx: 0 18 19 21 22 auto position_or = itr->ResetToStartUtf32(); EXPECT_THAT(position_or, IsOk()); - ASSERT_THAT(itr->GetTerm(), Eq("com:google:android")); + ASSERT_THAT(itr->GetTerm(), Eq("com.google.android")); } TEST_P(IcuLanguageSegmenterAllLocalesTest, NewIteratorResetToStartUtf32) { @@ -585,11 +606,11 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32WordConnector) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); - constexpr std::string_view kText = "package com:google:android name"; + constexpr std::string_view kText = "package com.google.android name"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "package com:google:android name" + // String: "package com.google.android name" // ^ ^^ ^^ // UTF-8 idx: 0 7 8 26 27 // UTF-32 idx: 0 7 8 26 27 @@ -601,7 +622,7 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32WordConnector) { position_or = itr->ResetToTermStartingAfterUtf32(7); EXPECT_THAT(position_or, IsOk()); EXPECT_THAT(position_or.ValueOrDie(), Eq(8)); - ASSERT_THAT(itr->GetTerm(), Eq("com:google:android")); + ASSERT_THAT(itr->GetTerm(), Eq("com.google.android")); } TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32OutOfBounds) { @@ -961,18 +982,18 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); - constexpr std::string_view kText = "package name com:google:android!"; + constexpr std::string_view kText = "package name com.google.android!"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "package name com:google:android!" + // String: "package name com.google.android!" // ^ ^^ ^^ ^ // UTF-8 idx: 0 7 8 12 13 31 // UTF-32 idx: 0 7 8 12 13 31 auto position_or = itr->ResetToTermEndingBeforeUtf32(31); EXPECT_THAT(position_or, IsOk()); EXPECT_THAT(position_or.ValueOrDie(), Eq(13)); - ASSERT_THAT(itr->GetTerm(), Eq("com:google:android")); + ASSERT_THAT(itr->GetTerm(), Eq("com.google.android")); position_or = itr->ResetToTermEndingBeforeUtf32(21); EXPECT_THAT(position_or, IsOk()); |