diff options
Diffstat (limited to 'icing/transform/icu')
-rw-r--r-- | icing/transform/icu/icu-normalizer.cc | 1 | ||||
-rw-r--r-- | icing/transform/icu/icu-normalizer.h | 3 | ||||
-rw-r--r-- | icing/transform/icu/icu-normalizer_benchmark.cc | 47 | ||||
-rw-r--r-- | icing/transform/icu/icu-normalizer_test.cc | 79 |
4 files changed, 111 insertions, 19 deletions
diff --git a/icing/transform/icu/icu-normalizer.cc b/icing/transform/icu/icu-normalizer.cc index f32e541..58d4956 100644 --- a/icing/transform/icu/icu-normalizer.cc +++ b/icing/transform/icu/icu-normalizer.cc @@ -50,6 +50,7 @@ constexpr UChar kTransformRulesUtf16[] = "Latin-ASCII; " // Map Latin characters to ASCII characters "Hiragana-Katakana; " // Map hiragana to katakana "[:Latin:] NFD; " // Decompose Latin letters + "[:Greek:] NFD; " // Decompose Greek letters "[:Nonspacing Mark:] Remove; " // Remove accent / diacritic marks "NFKC"; // Decompose and compose everything diff --git a/icing/transform/icu/icu-normalizer.h b/icing/transform/icu/icu-normalizer.h index 7c64506..f6f2b78 100644 --- a/icing/transform/icu/icu-normalizer.h +++ b/icing/transform/icu/icu-normalizer.h @@ -33,7 +33,8 @@ namespace lib { // 2. Transforms full-width Latin characters to ASCII characters if possible. // 3. Transforms hiragana to katakana. // 4. Removes accent / diacritic marks on Latin characters -// 5. Normalized text must be less than or equal to max_term_byte_size, +// 5. Removes accent / diacritic marks on Greek characters +// 6. Normalized text must be less than or equal to max_term_byte_size, // otherwise it will be truncated. // // There're some other rules from ICU not listed here, please see .cc file for diff --git a/icing/transform/icu/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc index fe8289a..89d5f1e 100644 --- a/icing/transform/icu/icu-normalizer_benchmark.cc +++ b/icing/transform/icu/icu-normalizer_benchmark.cc @@ -39,8 +39,8 @@ // blaze-bin/icing/transform/icu/icu-normalizer_benchmark // /data/local/tmp/ // -// $ adb shell /data/local/tmp/icu-normalizer_benchmark --benchmark_filter=all -// --adb +// $ adb shell /data/local/tmp/icu-normalizer_benchmark +// --benchmark_filter=all --adb // Flag to tell the benchmark that it'll be run on an Android device via adb, // the benchmark will set up data files accordingly. @@ -61,7 +61,6 @@ void BM_NormalizeUppercase(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string(state.range(0), 'A'); @@ -95,7 +94,6 @@ void BM_NormalizeAccent(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string; @@ -123,7 +121,7 @@ BENCHMARK(BM_NormalizeAccent) ->Arg(2048000) ->Arg(4096000); -void BM_NormalizeHiragana(benchmark::State& state) { +void BM_NormalizeGreekAccent(benchmark::State& state) { bool run_via_adb = absl::GetFlag(FLAGS_adb); if (!run_via_adb) { ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile( @@ -133,7 +131,43 @@ void BM_NormalizeHiragana(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/std::numeric_limits<int>::max())); + + std::string input_string; + while (input_string.length() < state.range(0)) { + input_string.append("άὰᾶἀἄ"); + } + + for (auto _ : state) { + normalizer->NormalizeTerm(input_string); + } +} +BENCHMARK(BM_NormalizeGreekAccent) + ->Arg(1000) + ->Arg(2000) + ->Arg(4000) + ->Arg(8000) + ->Arg(16000) + ->Arg(32000) + ->Arg(64000) + ->Arg(128000) + ->Arg(256000) + ->Arg(384000) + ->Arg(512000) + ->Arg(1024000) + ->Arg(2048000) + ->Arg(4096000); + +void BM_NormalizeHiragana(benchmark::State& state) { + bool run_via_adb = absl::GetFlag(FLAGS_adb); + if (!run_via_adb) { + ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + } + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string; @@ -171,7 +205,6 @@ void BM_UppercaseSubTokenLength(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string(state.range(0), 'A'); @@ -207,7 +240,6 @@ void BM_AccentSubTokenLength(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string; @@ -248,7 +280,6 @@ void BM_HiraganaSubTokenLength(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string; diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc index 719f7be..0df23fc 100644 --- a/icing/transform/icu/icu-normalizer_test.cc +++ b/icing/transform/icu/icu-normalizer_test.cc @@ -83,14 +83,12 @@ TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) { Eq("eeeeeeeeeeeeeeeeeeeeeeeeeee")); EXPECT_THAT(normalizer_->NormalizeTerm("Ḟḟ"), Eq("ff")); EXPECT_THAT(normalizer_->NormalizeTerm("ĜĞĠĢḠḡĝğġģ"), Eq("gggggggggg")); - EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), - Eq("hhhhhhhhhhhhh")); + EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), Eq("hhhhhhhhhhhhh")); EXPECT_THAT(normalizer_->NormalizeTerm("ÌÍÎÏĨĪĬḬḭḯìíîïĩīĭ"), Eq("iiiiiiiiiiiiiiiii")); EXPECT_THAT(normalizer_->NormalizeTerm("Ĵĵ"), Eq("jj")); EXPECT_THAT(normalizer_->NormalizeTerm("ĶḰḲḴḵḱḳķ"), Eq("kkkkkkkk")); - EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), - Eq("lllllllllllll")); + EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), Eq("lllllllllllll")); EXPECT_THAT(normalizer_->NormalizeTerm("ḾṀṂḿṁṃ"), Eq("mmmmmm")); EXPECT_THAT(normalizer_->NormalizeTerm("ÑŃŅŇṄṆṈṊṅṇṉṋñńņň"), Eq("nnnnnnnnnnnnnnnn")); @@ -109,19 +107,38 @@ TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) { EXPECT_THAT(normalizer_->NormalizeTerm("ŴẀẂẄẆẈẁẃẅẇẉŵ"), Eq("wwwwwwwwwwww")); EXPECT_THAT(normalizer_->NormalizeTerm("ẊẌẋẍ"), Eq("xxxx")); EXPECT_THAT(normalizer_->NormalizeTerm("ÝŶŸẎẏŷýÿ"), Eq("yyyyyyyy")); - EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), - Eq("zzzzzzzzzzzz")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), Eq("zzzzzzzzzzzz")); EXPECT_THAT(normalizer_->NormalizeTerm("Barış"), Eq("baris")); } +TEST_F(IcuNormalizerTest, GreekLetterRemoveAccent) { + EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημερα")); + EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφη")); + EXPECT_THAT(normalizer_->NormalizeTerm( + "ἈἉἊἋἌἍἎἏᾈᾉᾊᾋᾌᾍᾎᾏᾸᾹᾺΆᾼἀἁἂἃἄἅἆἇὰάᾀᾁᾂᾃᾄᾅᾆᾇᾰᾱᾲᾳᾴᾶᾷ"), + Eq("αααααααααααααααααααααααααααααααααααααααααααααα")); + EXPECT_THAT(normalizer_->NormalizeTerm("ἘἙἚἛἜἝῈΈἐἑἒἓἔἕὲέ"), + Eq("εεεεεεεεεεεεεεεε")); + EXPECT_THAT( + normalizer_->NormalizeTerm("ἨἩἪἫἬἭἮἯᾘᾙᾚᾛᾜᾝᾞᾟῊΉῌἠἡἢἣἤἥἦἧὴήᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇ"), + Eq("ηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηη")); + EXPECT_THAT(normalizer_->NormalizeTerm("ἸἹἺἻἼἽἾἿῘῙῚΊἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗ"), + Eq("ιιιιιιιιιιιιιιιιιιιιιιιιιιιι")); + EXPECT_THAT(normalizer_->NormalizeTerm("ὈὉὊὋὌὍῸΌὀὁὂὃὄὅὸό"), + Eq("οοοοοοοοοοοοοοοο")); + EXPECT_THAT(normalizer_->NormalizeTerm("ὙὛὝὟῨῩῪΎὐὑὒὓὔὕὖὗὺύῠῡῢΰῦῧ"), + Eq("υυυυυυυυυυυυυυυυυυυυυυυυ")); + EXPECT_THAT( + normalizer_->NormalizeTerm("ὨὩὪὫὬὭὮὯᾨᾩᾪᾫᾬᾭᾮᾯῺΏῼὠὡὢὣὤὥὦὧὼώᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷ"), + Eq("ωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωω")); + EXPECT_THAT(normalizer_->NormalizeTerm("Ῥῤῥ"), Eq("ρρρ")); +} + // Accent / diacritic marks won't be removed in non-latin chars, e.g. in -// Japanese and Greek +// Japanese TEST_F(IcuNormalizerTest, NonLatinLetterNotRemoveAccent) { // Katakana EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド")); - // Greek - EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα")); - EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφή")); // Our current ICU rules can't handle Hebrew properly, e.g. the accents in // "אָלֶף־בֵּית עִבְרִי" @@ -287,6 +304,27 @@ TEST_F(IcuNormalizerTest, PrefixMatchLength) { term = "ÀĄḁáIcing"; match_end = normalizer->FindNormalizedMatchEndPosition(term, "aaaa"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ÀĄḁá")); + + // Greek accents + term = "άνθρωπος"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ανθ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("άνθ")); + + term = "καλημέρα"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "καλημε"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("καλημέ")); + + term = "όχι"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "οχ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("όχ")); + + term = "πότε"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ποτ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("πότ")); + + term = "ἈἉἊἋIcing"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "αααα"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ἈἉἊἋ")); } TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) { @@ -340,6 +378,27 @@ TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) { term = "BarışIcing"; match_end = normalizer->FindNormalizedMatchEndPosition(term, "barismdi"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Barış")); + + // Greek accents + term = "άνθρωπος"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ανθν"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("άνθ")); + + term = "καλημέρα"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "καλημεος"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("καλημέ")); + + term = "όχι"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "οχκα"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("όχ")); + + term = "πότε"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ποτρα"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("πότ")); + + term = "ἈἉἊἋIcing"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ααααmdi"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ἈἉἊἋ")); } } // namespace |