about | summary | refs | log | tree | commit | diff
path: root/icing/transform
diff options
context:
space:
mode:
Diffstat (limited to 'icing/transform')
-rw-r--r-- icing/transform/icu/icu-normalizer.cc | 1
-rw-r--r-- icing/transform/icu/icu-normalizer.h | 3
-rw-r--r-- icing/transform/icu/icu-normalizer_benchmark.cc | 47
-rw-r--r-- icing/transform/icu/icu-normalizer_test.cc | 79
4 files changed, 111 insertions, 19 deletions
diff --git a/icing/transform/icu/icu-normalizer.cc b/icing/transform/icu/icu-normalizer.cc
index f32e541..58d4956 100644
--- a/icing/transform/icu/icu-normalizer.cc
+++ b/icing/transform/icu/icu-normalizer.cc
@@ -50,6 +50,7 @@ constexpr UChar kTransformRulesUtf16[] =
"Latin-ASCII; " // Map Latin characters to ASCII characters
"Hiragana-Katakana; " // Map hiragana to katakana
"[:Latin:] NFD; " // Decompose Latin letters
+ "[:Greek:] NFD; " // Decompose Greek letters
"[:Nonspacing Mark:] Remove; " // Remove accent / diacritic marks
"NFKC"; // Decompose and compose everything
diff --git a/icing/transform/icu/icu-normalizer.h b/icing/transform/icu/icu-normalizer.h
index 7c64506..f6f2b78 100644
--- a/icing/transform/icu/icu-normalizer.h
+++ b/icing/transform/icu/icu-normalizer.h
@@ -33,7 +33,8 @@ namespace lib {
// 2. Transforms full-width Latin characters to ASCII characters if possible.
// 3. Transforms hiragana to katakana.
// 4. Removes accent / diacritic marks on Latin characters
-// 5. Normalized text must be less than or equal to max_term_byte_size,
+// 5. Removes accent / diacritic marks on Greek characters
+// 6. Normalized text must be less than or equal to max_term_byte_size,
// otherwise it will be truncated.
//
// There're some other rules from ICU not listed here, please see .cc file for
diff --git a/icing/transform/icu/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc
index fe8289a..89d5f1e 100644
--- a/icing/transform/icu/icu-normalizer_benchmark.cc
+++ b/icing/transform/icu/icu-normalizer_benchmark.cc
@@ -39,8 +39,8 @@
// blaze-bin/icing/transform/icu/icu-normalizer_benchmark
// /data/local/tmp/
//
-// $ adb shell /data/local/tmp/icu-normalizer_benchmark --benchmark_filter=all
-// --adb
+// $ adb shell /data/local/tmp/icu-normalizer_benchmark
+// --benchmark_filter=all --adb
// Flag to tell the benchmark that it'll be run on an Android device via adb,
// the benchmark will set up data files accordingly.
@@ -61,7 +61,6 @@ void BM_NormalizeUppercase(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
normalizer_factory::Create(
-
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string(state.range(0), 'A');
@@ -95,7 +94,6 @@ void BM_NormalizeAccent(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
normalizer_factory::Create(
-
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string;
@@ -123,7 +121,7 @@ BENCHMARK(BM_NormalizeAccent)
->Arg(2048000)
->Arg(4096000);
-void BM_NormalizeHiragana(benchmark::State& state) {
+void BM_NormalizeGreekAccent(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
@@ -133,7 +131,43 @@ void BM_NormalizeHiragana(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string;
+ while (input_string.length() < state.range(0)) {
+ input_string.append("άὰᾶἀἄ");
+ }
+
+ for (auto _ : state) {
+ normalizer->NormalizeTerm(input_string);
+ }
+}
+BENCHMARK(BM_NormalizeGreekAccent)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_NormalizeHiragana(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string;
@@ -171,7 +205,6 @@ void BM_UppercaseSubTokenLength(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
normalizer_factory::Create(
-
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string(state.range(0), 'A');
@@ -207,7 +240,6 @@ void BM_AccentSubTokenLength(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
normalizer_factory::Create(
-
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string;
@@ -248,7 +280,6 @@ void BM_HiraganaSubTokenLength(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
normalizer_factory::Create(
-
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string;
diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc
index 719f7be..0df23fc 100644
--- a/icing/transform/icu/icu-normalizer_test.cc
+++ b/icing/transform/icu/icu-normalizer_test.cc
@@ -83,14 +83,12 @@ TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) {
Eq("eeeeeeeeeeeeeeeeeeeeeeeeeee"));
EXPECT_THAT(normalizer_->NormalizeTerm("Ḟḟ"), Eq("ff"));
EXPECT_THAT(normalizer_->NormalizeTerm("ĜĞĠĢḠḡĝğġģ"), Eq("gggggggggg"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"),
- Eq("hhhhhhhhhhhhh"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), Eq("hhhhhhhhhhhhh"));
EXPECT_THAT(normalizer_->NormalizeTerm("ÌÍÎÏĨĪĬḬḭḯìíîïĩīĭ"),
Eq("iiiiiiiiiiiiiiiii"));
EXPECT_THAT(normalizer_->NormalizeTerm("Ĵĵ"), Eq("jj"));
EXPECT_THAT(normalizer_->NormalizeTerm("ĶḰḲḴḵḱḳķ"), Eq("kkkkkkkk"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"),
- Eq("lllllllllllll"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), Eq("lllllllllllll"));
EXPECT_THAT(normalizer_->NormalizeTerm("ḾṀṂḿṁṃ"), Eq("mmmmmm"));
EXPECT_THAT(normalizer_->NormalizeTerm("ÑŃŅŇṄṆṈṊṅṇṉṋñńņň"),
Eq("nnnnnnnnnnnnnnnn"));
@@ -109,19 +107,38 @@ TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) {
EXPECT_THAT(normalizer_->NormalizeTerm("ŴẀẂẄẆẈẁẃẅẇẉŵ"), Eq("wwwwwwwwwwww"));
EXPECT_THAT(normalizer_->NormalizeTerm("ẊẌẋẍ"), Eq("xxxx"));
EXPECT_THAT(normalizer_->NormalizeTerm("ÝŶŸẎẏŷýÿ"), Eq("yyyyyyyy"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"),
- Eq("zzzzzzzzzzzz"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), Eq("zzzzzzzzzzzz"));
EXPECT_THAT(normalizer_->NormalizeTerm("Barış"), Eq("baris"));
}
+TEST_F(IcuNormalizerTest, GreekLetterRemoveAccent) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημερα"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφη"));
+ EXPECT_THAT(normalizer_->NormalizeTerm(
+ "ἈἉἊἋἌἍἎἏᾈᾉᾊᾋᾌᾍᾎᾏᾸᾹᾺΆᾼἀἁἂἃἄἅἆἇὰάᾀᾁᾂᾃᾄᾅᾆᾇᾰᾱᾲᾳᾴᾶᾷ"),
+ Eq("αααααααααααααααααααααααααααααααααααααααααααααα"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ἘἙἚἛἜἝῈΈἐἑἒἓἔἕὲέ"),
+ Eq("εεεεεεεεεεεεεεεε"));
+ EXPECT_THAT(
+ normalizer_->NormalizeTerm("ἨἩἪἫἬἭἮἯᾘᾙᾚᾛᾜᾝᾞᾟῊΉῌἠἡἢἣἤἥἦἧὴήᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇ"),
+ Eq("ηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηη"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ἸἹἺἻἼἽἾἿῘῙῚΊἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗ"),
+ Eq("ιιιιιιιιιιιιιιιιιιιιιιιιιιιι"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ὈὉὊὋὌὍῸΌὀὁὂὃὄὅὸό"),
+ Eq("οοοοοοοοοοοοοοοο"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ὙὛὝὟῨῩῪΎὐὑὒὓὔὕὖὗὺύῠῡῢΰῦῧ"),
+ Eq("υυυυυυυυυυυυυυυυυυυυυυυυ"));
+ EXPECT_THAT(
+ normalizer_->NormalizeTerm("ὨὩὪὫὬὭὮὯᾨᾩᾪᾫᾬᾭᾮᾯῺΏῼὠὡὢὣὤὥὦὧὼώᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷ"),
+ Eq("ωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωω"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Ῥῤῥ"), Eq("ρρρ"));
+}
+
// Accent / diacritic marks won't be removed in non-latin chars, e.g. in
-// Japanese and Greek
+// Japanese
TEST_F(IcuNormalizerTest, NonLatinLetterNotRemoveAccent) {
// Katakana
EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド"));
- // Greek
- EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα"));
- EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφή"));
// Our current ICU rules can't handle Hebrew properly, e.g. the accents in
// "אָלֶף־בֵּית עִבְרִי"
@@ -287,6 +304,27 @@ TEST_F(IcuNormalizerTest, PrefixMatchLength) {
term = "ÀĄḁáIcing";
match_end = normalizer->FindNormalizedMatchEndPosition(term, "aaaa");
EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ÀĄḁá"));
+
+ // Greek accents
+ term = "άνθρωπος";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ανθ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("άνθ"));
+
+ term = "καλημέρα";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "καλημε");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("καλημέ"));
+
+ term = "όχι";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "οχ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("όχ"));
+
+ term = "πότε";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ποτ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("πότ"));
+
+ term = "ἈἉἊἋIcing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "αααα");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ἈἉἊἋ"));
}
TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) {
@@ -340,6 +378,27 @@ TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) {
term = "BarışIcing";
match_end = normalizer->FindNormalizedMatchEndPosition(term, "barismdi");
EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Barış"));
+
+ // Greek accents
+ term = "άνθρωπος";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ανθν");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("άνθ"));
+
+ term = "καλημέρα";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "καλημεος");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("καλημέ"));
+
+ term = "όχι";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "οχκα");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("όχ"));
+
+ term = "πότε";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ποτρα");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("πότ"));
+
+ term = "ἈἉἊἋIcing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ααααmdi");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ἈἉἊἋ"));
}
} // namespace