diff options
Diffstat (limited to 'icing/transform/icu/icu-normalizer_test.cc')
-rw-r--r-- | icing/transform/icu/icu-normalizer_test.cc | 79 |
1 files changed, 69 insertions, 10 deletions
diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc index 719f7be..0df23fc 100644 --- a/icing/transform/icu/icu-normalizer_test.cc +++ b/icing/transform/icu/icu-normalizer_test.cc @@ -83,14 +83,12 @@ TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) { Eq("eeeeeeeeeeeeeeeeeeeeeeeeeee")); EXPECT_THAT(normalizer_->NormalizeTerm("Ḟḟ"), Eq("ff")); EXPECT_THAT(normalizer_->NormalizeTerm("ĜĞĠĢḠḡĝğġģ"), Eq("gggggggggg")); - EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), - Eq("hhhhhhhhhhhhh")); + EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), Eq("hhhhhhhhhhhhh")); EXPECT_THAT(normalizer_->NormalizeTerm("ÌÍÎÏĨĪĬḬḭḯìíîïĩīĭ"), Eq("iiiiiiiiiiiiiiiii")); EXPECT_THAT(normalizer_->NormalizeTerm("Ĵĵ"), Eq("jj")); EXPECT_THAT(normalizer_->NormalizeTerm("ĶḰḲḴḵḱḳķ"), Eq("kkkkkkkk")); - EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), - Eq("lllllllllllll")); + EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), Eq("lllllllllllll")); EXPECT_THAT(normalizer_->NormalizeTerm("ḾṀṂḿṁṃ"), Eq("mmmmmm")); EXPECT_THAT(normalizer_->NormalizeTerm("ÑŃŅŇṄṆṈṊṅṇṉṋñńņň"), Eq("nnnnnnnnnnnnnnnn")); @@ -109,19 +107,38 @@ TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) { EXPECT_THAT(normalizer_->NormalizeTerm("ŴẀẂẄẆẈẁẃẅẇẉŵ"), Eq("wwwwwwwwwwww")); EXPECT_THAT(normalizer_->NormalizeTerm("ẊẌẋẍ"), Eq("xxxx")); EXPECT_THAT(normalizer_->NormalizeTerm("ÝŶŸẎẏŷýÿ"), Eq("yyyyyyyy")); - EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), - Eq("zzzzzzzzzzzz")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), Eq("zzzzzzzzzzzz")); EXPECT_THAT(normalizer_->NormalizeTerm("Barış"), Eq("baris")); } +TEST_F(IcuNormalizerTest, GreekLetterRemoveAccent) { + EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημερα")); + EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφη")); + EXPECT_THAT(normalizer_->NormalizeTerm( + "ἈἉἊἋἌἍἎἏᾈᾉᾊᾋᾌᾍᾎᾏᾸᾹᾺΆᾼἀἁἂἃἄἅἆἇὰάᾀᾁᾂᾃᾄᾅᾆᾇᾰᾱᾲᾳᾴᾶᾷ"), + Eq("αααααααααααααααααααααααααααααααααααααααααααααα")); + EXPECT_THAT(normalizer_->NormalizeTerm("ἘἙἚἛἜἝῈΈἐἑἒἓἔἕὲέ"), + Eq("εεεεεεεεεεεεεεεε")); + EXPECT_THAT( + normalizer_->NormalizeTerm("ἨἩἪἫἬἭἮἯᾘᾙᾚᾛᾜᾝᾞᾟῊΉῌἠἡἢἣἤἥἦἧὴήᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇ"), + Eq("ηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηη")); + EXPECT_THAT(normalizer_->NormalizeTerm("ἸἹἺἻἼἽἾἿῘῙῚΊἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗ"), + Eq("ιιιιιιιιιιιιιιιιιιιιιιιιιιιι")); + EXPECT_THAT(normalizer_->NormalizeTerm("ὈὉὊὋὌὍῸΌὀὁὂὃὄὅὸό"), + Eq("οοοοοοοοοοοοοοοο")); + EXPECT_THAT(normalizer_->NormalizeTerm("ὙὛὝὟῨῩῪΎὐὑὒὓὔὕὖὗὺύῠῡῢΰῦῧ"), + Eq("υυυυυυυυυυυυυυυυυυυυυυυυ")); + EXPECT_THAT( + normalizer_->NormalizeTerm("ὨὩὪὫὬὭὮὯᾨᾩᾪᾫᾬᾭᾮᾯῺΏῼὠὡὢὣὤὥὦὧὼώᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷ"), + Eq("ωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωω")); + EXPECT_THAT(normalizer_->NormalizeTerm("Ῥῤῥ"), Eq("ρρρ")); +} + // Accent / diacritic marks won't be removed in non-latin chars, e.g. in -// Japanese and Greek +// Japanese TEST_F(IcuNormalizerTest, NonLatinLetterNotRemoveAccent) { // Katakana EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド")); - // Greek - EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα")); - EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφή")); // Our current ICU rules can't handle Hebrew properly, e.g. the accents in // "אָלֶף־בֵּית עִבְרִי" @@ -287,6 +304,27 @@ TEST_F(IcuNormalizerTest, PrefixMatchLength) { term = "ÀĄḁáIcing"; match_end = normalizer->FindNormalizedMatchEndPosition(term, "aaaa"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ÀĄḁá")); + + // Greek accents + term = "άνθρωπος"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ανθ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("άνθ")); + + term = "καλημέρα"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "καλημε"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("καλημέ")); + + term = "όχι"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "οχ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("όχ")); + + term = "πότε"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ποτ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("πότ")); + + term = "ἈἉἊἋIcing"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "αααα"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ἈἉἊἋ")); } TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) { @@ -340,6 +378,27 @@ TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) { term = "BarışIcing"; match_end = normalizer->FindNormalizedMatchEndPosition(term, "barismdi"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Barış")); + + // Greek accents + term = "άνθρωπος"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ανθν"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("άνθ")); + + term = "καλημέρα"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "καλημεος"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("καλημέ")); + + term = "όχι"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "οχκα"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("όχ")); + + term = "πότε"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ποτρα"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("πότ")); + + term = "ἈἉἊἋIcing"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "ααααmdi"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ἈἉἊἋ")); } } // namespace |