diff options
Diffstat (limited to 'icing/result/snippet-retriever_test.cc')
-rw-r--r-- | icing/result/snippet-retriever_test.cc | 203 |
1 files changed, 30 insertions, 173 deletions
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc index 0de2295..e7988ae 100644 --- a/icing/result/snippet-retriever_test.cc +++ b/icing/result/snippet-retriever_test.cc @@ -22,6 +22,7 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -36,14 +37,12 @@ #include "icing/store/key-mapper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" -#include "icing/transform/map/map-normalizer.h" #include "icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" #include "unicode/uloc.h" @@ -58,18 +57,16 @@ using ::testing::Eq; using ::testing::IsEmpty; using ::testing::SizeIs; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = - PropertyConfigProto::Cardinality::REPEATED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = + PropertyConfigProto_Cardinality_Code_REPEATED; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM = - StringIndexingConfig::TokenizerType::VERBATIM; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) { std::vector<std::string_view> paths; @@ -133,7 +130,7 @@ class SnippetRetrieverTest : public testing::Test { snippet_spec_.set_num_to_snippet(std::numeric_limits<int32_t>::max()); snippet_spec_.set_num_matches_per_property( std::numeric_limits<int32_t>::max()); - snippet_spec_.set_max_window_utf32_length(64); + snippet_spec_.set_max_window_bytes(64); } void TearDown() override { @@ -180,7 +177,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) { // Window starts at the beginning of "three" and ends in the middle of // "three". len=4, orig_window= "thre" - snippet_spec_.set_max_window_utf32_length(4); + snippet_spec_.set_max_window_bytes(4); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -206,7 +203,7 @@ TEST_F(SnippetRetrieverTest, // Window starts at the beginning of "three" and at the exact end of // "three". len=5, orig_window= "three" - snippet_spec_.set_max_window_utf32_length(5); + snippet_spec_.set_max_window_bytes(5); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -232,7 +229,7 @@ TEST_F(SnippetRetrieverTest, // Window starts at the beginning of "four" and at the exact end of // "four". len=4, orig_window= "four" - snippet_spec_.set_max_window_utf32_length(4); + snippet_spec_.set_max_window_bytes(4); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -264,7 +261,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) { // 1. untrimmed, no-shifting window will be (2,17). // 2. trimmed, no-shifting window [4,13) "two three" // 3. trimmed, shifted window [4,18) "two three four" - snippet_spec_.set_max_window_utf32_length(14); + snippet_spec_.set_max_window_bytes(14); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -297,7 +294,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) { // 1. untrimmed, no-shifting window will be (1,18). // 2. trimmed, no-shifting window [4,18) "two three four" // 3. trimmed, shifted window [4,20) "two three four.." - snippet_spec_.set_max_window_utf32_length(16); + snippet_spec_.set_max_window_bytes(16); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -323,7 +320,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) { // Window ends in the middle of all the punctuation and window starts at 0. // len=20, orig_window="one two three four.." - snippet_spec_.set_max_window_utf32_length(20); + snippet_spec_.set_max_window_bytes(20); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -351,7 +348,7 @@ TEST_F(SnippetRetrieverTest, // Window ends in the middle of all the punctuation and window starts at 0. // len=26, orig_window="pside down in Australia¿" - snippet_spec_.set_max_window_utf32_length(24); + snippet_spec_.set_max_window_bytes(24); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -379,7 +376,7 @@ TEST_F(SnippetRetrieverTest, // Window ends in the middle of all the punctuation and window starts at 0. // len=26, orig_window="upside down in Australia¿ " - snippet_spec_.set_max_window_utf32_length(26); + snippet_spec_.set_max_window_bytes(26); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -412,7 +409,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) { // 1. untrimmed, no-shifting window will be (-2,21). // 2. trimmed, no-shifting window [0,21) "one two three four..." // 3. trimmed, shifted window [0,22) "one two three four...." - snippet_spec_.set_max_window_utf32_length(22); + snippet_spec_.set_max_window_bytes(22); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -438,7 +435,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) { // Window ends before "five" but after all the punctuation // len=26, orig_window="one two three four.... " - snippet_spec_.set_max_window_utf32_length(26); + snippet_spec_.set_max_window_bytes(26); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -471,7 +468,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) { // 1. untrimmed, no-shifting window will be ((-7,26). // 2. trimmed, no-shifting window [0,26) "one two three four...." // 3. trimmed, shifted window [0,27) "one two three four.... five" - snippet_spec_.set_max_window_utf32_length(32); + snippet_spec_.set_max_window_bytes(32); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -497,7 +494,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) { // Max window size equals the size of the value. // len=34, orig_window="one two three four.... five" - snippet_spec_.set_max_window_utf32_length(34); + snippet_spec_.set_max_window_bytes(34); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -523,7 +520,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) { // Max window size exceeds the size of the value. // len=36, orig_window="one two three four.... five" - snippet_spec_.set_max_window_utf32_length(36); + snippet_spec_.set_max_window_bytes(36); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -557,7 +554,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) { // 1. untrimmed, no-shifting window will be (-10,19). // 2. trimmed, no-shifting window [0,19) "one two three four." // 3. trimmed, shifted window [0,27) "one two three four.... five" - snippet_spec_.set_max_window_utf32_length(28); + snippet_spec_.set_max_window_bytes(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -591,7 +588,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) { // 1. untrimmed, no-shifting window will be (10,39). // 2. trimmed, no-shifting window [14,31) "four.... five six" // 3. trimmed, shifted window [4,31) "two three four.... five six" - snippet_spec_.set_max_window_utf32_length(28); + snippet_spec_.set_max_window_bytes(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -625,7 +622,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) { // 1. untrimmed, no-shifting window will be (-10,19). // 2. trimmed, no-shifting window [0, 19) "one two three four." // 3. trimmed, shifted window [0, 22) "one two three four...." - snippet_spec_.set_max_window_utf32_length(28); + snippet_spec_.set_max_window_bytes(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -659,7 +656,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) { // 1. untrimmed, no-shifting window will be (1,30). // 2. trimmed, no-shifting window [4, 22) "two three four...." // 3. trimmed, shifted window [0, 22) "one two three four...." - snippet_spec_.set_max_window_utf32_length(28); + snippet_spec_.set_max_window_bytes(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -693,7 +690,6 @@ TEST_F(SnippetRetrieverTest, PrefixSnippeting) { EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("subject foo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("f")); } TEST_F(SnippetRetrieverTest, ExactSnippeting) { @@ -723,7 +719,7 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) { .AddStringProperty("body", "Only a fool would match this content.") .Build(); - snippet_spec_.set_max_window_utf32_length(0); + snippet_spec_.set_max_window_bytes(0); SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}}; @@ -737,7 +733,6 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo")); } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) { @@ -784,15 +779,12 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) { "we need to begin considering our options regarding body bar.")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo", "bar")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("foo", "bar")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("subject foo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo")); } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) { @@ -842,8 +834,6 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) { "we need to begin considering our options regarding body bar.")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo", "bar")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("foo", "bar")); } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) { @@ -894,16 +884,12 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) { "Concerning the subject of foo, we need to begin considering our")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("subject", "foo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("subject", "foo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("subject foo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("subject")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), - ElementsAre("subject")); } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) { @@ -947,14 +933,12 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) { ElementsAre( "Concerning the subject of foo, we need to begin considering our")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("subject foo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo")); } TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) { @@ -976,7 +960,6 @@ TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("MDI team")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("MDI")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("MD")); } TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) { @@ -1000,9 +983,6 @@ TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) { EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("Some members are in Zürich.")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("Zürich")); - - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("Zürich")); } TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) { @@ -1063,13 +1043,11 @@ TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("X[3]")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetPropertyPaths(snippet), ElementsAre("X[1]", "X[3]", "Y[1]", "Y[3]", "Z[1]", "Z[3]")); @@ -1166,13 +1144,11 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("A.X[3]")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT( GetPropertyPaths(snippet), @@ -1275,13 +1251,11 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[0].X[3]")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetPropertyPaths(snippet), ElementsAre("A[0].X[1]", "A[0].X[3]", "A[1].X[1]", "A[1].X[3]", @@ -1382,13 +1356,11 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[1].X")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT( GetPropertyPaths(snippet), @@ -1432,12 +1404,10 @@ TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) { // Ensure that the match is correct. EXPECT_THAT(GetMatches(content, *entry), ElementsAre("走路")); - EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("走")); // Ensure that the utf-16 values are also as expected EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3)); EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2)); - EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(1)); } TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) { @@ -1475,7 +1445,7 @@ TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) { // 1. untrimmed, no-shifting window will be (0,7). // 2. trimmed, no-shifting window [1, 6) "每天走路去". // 3. trimmed, shifted window [0, 6) "我每天走路去" - snippet_spec_.set_max_window_utf32_length(6); + snippet_spec_.set_max_window_bytes(6); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); @@ -1537,12 +1507,10 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) { // Ensure that the match is correct. EXPECT_THAT(GetMatches(content, *entry), ElementsAre("𐀂𐀃")); - EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("𐀂")); // Ensure that the utf-16 values are also as expected EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5)); EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4)); - EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2)); } TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) { @@ -1574,7 +1542,7 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) { // UTF8 idx: 9 22 // UTF16 idx: 5 12 // UTF32 idx: 3 7 - snippet_spec_.set_max_window_utf32_length(6); + snippet_spec_.set_max_window_bytes(6); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); @@ -1598,117 +1566,6 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) { EXPECT_THAT(match_proto.window_utf16_length(), Eq(7)); } -TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) { - SchemaProto schema = - SchemaBuilder() - .AddType(SchemaTypeConfigBuilder() - .SetType("verbatimType") - .AddProperty(PropertyConfigBuilder() - .SetName("verbatim") - .SetDataTypeString(MATCH_EXACT, - TOKENIZER_VERBATIM) - .SetCardinality(CARDINALITY_REPEATED))) - .Build(); - ICING_ASSERT_OK(schema_store_->SetSchema( - schema, /*ignore_errors_and_delete_documents=*/true)); - ICING_ASSERT_OK_AND_ASSIGN( - snippet_retriever_, - SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), - normalizer_.get())); - - DocumentProto document = DocumentBuilder() - .SetKey("icing", "verbatim/1") - .SetSchema("verbatimType") - .AddStringProperty("verbatim", "Hello, world!") - .Build(); - - SectionIdMask section_mask = 0b00000001; - SectionRestrictQueryTermsMap query_terms{{"", {"Hello, world!"}}}; - - snippet_spec_.set_max_window_utf32_length(13); - SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); - - // There should only be one snippet entry and match, the verbatim token in its - // entirety. - ASSERT_THAT(snippet.entries(), SizeIs(1)); - - const SnippetProto::EntryProto* entry = &snippet.entries(0); - ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); - ASSERT_THAT(entry->property_name(), "verbatim"); - - const SnippetMatchProto& match_proto = entry->snippet_matches(0); - // We expect the match to begin at position 0, and to span the entire token - // which contains 13 characters. - EXPECT_THAT(match_proto.window_byte_position(), Eq(0)); - EXPECT_THAT(match_proto.window_utf16_length(), Eq(13)); - - // We expect the submatch to begin at position 0 of the verbatim token and - // span the length of our query term "Hello, world!", which has utf-16 length - // of 13. The submatch length is equal to the window length as the query the - // snippet is retrieved with an exact term match. - EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0)); - EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(13)); -} - -TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) { - SchemaProto schema = - SchemaBuilder() - .AddType(SchemaTypeConfigBuilder() - .SetType("verbatimType") - .AddProperty(PropertyConfigBuilder() - .SetName("verbatim") - .SetDataTypeString(MATCH_PREFIX, - TOKENIZER_VERBATIM) - .SetCardinality(CARDINALITY_REPEATED))) - .Build(); - ICING_ASSERT_OK(schema_store_->SetSchema( - schema, /*ignore_errors_and_delete_documents=*/true)); - ICING_ASSERT_OK_AND_ASSIGN( - snippet_retriever_, - SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), - normalizer_.get())); - - // String: "我每天走路去上班。" - // ^ ^ ^ ^^ - // UTF8 idx: 0 3 9 15 18 - // UTF16 idx: 0 1 3 5 6 - // UTF32 idx: 0 1 3 5 6 - // Breaks into segments: "我", "每天", "走路", "去", "上班" - std::string chinese_string = "我每天走路去上班。"; - DocumentProto document = DocumentBuilder() - .SetKey("icing", "verbatim/1") - .SetSchema("verbatimType") - .AddStringProperty("verbatim", chinese_string) - .Build(); - - SectionIdMask section_mask = 0b00000001; - SectionRestrictQueryTermsMap query_terms{{"", {"我每"}}}; - - snippet_spec_.set_max_window_utf32_length(9); - SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); - - // There should only be one snippet entry and match, the verbatim token in its - // entirety. - ASSERT_THAT(snippet.entries(), SizeIs(1)); - - const SnippetProto::EntryProto* entry = &snippet.entries(0); - ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); - ASSERT_THAT(entry->property_name(), "verbatim"); - - const SnippetMatchProto& match_proto = entry->snippet_matches(0); - // We expect the match to begin at position 0, and to span the entire token - // which has utf-16 length of 9. - EXPECT_THAT(match_proto.window_byte_position(), Eq(0)); - EXPECT_THAT(match_proto.window_utf16_length(), Eq(9)); - - // We expect the submatch to begin at position 0 of the verbatim token and - // span the length of our query term "我每", which has utf-16 length of 2. - EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0)); - EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2)); -} - } // namespace } // namespace lib |