diff options
Diffstat (limited to 'icing/result')
-rw-r--r-- | icing/result/result-retriever_test.cc | 20 | ||||
-rw-r--r-- | icing/result/result-state-manager_test.cc | 4 | ||||
-rw-r--r-- | icing/result/result-state_test.cc | 4 | ||||
-rw-r--r-- | icing/result/snippet-retriever.cc | 94 | ||||
-rw-r--r-- | icing/result/snippet-retriever_test.cc | 173 |
5 files changed, 237 insertions, 58 deletions
diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc index 1c9684d..0d812e4 100644 --- a/icing/result/result-retriever_test.cc +++ b/icing/result/result-retriever_test.cc @@ -22,7 +22,6 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -36,6 +35,7 @@ #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -55,14 +55,14 @@ using ::testing::IsEmpty; using ::testing::Return; using ::testing::SizeIs; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; -constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; -constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; +constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; +constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; class ResultRetrieverTest : public testing::Test { protected: @@ -160,7 +160,7 @@ ResultSpecProto::SnippetSpecProto CreateSnippetSpec() { ResultSpecProto::SnippetSpecProto snippet_spec; snippet_spec.set_num_to_snippet(std::numeric_limits<int>::max()); snippet_spec.set_num_matches_per_property(std::numeric_limits<int>::max()); - snippet_spec.set_max_window_bytes(1024); + snippet_spec.set_max_window_utf32_length(1024); return snippet_spec; } @@ -362,8 +362,8 @@ TEST_F(ResultRetrieverTest, NotIgnoreErrors) { TEST_F(ResultRetrieverTest, IOErrorShouldReturnInternalError) { MockFilesystem mock_filesystem; - ON_CALL(mock_filesystem, OpenForRead(_)).WillByDefault(Return(false)); - + ON_CALL(mock_filesystem, PRead(A<int>(), A<void*>(), A<size_t>(), A<off_t>())) + .WillByDefault(Return(false)); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&mock_filesystem, test_dir_, &fake_clock_, diff --git a/icing/result/result-state-manager_test.cc b/icing/result/result-state-manager_test.cc index 32e45aa..8a9005d 100644 --- a/icing/result/result-state-manager_test.cc +++ b/icing/result/result-state-manager_test.cc @@ -849,7 +849,7 @@ TEST_F(ResultStateManagerTest, ShouldGetSnippetContext) { ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1); result_spec.mutable_snippet_spec()->set_num_to_snippet(5); result_spec.mutable_snippet_spec()->set_num_matches_per_property(5); - result_spec.mutable_snippet_spec()->set_max_window_bytes(5); + result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5); SearchSpecProto search_spec; search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); @@ -884,7 +884,7 @@ TEST_F(ResultStateManagerTest, ShouldGetDefaultSnippetContext) { // 0 indicates no snippeting result_spec.mutable_snippet_spec()->set_num_to_snippet(0); result_spec.mutable_snippet_spec()->set_num_matches_per_property(0); - result_spec.mutable_snippet_spec()->set_max_window_bytes(0); + result_spec.mutable_snippet_spec()->set_max_window_utf32_length(0); SearchSpecProto search_spec; search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); diff --git a/icing/result/result-state_test.cc b/icing/result/result-state_test.cc index f2121a5..d92fcfa 100644 --- a/icing/result/result-state_test.cc +++ b/icing/result/result-state_test.cc @@ -143,7 +143,7 @@ TEST_F(ResultStateTest, ShouldReturnSnippetContextAccordingToSpecs) { ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2); result_spec.mutable_snippet_spec()->set_num_to_snippet(5); result_spec.mutable_snippet_spec()->set_num_matches_per_property(5); - result_spec.mutable_snippet_spec()->set_max_window_bytes(5); + result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5); SectionRestrictQueryTermsMap query_terms_map; query_terms_map.emplace("term1", std::unordered_set<std::string>()); @@ -178,7 +178,7 @@ TEST_F(ResultStateTest, NoSnippetingShouldReturnNull) { // stored. result_spec.mutable_snippet_spec()->set_num_to_snippet(0); result_spec.mutable_snippet_spec()->set_num_matches_per_property(5); - result_spec.mutable_snippet_spec()->set_max_window_bytes(5); + result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5); SectionRestrictQueryTermsMap query_terms_map; query_terms_map.emplace("term1", std::unordered_set<std::string>()); diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc index c46762e..bd1524e 100644 --- a/icing/result/snippet-retriever.cc +++ b/icing/result/snippet-retriever.cc @@ -41,6 +41,7 @@ #include "icing/transform/normalizer.h" #include "icing/util/character-iterator.h" #include "icing/util/i18n-utils.h" +#include "icing/util/logging.h" #include "icing/util/status-macros.h" namespace icing { @@ -75,6 +76,67 @@ inline std::string AddIndexToPath(int values_size, int index, kRBracket); } +// Returns a string of the normalized text of the input Token. Normalization +// is applied based on the Token's type. +std::string NormalizeToken(const Normalizer& normalizer, const Token& token) { + switch (token.type) { + case Token::Type::REGULAR: + return normalizer.NormalizeTerm(token.text); + case Token::Type::VERBATIM: + return std::string(token.text); + case Token::Type::QUERY_EXCLUSION: + [[fallthrough]]; + case Token::Type::QUERY_LEFT_PARENTHESES: + [[fallthrough]]; + case Token::Type::QUERY_RIGHT_PARENTHESES: + [[fallthrough]]; + case Token::Type::QUERY_OR: + [[fallthrough]]; + case Token::Type::QUERY_PROPERTY: + [[fallthrough]]; + case Token::Type::INVALID: + ICING_LOG(WARNING) << "Unable to normalize token of type: " + << static_cast<int>(token.type); + return std::string(token.text); + } +} + +// Returns a CharacterIterator for token's text, advancing one past the last +// matching character from the query term. +CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token, + const std::string& match_query_term) { + switch (token.type) { + case Token::Type::VERBATIM: { + // VERBATIM tokens are not normalized. This means the non-normalized + // matched query term must be either equal to or a prefix of the token's + // text. Therefore, the match must end at the end of the matched query + // term. + CharacterIterator verbatim_match_end = + CharacterIterator(token.text, 0, 0, 0); + verbatim_match_end.AdvanceToUtf8(match_query_term.length()); + return verbatim_match_end; + } + case Token::Type::QUERY_EXCLUSION: + [[fallthrough]]; + case Token::Type::QUERY_LEFT_PARENTHESES: + [[fallthrough]]; + case Token::Type::QUERY_RIGHT_PARENTHESES: + [[fallthrough]]; + case Token::Type::QUERY_OR: + [[fallthrough]]; + case Token::Type::QUERY_PROPERTY: + [[fallthrough]]; + case Token::Type::INVALID: + ICING_LOG(WARNING) + << "Unexpected Token type " << static_cast<int>(token.type) + << " found when finding match end of query term and token."; + [[fallthrough]]; + case Token::Type::REGULAR: + return normalizer.FindNormalizedMatchEndPosition(token.text, + match_query_term); + } +} + class TokenMatcher { public: virtual ~TokenMatcher() = default; @@ -102,15 +164,16 @@ class TokenMatcherExact : public TokenMatcher { normalizer_(normalizer) {} CharacterIterator Matches(Token token) const override { - std::string s = normalizer_.NormalizeTerm(token.text); + std::string s = NormalizeToken(normalizer_, token); auto itr = unrestricted_query_terms_.find(s); if (itr == unrestricted_query_terms_.end()) { itr = restricted_query_terms_.find(s); } if (itr != unrestricted_query_terms_.end() && itr != restricted_query_terms_.end()) { - return normalizer_.FindNormalizedMatchEndPosition(token.text, *itr); + return FindMatchEnd(normalizer_, token, *itr); } + return CharacterIterator(token.text, -1, -1, -1); } @@ -131,19 +194,17 @@ class TokenMatcherPrefix : public TokenMatcher { normalizer_(normalizer) {} CharacterIterator Matches(Token token) const override { - std::string s = normalizer_.NormalizeTerm(token.text); + std::string s = NormalizeToken(normalizer_, token); for (const std::string& query_term : unrestricted_query_terms_) { if (query_term.length() <= s.length() && s.compare(0, query_term.length(), query_term) == 0) { - return normalizer_.FindNormalizedMatchEndPosition(token.text, - query_term); + return FindMatchEnd(normalizer_, token, query_term); } } for (const std::string& query_term : restricted_query_terms_) { if (query_term.length() <= s.length() && s.compare(0, query_term.length(), query_term) == 0) { - return normalizer_.FindNormalizedMatchEndPosition(token.text, - query_term); + return FindMatchEnd(normalizer_, token, query_term); } } return CharacterIterator(token.text, -1, -1, -1); @@ -184,7 +245,7 @@ libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart( const ResultSpecProto::SnippetSpecProto& snippet_spec, std::string_view value, int window_start_min_exclusive_utf32, Tokenizer::Iterator* iterator) { - if (!iterator->ResetToTokenAfter(window_start_min_exclusive_utf32)) { + if (!iterator->ResetToTokenStartingAfter(window_start_min_exclusive_utf32)) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); } @@ -219,7 +280,7 @@ libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowEnd( const ResultSpecProto::SnippetSpecProto& snippet_spec, std::string_view value, int window_end_max_exclusive_utf32, Tokenizer::Iterator* iterator) { - if (!iterator->ResetToTokenBefore(window_end_max_exclusive_utf32)) { + if (!iterator->ResetToTokenEndingBefore(window_end_max_exclusive_utf32)) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); } @@ -283,9 +344,9 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch( int match_len_utf32 = end_itr.utf32_index() - match_pos_utf32; int match_mid_utf32 = match_pos_utf32 + match_len_utf32 / 2; int window_start_min_exclusive_utf32 = - (match_mid_utf32 - snippet_spec.max_window_bytes() / 2) - 1; + (match_mid_utf32 - snippet_spec.max_window_utf32_length() / 2) - 1; int window_end_max_exclusive_utf32 = - match_mid_utf32 + (snippet_spec.max_window_bytes() + 1) / 2; + match_mid_utf32 + (snippet_spec.max_window_utf32_length() + 1) / 2; snippet_match.set_exact_match_byte_position(start_itr.utf8_index()); snippet_match.set_exact_match_utf16_position(start_itr.utf16_index()); @@ -296,7 +357,7 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch( // Only include windows if it'll at least include the matched text. Otherwise, // it'll just be an empty string anyways. - if (snippet_spec.max_window_bytes() >= match_len_utf32) { + if (snippet_spec.max_window_utf32_length() >= match_len_utf32) { // Find the beginning of the window. ICING_ASSIGN_OR_RETURN( CharacterIterator window_start, @@ -337,8 +398,13 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch( // DetermineWindowStart/End may change the position of the iterator. So, // reset the iterator back to the original position. - bool success = (match_pos_utf32 > 0) ? iterator->ResetToTokenAfter(match_pos_utf32 - 1) - : iterator->ResetToStart(); + bool success = false; + if (match_pos_utf32 > 0) { + success = iterator->ResetToTokenStartingAfter(match_pos_utf32 - 1); + } else { + success = iterator->ResetToStart(); + } + if (!success) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc index f811941..0de2295 100644 --- a/icing/result/snippet-retriever_test.cc +++ b/icing/result/snippet-retriever_test.cc @@ -22,7 +22,6 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -37,6 +36,7 @@ #include "icing/store/key-mapper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" @@ -58,16 +58,18 @@ using ::testing::Eq; using ::testing::IsEmpty; using ::testing::SizeIs; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = - PropertyConfigProto_Cardinality_Code_REPEATED; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = + PropertyConfigProto::Cardinality::REPEATED; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM = + StringIndexingConfig::TokenizerType::VERBATIM; -constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; -constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; +constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; +constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) { std::vector<std::string_view> paths; @@ -131,7 +133,7 @@ class SnippetRetrieverTest : public testing::Test { snippet_spec_.set_num_to_snippet(std::numeric_limits<int32_t>::max()); snippet_spec_.set_num_matches_per_property( std::numeric_limits<int32_t>::max()); - snippet_spec_.set_max_window_bytes(64); + snippet_spec_.set_max_window_utf32_length(64); } void TearDown() override { @@ -178,7 +180,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) { // Window starts at the beginning of "three" and ends in the middle of // "three". len=4, orig_window= "thre" - snippet_spec_.set_max_window_bytes(4); + snippet_spec_.set_max_window_utf32_length(4); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -204,7 +206,7 @@ TEST_F(SnippetRetrieverTest, // Window starts at the beginning of "three" and at the exact end of // "three". len=5, orig_window= "three" - snippet_spec_.set_max_window_bytes(5); + snippet_spec_.set_max_window_utf32_length(5); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -230,7 +232,7 @@ TEST_F(SnippetRetrieverTest, // Window starts at the beginning of "four" and at the exact end of // "four". len=4, orig_window= "four" - snippet_spec_.set_max_window_bytes(4); + snippet_spec_.set_max_window_utf32_length(4); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -262,7 +264,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) { // 1. untrimmed, no-shifting window will be (2,17). // 2. trimmed, no-shifting window [4,13) "two three" // 3. trimmed, shifted window [4,18) "two three four" - snippet_spec_.set_max_window_bytes(14); + snippet_spec_.set_max_window_utf32_length(14); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -295,7 +297,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) { // 1. untrimmed, no-shifting window will be (1,18). // 2. trimmed, no-shifting window [4,18) "two three four" // 3. trimmed, shifted window [4,20) "two three four.." - snippet_spec_.set_max_window_bytes(16); + snippet_spec_.set_max_window_utf32_length(16); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -321,7 +323,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) { // Window ends in the middle of all the punctuation and window starts at 0. // len=20, orig_window="one two three four.." - snippet_spec_.set_max_window_bytes(20); + snippet_spec_.set_max_window_utf32_length(20); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -349,7 +351,7 @@ TEST_F(SnippetRetrieverTest, // Window ends in the middle of all the punctuation and window starts at 0. // len=26, orig_window="pside down in Australia¿" - snippet_spec_.set_max_window_bytes(24); + snippet_spec_.set_max_window_utf32_length(24); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -377,7 +379,7 @@ TEST_F(SnippetRetrieverTest, // Window ends in the middle of all the punctuation and window starts at 0. // len=26, orig_window="upside down in Australia¿ " - snippet_spec_.set_max_window_bytes(26); + snippet_spec_.set_max_window_utf32_length(26); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -410,7 +412,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) { // 1. untrimmed, no-shifting window will be (-2,21). // 2. trimmed, no-shifting window [0,21) "one two three four..." // 3. trimmed, shifted window [0,22) "one two three four...." - snippet_spec_.set_max_window_bytes(22); + snippet_spec_.set_max_window_utf32_length(22); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -436,7 +438,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) { // Window ends before "five" but after all the punctuation // len=26, orig_window="one two three four.... " - snippet_spec_.set_max_window_bytes(26); + snippet_spec_.set_max_window_utf32_length(26); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -469,7 +471,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) { // 1. untrimmed, no-shifting window will be ((-7,26). // 2. trimmed, no-shifting window [0,26) "one two three four...." // 3. trimmed, shifted window [0,27) "one two three four.... five" - snippet_spec_.set_max_window_bytes(32); + snippet_spec_.set_max_window_utf32_length(32); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -495,7 +497,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) { // Max window size equals the size of the value. // len=34, orig_window="one two three four.... five" - snippet_spec_.set_max_window_bytes(34); + snippet_spec_.set_max_window_utf32_length(34); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -521,7 +523,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) { // Max window size exceeds the size of the value. // len=36, orig_window="one two three four.... five" - snippet_spec_.set_max_window_bytes(36); + snippet_spec_.set_max_window_utf32_length(36); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -555,7 +557,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) { // 1. untrimmed, no-shifting window will be (-10,19). // 2. trimmed, no-shifting window [0,19) "one two three four." // 3. trimmed, shifted window [0,27) "one two three four.... five" - snippet_spec_.set_max_window_bytes(28); + snippet_spec_.set_max_window_utf32_length(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -589,7 +591,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) { // 1. untrimmed, no-shifting window will be (10,39). // 2. trimmed, no-shifting window [14,31) "four.... five six" // 3. trimmed, shifted window [4,31) "two three four.... five six" - snippet_spec_.set_max_window_bytes(28); + snippet_spec_.set_max_window_utf32_length(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -623,7 +625,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) { // 1. untrimmed, no-shifting window will be (-10,19). // 2. trimmed, no-shifting window [0, 19) "one two three four." // 3. trimmed, shifted window [0, 22) "one two three four...." - snippet_spec_.set_max_window_bytes(28); + snippet_spec_.set_max_window_utf32_length(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -657,7 +659,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) { // 1. untrimmed, no-shifting window will be (1,30). // 2. trimmed, no-shifting window [4, 22) "two three four...." // 3. trimmed, shifted window [0, 22) "one two three four...." - snippet_spec_.set_max_window_bytes(28); + snippet_spec_.set_max_window_utf32_length(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -721,7 +723,7 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) { .AddStringProperty("body", "Only a fool would match this content.") .Build(); - snippet_spec_.set_max_window_bytes(0); + snippet_spec_.set_max_window_utf32_length(0); SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}}; @@ -1473,7 +1475,7 @@ TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) { // 1. untrimmed, no-shifting window will be (0,7). // 2. trimmed, no-shifting window [1, 6) "每天走路去". // 3. trimmed, shifted window [0, 6) "我每天走路去" - snippet_spec_.set_max_window_bytes(6); + snippet_spec_.set_max_window_utf32_length(6); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); @@ -1572,7 +1574,7 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) { // UTF8 idx: 9 22 // UTF16 idx: 5 12 // UTF32 idx: 3 7 - snippet_spec_.set_max_window_bytes(6); + snippet_spec_.set_max_window_utf32_length(6); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); @@ -1596,6 +1598,117 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) { EXPECT_THAT(match_proto.window_utf16_length(), Eq(7)); } +TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("verbatimType") + .AddProperty(PropertyConfigBuilder() + .SetName("verbatim") + .SetDataTypeString(MATCH_EXACT, + TOKENIZER_VERBATIM) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/true)); + ICING_ASSERT_OK_AND_ASSIGN( + snippet_retriever_, + SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), + normalizer_.get())); + + DocumentProto document = DocumentBuilder() + .SetKey("icing", "verbatim/1") + .SetSchema("verbatimType") + .AddStringProperty("verbatim", "Hello, world!") + .Build(); + + SectionIdMask section_mask = 0b00000001; + SectionRestrictQueryTermsMap query_terms{{"", {"Hello, world!"}}}; + + snippet_spec_.set_max_window_utf32_length(13); + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + + // There should only be one snippet entry and match, the verbatim token in its + // entirety. + ASSERT_THAT(snippet.entries(), SizeIs(1)); + + const SnippetProto::EntryProto* entry = &snippet.entries(0); + ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); + ASSERT_THAT(entry->property_name(), "verbatim"); + + const SnippetMatchProto& match_proto = entry->snippet_matches(0); + // We expect the match to begin at position 0, and to span the entire token + // which contains 13 characters. + EXPECT_THAT(match_proto.window_byte_position(), Eq(0)); + EXPECT_THAT(match_proto.window_utf16_length(), Eq(13)); + + // We expect the submatch to begin at position 0 of the verbatim token and + // span the length of our query term "Hello, world!", which has utf-16 length + // of 13. The submatch length is equal to the window length as the query the + // snippet is retrieved with an exact term match. + EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0)); + EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(13)); +} + +TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("verbatimType") + .AddProperty(PropertyConfigBuilder() + .SetName("verbatim") + .SetDataTypeString(MATCH_PREFIX, + TOKENIZER_VERBATIM) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/true)); + ICING_ASSERT_OK_AND_ASSIGN( + snippet_retriever_, + SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), + normalizer_.get())); + + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // UTF8 idx: 0 3 9 15 18 + // UTF16 idx: 0 1 3 5 6 + // UTF32 idx: 0 1 3 5 6 + // Breaks into segments: "我", "每天", "走路", "去", "上班" + std::string chinese_string = "我每天走路去上班。"; + DocumentProto document = DocumentBuilder() + .SetKey("icing", "verbatim/1") + .SetSchema("verbatimType") + .AddStringProperty("verbatim", chinese_string) + .Build(); + + SectionIdMask section_mask = 0b00000001; + SectionRestrictQueryTermsMap query_terms{{"", {"我每"}}}; + + snippet_spec_.set_max_window_utf32_length(9); + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + + // There should only be one snippet entry and match, the verbatim token in its + // entirety. + ASSERT_THAT(snippet.entries(), SizeIs(1)); + + const SnippetProto::EntryProto* entry = &snippet.entries(0); + ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); + ASSERT_THAT(entry->property_name(), "verbatim"); + + const SnippetMatchProto& match_proto = entry->snippet_matches(0); + // We expect the match to begin at position 0, and to span the entire token + // which has utf-16 length of 9. + EXPECT_THAT(match_proto.window_byte_position(), Eq(0)); + EXPECT_THAT(match_proto.window_utf16_length(), Eq(9)); + + // We expect the submatch to begin at position 0 of the verbatim token and + // span the length of our query term "我每", which has utf-16 length of 2. + EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0)); + EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2)); +} + } // namespace } // namespace lib |