diff options
author | Tim Barron <tjbarron@google.com> | 2020-12-09 04:46:41 +0000 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2020-12-09 04:53:04 +0000 |
commit | 282a5708af10879b12a09a59ad5bbfa253b1e92a (patch) | |
tree | e54fd6ebc322a8983a707b3bc05d52faf48ef22f | |
parent | 1ae91ee03a08a4f0391f0f90e2797fb3c2e02960 (diff) | |
download | icing-282a5708af10879b12a09a59ad5bbfa253b1e92a.tar.gz |
Update Icing from upstream.
Change-Id: I662cd2f97794722eca9cc43b73549babed884be4
46 files changed, 2085 insertions, 618 deletions
diff --git a/icing/document-builder.h b/icing/document-builder.h index 4c95b89..ba68ec5 100644 --- a/icing/document-builder.h +++ b/icing/document-builder.h @@ -71,11 +71,6 @@ class DocumentBuilder { return *this; } - DocumentBuilder& ClearCustomProperties() { - document_.clear_custom_properties(); - return *this; - } - // Takes a property name and any number of string values. template <typename... V> DocumentBuilder& AddStringProperty(std::string property_name, @@ -83,14 +78,6 @@ class DocumentBuilder { return AddStringProperty(std::move(property_name), {string_values...}); } - // Takes a custom property name and any number of string values. - template <typename... V> - DocumentBuilder& AddCustomStringProperty(std::string property_name, - V... string_values) { - return AddCustomStringProperty(std::move(property_name), - {string_values...}); - } - // Takes a property name and any number of int64_t values. template <typename... V> DocumentBuilder& AddInt64Property(std::string property_name, @@ -98,13 +85,6 @@ class DocumentBuilder { return AddInt64Property(std::move(property_name), {int64_values...}); } - // Takes a custom property name and any number of int64_t values. - template <typename... V> - DocumentBuilder& AddCustomInt64Property(std::string property_name, - V... int64_values) { - return AddCustomInt64Property(std::move(property_name), {int64_values...}); - } - // Takes a property name and any number of double values. template <typename... V> DocumentBuilder& AddDoubleProperty(std::string property_name, @@ -112,14 +92,6 @@ class DocumentBuilder { return AddDoubleProperty(std::move(property_name), {double_values...}); } - // Takes a custom property name and any number of double values. - template <typename... V> - DocumentBuilder& AddCustomDoubleProperty(std::string property_name, - V... double_values) { - return AddCustomDoubleProperty(std::move(property_name), - {double_values...}); - } - // Takes a property name and any number of boolean values. template <typename... V> DocumentBuilder& AddBooleanProperty(std::string property_name, @@ -127,28 +99,12 @@ class DocumentBuilder { return AddBooleanProperty(std::move(property_name), {boolean_values...}); } - // Takes a custom property name and any number of boolean values. - template <typename... V> - DocumentBuilder& AddCustomBooleanProperty(std::string property_name, - V... boolean_values) { - return AddCustomBooleanProperty(std::move(property_name), - {boolean_values...}); - } - // Takes a property name and any number of bytes values. template <typename... V> DocumentBuilder& AddBytesProperty(std::string property_name, V... bytes_values) { return AddBytesProperty(std::move(property_name), {bytes_values...}); } - - // Takes a custom property name and any number of bytes values. - template <typename... V> - DocumentBuilder& AddCustomBytesProperty(std::string property_name, - V... bytes_values) { - return AddCustomBytesProperty(std::move(property_name), {bytes_values...}); - } - // Takes a property name and any number of document values. template <typename... V> DocumentBuilder& AddDocumentProperty(std::string property_name, @@ -156,14 +112,6 @@ class DocumentBuilder { return AddDocumentProperty(std::move(property_name), {document_values...}); } - // Takes a custom property name and any number of document values. - template <typename... V> - DocumentBuilder& AddCustomDocumentProperty(std::string property_name, - V&&... document_values) { - return AddCustomDocumentProperty(std::move(property_name), - {document_values...}); - } - DocumentProto Build() const { return document_; } private: @@ -180,17 +128,6 @@ class DocumentBuilder { return *this; } - DocumentBuilder& AddCustomStringProperty( - std::string property_name, - std::initializer_list<std::string_view> string_values) { - auto custom_property = document_.add_custom_properties(); - custom_property->set_name(std::move(property_name)); - for (std::string_view string_value : string_values) { - custom_property->mutable_string_values()->Add(std::string(string_value)); - } - return *this; - } - DocumentBuilder& AddInt64Property( std::string property_name, std::initializer_list<int64_t> int64_values) { auto property = document_.add_properties(); @@ -201,16 +138,6 @@ class DocumentBuilder { return *this; } - DocumentBuilder& AddCustomInt64Property( - std::string property_name, std::initializer_list<int64_t> int64_values) { - auto custom_property = document_.add_custom_properties(); - custom_property->set_name(std::move(property_name)); - for (int64_t int64_value : int64_values) { - custom_property->mutable_int64_values()->Add(int64_value); - } - return *this; - } - DocumentBuilder& AddDoubleProperty( std::string property_name, std::initializer_list<double> double_values) { auto property = document_.add_properties(); @@ -221,16 +148,6 @@ class DocumentBuilder { return *this; } - DocumentBuilder& AddCustomDoubleProperty( - std::string property_name, std::initializer_list<double> double_values) { - auto custom_property = document_.add_custom_properties(); - custom_property->set_name(std::move(property_name)); - for (double double_value : double_values) { - custom_property->mutable_double_values()->Add(double_value); - } - return *this; - } - DocumentBuilder& AddBooleanProperty( std::string property_name, std::initializer_list<bool> boolean_values) { auto property = document_.add_properties(); @@ -241,16 +158,6 @@ class DocumentBuilder { return *this; } - DocumentBuilder& AddCustomBooleanProperty( - std::string property_name, std::initializer_list<bool> boolean_values) { - auto custom_property = document_.add_custom_properties(); - custom_property->set_name(std::move(property_name)); - for (bool boolean_value : boolean_values) { - custom_property->mutable_boolean_values()->Add(boolean_value); - } - return *this; - } - DocumentBuilder& AddBytesProperty( std::string property_name, std::initializer_list<std::string> bytes_values) { @@ -262,17 +169,6 @@ class DocumentBuilder { return *this; } - DocumentBuilder& AddCustomBytesProperty( - std::string property_name, - std::initializer_list<std::string> bytes_values) { - auto custom_property = document_.add_custom_properties(); - custom_property->set_name(std::move(property_name)); - for (const std::string& bytes_value : bytes_values) { - custom_property->mutable_bytes_values()->Add(std::string(bytes_value)); - } - return *this; - } - DocumentBuilder& AddDocumentProperty( std::string property_name, std::initializer_list<DocumentProto> document_values) { @@ -283,18 +179,6 @@ class DocumentBuilder { } return *this; } - - DocumentBuilder& AddCustomDocumentProperty( - std::string property_name, - std::initializer_list<DocumentProto> document_values) { - auto custom_property = document_.add_custom_properties(); - custom_property->set_name(std::move(property_name)); - for (DocumentProto document_value : document_values) { - custom_property->mutable_document_values()->Add( - std::move(document_value)); - } - return *this; - } }; } // namespace lib diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc index ab05646..8d69d78 100644 --- a/icing/icing-search-engine_test.cc +++ b/icing/icing-search-engine_test.cc @@ -194,6 +194,56 @@ SchemaProto CreateEmailSchema() { TermMatchType::PREFIX); subj->mutable_string_indexing_config()->set_tokenizer_type( StringIndexingConfig::TokenizerType::PLAIN); + return schema; +} + +SchemaProto CreatePersonAndEmailSchema() { + SchemaProto schema; + + auto* person_type = schema.add_types(); + person_type->set_schema_type("Person"); + auto* name = person_type->add_properties(); + name->set_property_name("name"); + name->set_data_type(PropertyConfigProto::DataType::STRING); + name->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + name->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + name->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + auto* address = person_type->add_properties(); + address->set_property_name("emailAddress"); + address->set_data_type(PropertyConfigProto::DataType::STRING); + address->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + address->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + address->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + + auto* type = schema.add_types(); + type->set_schema_type("Email"); + + auto* body = type->add_properties(); + body->set_property_name("body"); + body->set_data_type(PropertyConfigProto::DataType::STRING); + body->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + body->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + body->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + auto* subj = type->add_properties(); + subj->set_property_name("subject"); + subj->set_data_type(PropertyConfigProto::DataType::STRING); + subj->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + subj->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + subj->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + auto* sender = type->add_properties(); + sender->set_property_name("sender"); + sender->set_schema_type("Person"); + sender->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + sender->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + sender->mutable_document_indexing_config()->set_index_nested_properties(true); return schema; } @@ -5243,6 +5293,194 @@ TEST_F(IcingSearchEngineTest, PutDocumentShouldLogIndexMergeLatency) { Eq(10)); } +TEST_F(IcingSearchEngineTest, SearchWithProjectionEmptyFieldPath) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "shopgirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. Issue a query that will match those documents and use an empty field + // mask to request NO properties. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("hello"); + + ResultSpecProto result_spec; + // Retrieve only one result at a time to make sure that projection works when + // retrieving all pages. + result_spec.set_num_per_page(1); + ResultSpecProto::TypePropertyMask* email_field_mask = + result_spec.add_type_property_masks(); + email_field_mask->set_schema_type("Email"); + email_field_mask->add_paths(""); + + SearchResultProto results = + icing.Search(search_spec, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + + // 3. Verify that the returned results contain no properties. + DocumentProto projected_document_two = DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .Build(); + EXPECT_THAT(results.results(0).document(), + EqualsProto(projected_document_two)); + + results = icing.GetNextPage(results.next_page_token()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + DocumentProto projected_document_one = DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .Build(); + EXPECT_THAT(results.results(0).document(), + EqualsProto(projected_document_one)); +} + +TEST_F(IcingSearchEngineTest, SearchWithProjectionMultipleFieldPaths) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "shopgirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. Issue a query that will match those documents and request only + // 'sender.name' and 'subject' properties. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("hello"); + + ResultSpecProto result_spec; + // Retrieve only one result at a time to make sure that projection works when + // retrieving all pages. + result_spec.set_num_per_page(1); + ResultSpecProto::TypePropertyMask* email_field_mask = + result_spec.add_type_property_masks(); + email_field_mask->set_schema_type("Email"); + email_field_mask->add_paths("sender.name"); + email_field_mask->add_paths("subject"); + + SearchResultProto results = + icing.Search(search_spec, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + + // 3. Verify that the returned results only contain the 'sender.name' + // property. + DocumentProto projected_document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty("sender", + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .Build(); + EXPECT_THAT(results.results(0).document(), + EqualsProto(projected_document_two)); + + results = icing.GetNextPage(results.next_page_token()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + DocumentProto projected_document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty("sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .Build()) + .AddStringProperty("subject", "Hello World!") + .Build(); + EXPECT_THAT(results.results(0).document(), + EqualsProto(projected_document_one)); +} + } // namespace } // namespace lib } // namespace icing diff --git a/icing/index/hit/doc-hit-info.h b/icing/index/hit/doc-hit-info.h index 386822d..32ba97e 100644 --- a/icing/index/hit/doc-hit-info.h +++ b/icing/index/hit/doc-hit-info.h @@ -36,7 +36,7 @@ class DocHitInfo { SectionIdMask hit_section_ids_mask = kSectionIdMaskNone) : document_id_(document_id_in), hit_section_ids_mask_(hit_section_ids_mask) { - memset(max_hit_score_, Hit::kMaxHitScore, sizeof(max_hit_score_)); + memset(max_hit_score_, Hit::kDefaultHitScore, sizeof(max_hit_score_)); } DocumentId document_id() const { return document_id_; } diff --git a/icing/index/hit/doc-hit-info_test.cc b/icing/index/hit/doc-hit-info_test.cc index d8adbc1..1e1880f 100644 --- a/icing/index/hit/doc-hit-info_test.cc +++ b/icing/index/hit/doc-hit-info_test.cc @@ -34,17 +34,17 @@ constexpr DocumentId kSomeOtherDocumentId = 54; TEST(DocHitInfoTest, InitialMaxHitScores) { DocHitInfo info(kSomeDocumentId); for (SectionId i = 0; i <= kMaxSectionId; ++i) { - EXPECT_THAT(info.max_hit_score(i), Eq(Hit::kMaxHitScore)); + EXPECT_THAT(info.max_hit_score(i), Eq(Hit::kDefaultHitScore)); } } TEST(DocHitInfoTest, UpdateHitScores) { DocHitInfo info(kSomeDocumentId); - ASSERT_THAT(info.max_hit_score(3), Eq(Hit::kMaxHitScore)); + ASSERT_THAT(info.max_hit_score(3), Eq(Hit::kDefaultHitScore)); // Updating a section for the first time, should change its max hit score, // even though the hit score (16) may be lower than the current value returned - // by info.max_hit_score(3) (kMaxHitScore) + // by info.max_hit_score(3) (kDefaultHitScore) info.UpdateSection(3, 16); EXPECT_THAT(info.max_hit_score(3), Eq(16)); @@ -58,6 +58,12 @@ TEST(DocHitInfoTest, UpdateHitScores) { info.UpdateSection(3, 17); EXPECT_THAT(info.max_hit_score(3), Eq(17)); + // Updating a section with kDefaultHitScore should *never* set the + // max_hit_score to kDefaultHitScore (unless it already was kDefaultHitScore) + // because kDefaultHitScore is the lowest possible valid hit score. + info.UpdateSection(3, Hit::kDefaultHitScore); + EXPECT_THAT(info.max_hit_score(3), Eq(17)); + // Updating a section with kMaxHitScore should *always* set the max hit // score to kMaxHitScore (regardless of what value kMaxHitScore is // defined with). @@ -150,7 +156,7 @@ TEST(DocHitInfoTest, Comparison) { DocHitInfo high_section_id_info(kDocumentId); high_section_id_info.UpdateSection(1, 12); - high_section_id_info.UpdateSection(6, Hit::kMaxHitScore); + high_section_id_info.UpdateSection(6, Hit::kDefaultHitScore); std::vector<DocHitInfo> infos{info, high_document_id_info, high_section_id_info}; diff --git a/icing/index/hit/hit.cc b/icing/index/hit/hit.cc index 1852bd5..d089dd5 100644 --- a/icing/index/hit/hit.cc +++ b/icing/index/hit/hit.cc @@ -30,7 +30,7 @@ enum FlagOffset { // This hit represents a prefix of a longer term. If exact matches are // required, then this hit should be ignored. kPrefixHit = 1, - // Whether or not the hit has a hit score other than kMaxHitScore. + // Whether or not the hit has a hit score other than kDefaultHitScore. kHasScore = 2, kNumFlags = 3, }; @@ -64,7 +64,7 @@ Hit::Hit(SectionId section_id, DocumentId document_id, Hit::Score score, kSectionIdBits + kNumFlags, kDocumentIdBits, &temp_value); bit_util::BitfieldSet(section_id, kNumFlags, kSectionIdBits, &temp_value); - bit_util::BitfieldSet(score != kMaxHitScore, kHasScore, 1, &temp_value); + bit_util::BitfieldSet(score != kDefaultHitScore, kHasScore, 1, &temp_value); bit_util::BitfieldSet(is_prefix_hit, kPrefixHit, 1, &temp_value); bit_util::BitfieldSet(is_in_prefix_section, kInPrefixSection, 1, &temp_value); value_ = temp_value; diff --git a/icing/index/hit/hit.h b/icing/index/hit/hit.h index d1be204..53553f0 100644 --- a/icing/index/hit/hit.h +++ b/icing/index/hit/hit.h @@ -55,11 +55,15 @@ class Hit { // A score reflecting the "quality" of this hit. The higher the score, the // higher quality the hit. + // The score is being repurposed for term frequency. + // TODO(b/173156700): refactor Score to TermFrequency. using Score = uint8_t; - // By default, hits are given the highest possible score. + // Max Score is 255. static constexpr Score kMaxHitScore = std::numeric_limits<Score>::max(); + // Default value of term frequency is 1. + static constexpr Score kDefaultHitScore = 1; - explicit Hit(Value value = kInvalidValue, Score score = kMaxHitScore) + explicit Hit(Value value = kInvalidValue, Score score = kDefaultHitScore) : value_(value), score_(score) {} Hit(SectionId section_id, DocumentId document_id, Score score, bool is_in_prefix_section = false, bool is_prefix_hit = false); @@ -68,8 +72,7 @@ class Hit { Value value() const { return value_; } DocumentId document_id() const; SectionId section_id() const; - // Whether or not the hit contains a non-default score. Hits with non-default - // score are considered to be of lower quality. + // Whether or not the hit contains a non-default score. bool has_score() const; Score score() const { return score_; } bool is_prefix_hit() const; diff --git a/icing/index/hit/hit_test.cc b/icing/index/hit/hit_test.cc index 17db66b..8c883d1 100644 --- a/icing/index/hit/hit_test.cc +++ b/icing/index/hit/hit_test.cc @@ -36,9 +36,9 @@ static constexpr SectionId kSomeSectionid = 5; static constexpr Hit::Score kSomeHitScore = 57; TEST(HitTest, HasScoreFlag) { - Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore); + Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultHitScore); EXPECT_THAT(h1.has_score(), IsFalse()); - EXPECT_THAT(h1.score(), Eq(Hit::kMaxHitScore)); + EXPECT_THAT(h1.score(), Eq(Hit::kDefaultHitScore)); Hit h2(kSomeSectionid, kSomeDocumentId, kSomeHitScore); EXPECT_THAT(h2.has_score(), IsTrue()); @@ -46,33 +46,33 @@ TEST(HitTest, HasScoreFlag) { } TEST(HitTest, IsPrefixHitFlag) { - Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore); + Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultHitScore); EXPECT_THAT(h1.is_prefix_hit(), IsFalse()); - Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore, + Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kDefaultHitScore, /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); EXPECT_THAT(h2.is_prefix_hit(), IsFalse()); - Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore, + Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kDefaultHitScore, /*is_in_prefix_section=*/false, /*is_prefix_hit=*/true); EXPECT_THAT(h3.is_prefix_hit(), IsTrue()); } TEST(HitTest, IsInPrefixSectionFlag) { - Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore); + Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultHitScore); EXPECT_THAT(h1.is_in_prefix_section(), IsFalse()); - Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore, + Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); EXPECT_THAT(h2.is_in_prefix_section(), IsFalse()); - Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore, + Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kDefaultHitScore, /*is_in_prefix_section=*/true); EXPECT_THAT(h3.is_in_prefix_section(), IsTrue()); } TEST(HitTest, Accessors) { - Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore); + Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultHitScore); EXPECT_THAT(h1.document_id(), Eq(kSomeDocumentId)); EXPECT_THAT(h1.section_id(), Eq(kSomeSectionid)); } @@ -102,18 +102,19 @@ TEST(HitTest, Valid) { } TEST(HitTest, Comparison) { - Hit hit(1, 243, Hit::kMaxHitScore); + Hit hit(1, 243, Hit::kDefaultHitScore); // DocumentIds are sorted in ascending order. So a hit with a lower // document_id should be considered greater than one with a higher // document_id. - Hit higher_document_id_hit(1, 2409, Hit::kMaxHitScore); - Hit higher_section_id_hit(15, 243, Hit::kMaxHitScore); + Hit higher_document_id_hit(1, 2409, Hit::kDefaultHitScore); + Hit higher_section_id_hit(15, 243, Hit::kDefaultHitScore); // Whether or not a hit score was set is considered, but the score itself is // not. Hit hitscore_hit(1, 243, 12); - Hit prefix_hit(1, 243, Hit::kMaxHitScore, /*is_in_prefix_section=*/false, + Hit prefix_hit(1, 243, Hit::kDefaultHitScore, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/true); - Hit hit_in_prefix_section(1, 243, Hit::kMaxHitScore, + Hit hit_in_prefix_section(1, 243, Hit::kDefaultHitScore, /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc index 9df9b87..892263b 100644 --- a/icing/index/index-processor.cc +++ b/icing/index/index-processor.cc @@ -85,6 +85,8 @@ libtextclassifier3::Status IndexProcessor::IndexDocument( tokenizer->Tokenize(subcontent)); while (itr->Advance()) { if (++num_tokens > options_.max_tokens_per_document) { + // Index all tokens buffered so far. + editor.IndexAllBufferedTerms(); if (put_document_stats != nullptr) { put_document_stats->mutable_tokenization_stats() ->set_exceeded_max_token_num(true); @@ -96,16 +98,16 @@ libtextclassifier3::Status IndexProcessor::IndexDocument( return absl_ports::ResourceExhaustedError( "Max number of tokens reached!"); case Options::TokenLimitBehavior::kSuppressError: - return libtextclassifier3::Status::OK; + return overall_status; } } std::string term = normalizer_.NormalizeTerm(itr->GetToken().text); - // Add this term to the index. Even if adding this hit fails, we keep + // Add this term to Hit buffer. Even if adding this hit fails, we keep // trying to add more hits because it's possible that future hits could // still be added successfully. For instance if the lexicon is full, we // might fail to add a hit for a new term, but should still be able to // add hits for terms that are already in the index. - auto status = editor.AddHit(term.c_str()); + auto status = editor.BufferTerm(term.c_str()); if (overall_status.ok() && !status.ok()) { // If we've succeeded to add everything so far, set overall_status to // represent this new failure. If we've already failed, no need to @@ -115,6 +117,15 @@ libtextclassifier3::Status IndexProcessor::IndexDocument( } } } + // Add all the seen terms to the index with their term frequency. + auto status = editor.IndexAllBufferedTerms(); + if (overall_status.ok() && !status.ok()) { + // If we've succeeded so far, set overall_status to + // represent this new failure. If we've already failed, no need to + // update the status - we're already going to return a resource + // exhausted error. + overall_status = status; + } } if (put_document_stats != nullptr) { diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index ff11e60..3168dad 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -19,6 +19,7 @@ #include <memory> #include <string> #include <string_view> +#include <unordered_map> #include <utility> #include <vector> @@ -292,9 +293,10 @@ TEST_F(IndexProcessorTest, OneDoc) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, index_->GetIterator("hello", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); - EXPECT_THAT(GetHits(std::move(itr)), - ElementsAre(EqualsDocHitInfo( - kDocumentId0, std::vector<SectionId>{kExactSectionId}))); + std::vector<DocHitInfo> hits = GetHits(std::move(itr)); + std::unordered_map<SectionId, Hit::Score> expectedMap{{kExactSectionId, 1}}; + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + kDocumentId0, expectedMap))); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("hello", 1U << kPrefixedSectionId, @@ -313,12 +315,18 @@ TEST_F(IndexProcessorTest, MultipleDocs) { EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); + std::string coffeeRepeatedString = "coffee"; + for (int i = 0; i < Hit::kMaxHitScore + 1; i++) { + coffeeRepeatedString += " coffee"; + } + document = DocumentBuilder() .SetKey("icing", "fake_type/2") .SetSchema(std::string(kFakeType)) - .AddStringProperty(std::string(kExactProperty), "pitbull") - .AddStringProperty(std::string(kPrefixedProperty), "mr. world wide") + .AddStringProperty(std::string(kExactProperty), coffeeRepeatedString) + .AddStringProperty(std::string(kPrefixedProperty), + "mr. world world wide") .Build(); EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); @@ -326,19 +334,31 @@ TEST_F(IndexProcessorTest, MultipleDocs) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, index_->GetIterator("world", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); + std::vector<DocHitInfo> hits = GetHits(std::move(itr)); + std::unordered_map<SectionId, Hit::Score> expectedMap1{ + {kPrefixedSectionId, 2}}; + std::unordered_map<SectionId, Hit::Score> expectedMap2{{kExactSectionId, 1}}; EXPECT_THAT( - GetHits(std::move(itr)), - ElementsAre(EqualsDocHitInfo(kDocumentId1, - std::vector<SectionId>{kPrefixedSectionId}), - EqualsDocHitInfo(kDocumentId0, - std::vector<SectionId>{kExactSectionId}))); + hits, ElementsAre( + EqualsDocHitInfoWithTermFrequency(kDocumentId1, expectedMap1), + EqualsDocHitInfoWithTermFrequency(kDocumentId0, expectedMap2))); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("world", 1U << kPrefixedSectionId, TermMatchType::EXACT_ONLY)); - EXPECT_THAT(GetHits(std::move(itr)), - ElementsAre(EqualsDocHitInfo( - kDocumentId1, std::vector<SectionId>{kPrefixedSectionId}))); + hits = GetHits(std::move(itr)); + std::unordered_map<SectionId, Hit::Score> expectedMap{ + {kPrefixedSectionId, 2}}; + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + kDocumentId1, expectedMap))); + + ICING_ASSERT_OK_AND_ASSIGN(itr, + index_->GetIterator("coffee", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + hits = GetHits(std::move(itr)); + expectedMap = {{kExactSectionId, Hit::kMaxHitScore}}; + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + kDocumentId1, expectedMap))); } TEST_F(IndexProcessorTest, DocWithNestedProperty) { diff --git a/icing/index/index.cc b/icing/index/index.cc index 1b808f8..f0c8bbd 100644 --- a/icing/index/index.cc +++ b/icing/index/index.cc @@ -277,8 +277,7 @@ Index::FindTermsByPrefix(const std::string& prefix, std::move(main_term_metadata_list), num_to_return); } -libtextclassifier3::Status Index::Editor::AddHit(const char* term, - Hit::Score score) { +libtextclassifier3::Status Index::Editor::BufferTerm(const char* term) { // Step 1: See if this term is already in the lexicon uint32_t tvi; auto tvi_or = lite_index_->GetTermId(term); @@ -287,8 +286,10 @@ libtextclassifier3::Status Index::Editor::AddHit(const char* term, if (tvi_or.ok()) { tvi = tvi_or.ValueOrDie(); if (seen_tokens_.find(tvi) != seen_tokens_.end()) { - ICING_VLOG(1) << "A hit for term " << term - << " has already been added. Skipping."; + ICING_VLOG(1) << "Updating term frequency for term " << term; + if (seen_tokens_[tvi] != Hit::kMaxHitScore) { + ++seen_tokens_[tvi]; + } return libtextclassifier3::Status::OK; } ICING_VLOG(1) << "Term " << term @@ -302,14 +303,20 @@ libtextclassifier3::Status Index::Editor::AddHit(const char* term, ICING_ASSIGN_OR_RETURN( tvi, lite_index_->InsertTerm(term, term_match_type_, namespace_id_)); } - seen_tokens_.insert(tvi); - - // Step 3: Add the hit itself - Hit hit(section_id_, document_id_, score, - term_match_type_ == TermMatchType::PREFIX); - ICING_ASSIGN_OR_RETURN(uint32_t term_id, - term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - return lite_index_->AddHit(term_id, hit); + // Token seen for the first time in the current document. + seen_tokens_[tvi] = 1; + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status Index::Editor::IndexAllBufferedTerms() { + for (auto itr = seen_tokens_.begin(); itr != seen_tokens_.end(); itr++) { + Hit hit(section_id_, document_id_, /*score=*/itr->second, + term_match_type_ == TermMatchType::PREFIX); + ICING_ASSIGN_OR_RETURN( + uint32_t term_id, term_id_codec_->EncodeTvi(itr->first, TviType::LITE)); + ICING_RETURN_IF_ERROR(lite_index_->AddHit(term_id, hit)); + } + return libtextclassifier3::Status::OK; } } // namespace lib diff --git a/icing/index/index.h b/icing/index/index.h index 1305b2c..32f2b17 100644 --- a/icing/index/index.h +++ b/icing/index/index.h @@ -197,14 +197,16 @@ class Index { namespace_id_(namespace_id), section_id_(section_id) {} - libtextclassifier3::Status AddHit(const char* term, - Hit::Score score = Hit::kMaxHitScore); + // Buffer the term in seen_tokens_. + libtextclassifier3::Status BufferTerm(const char* term); + // Index all the terms stored in seen_tokens_. + libtextclassifier3::Status IndexAllBufferedTerms(); private: // The Editor is able to store previously seen terms as TermIds. This is // is more efficient than a client doing this externally because TermIds are // not exposed to clients. - std::unordered_set<uint32_t> seen_tokens_; + std::unordered_map<uint32_t, Hit::Score> seen_tokens_; const TermIdCodec* term_id_codec_; LiteIndex* lite_index_; DocumentId document_id_; diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc index 1d12274..3479ab1 100644 --- a/icing/index/index_test.cc +++ b/icing/index/index_test.cc @@ -177,7 +177,8 @@ TEST_F(IndexTest, EmptyIndexAfterMerge) { TEST_F(IndexTest, AdvancePastEnd) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, @@ -200,7 +201,8 @@ TEST_F(IndexTest, AdvancePastEnd) { TEST_F(IndexTest, AdvancePastEndAfterMerge) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); @@ -225,7 +227,8 @@ TEST_F(IndexTest, AdvancePastEndAfterMerge) { TEST_F(IndexTest, SingleHitSingleTermIndex) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, @@ -240,7 +243,8 @@ TEST_F(IndexTest, SingleHitSingleTermIndex) { TEST_F(IndexTest, SingleHitSingleTermIndexAfterMerge) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); @@ -257,8 +261,9 @@ TEST_F(IndexTest, SingleHitSingleTermIndexAfterMerge) { TEST_F(IndexTest, SingleHitMultiTermIndex) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); - EXPECT_THAT(edit.AddHit("bar"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, @@ -273,8 +278,9 @@ TEST_F(IndexTest, SingleHitMultiTermIndex) { TEST_F(IndexTest, SingleHitMultiTermIndexAfterMerge) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); - EXPECT_THAT(edit.AddHit("bar"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); @@ -291,8 +297,9 @@ TEST_F(IndexTest, SingleHitMultiTermIndexAfterMerge) { TEST_F(IndexTest, NoHitMultiTermIndex) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); - EXPECT_THAT(edit.AddHit("bar"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, @@ -305,8 +312,9 @@ TEST_F(IndexTest, NoHitMultiTermIndex) { TEST_F(IndexTest, NoHitMultiTermIndexAfterMerge) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); - EXPECT_THAT(edit.AddHit("bar"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); @@ -321,15 +329,18 @@ TEST_F(IndexTest, NoHitMultiTermIndexAfterMerge) { TEST_F(IndexTest, MultiHitMultiTermIndex) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("bar"), IsOk()); + EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, @@ -345,15 +356,18 @@ TEST_F(IndexTest, MultiHitMultiTermIndex) { TEST_F(IndexTest, MultiHitMultiTermIndexAfterMerge) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("bar"), IsOk()); + EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); @@ -371,11 +385,13 @@ TEST_F(IndexTest, MultiHitMultiTermIndexAfterMerge) { TEST_F(IndexTest, MultiHitSectionRestrict) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); SectionIdMask desired_section = 1U << kSectionId2; ICING_ASSERT_OK_AND_ASSIGN( @@ -391,11 +407,13 @@ TEST_F(IndexTest, MultiHitSectionRestrict) { TEST_F(IndexTest, MultiHitSectionRestrictAfterMerge) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); @@ -415,12 +433,13 @@ TEST_F(IndexTest, SingleHitDedupeIndex) { EXPECT_THAT(size, Eq(0)); Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); ICING_ASSERT_OK_AND_ASSIGN(size, index_->GetElementsSize()); EXPECT_THAT(size, Gt(0)); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); ICING_ASSERT_OK_AND_ASSIGN(int64_t new_size, index_->GetElementsSize()); EXPECT_THAT(new_size, Eq(size)); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, @@ -435,7 +454,8 @@ TEST_F(IndexTest, SingleHitDedupeIndex) { TEST_F(IndexTest, PrefixHit) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("fool"), IsOk()); + ASSERT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, @@ -450,7 +470,8 @@ TEST_F(IndexTest, PrefixHit) { TEST_F(IndexTest, PrefixHitAfterMerge) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("fool"), IsOk()); + ASSERT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); @@ -467,11 +488,13 @@ TEST_F(IndexTest, PrefixHitAfterMerge) { TEST_F(IndexTest, MultiPrefixHit) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("fool"), IsOk()); + ASSERT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, @@ -488,11 +511,13 @@ TEST_F(IndexTest, MultiPrefixHit) { TEST_F(IndexTest, MultiPrefixHitAfterMerge) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("fool"), IsOk()); + ASSERT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); @@ -511,11 +536,13 @@ TEST_F(IndexTest, MultiPrefixHitAfterMerge) { TEST_F(IndexTest, NoExactHitInPrefixQuery) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("fool"), IsOk()); + ASSERT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, @@ -529,11 +556,13 @@ TEST_F(IndexTest, NoExactHitInPrefixQuery) { TEST_F(IndexTest, NoExactHitInPrefixQueryAfterMerge) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("fool"), IsOk()); + ASSERT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); @@ -549,8 +578,9 @@ TEST_F(IndexTest, NoExactHitInPrefixQueryAfterMerge) { TEST_F(IndexTest, PrefixHitDedupe) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); - ASSERT_THAT(edit.AddHit("fool"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, @@ -564,8 +594,9 @@ TEST_F(IndexTest, PrefixHitDedupe) { TEST_F(IndexTest, PrefixHitDedupeAfterMerge) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); - ASSERT_THAT(edit.AddHit("fool"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); @@ -621,8 +652,9 @@ TEST_F(IndexTest, ExactToString) { TEST_F(IndexTest, NonAsciiTerms) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("こんにちは"), IsOk()); - ASSERT_THAT(edit.AddHit("あなた"), IsOk()); + ASSERT_THAT(edit.BufferTerm("こんにちは"), IsOk()); + ASSERT_THAT(edit.BufferTerm("あなた"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, @@ -642,8 +674,9 @@ TEST_F(IndexTest, NonAsciiTerms) { TEST_F(IndexTest, NonAsciiTermsAfterMerge) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("こんにちは"), IsOk()); - ASSERT_THAT(edit.AddHit("あなた"), IsOk()); + ASSERT_THAT(edit.BufferTerm("こんにちは"), IsOk()); + ASSERT_THAT(edit.BufferTerm("あなた"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); @@ -684,7 +717,11 @@ TEST_F(IndexTest, FullIndex) { index_->Edit(document_id, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); size_t idx = uniform(random); - status = edit.AddHit(query_terms.at(idx).c_str()); + status = edit.BufferTerm(query_terms.at(idx).c_str()); + if (!status.ok()) { + break; + } + status = edit.IndexAllBufferedTerms(); if (!status.ok()) { break; } @@ -696,11 +733,10 @@ TEST_F(IndexTest, FullIndex) { Index::Editor edit = index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), - StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - EXPECT_THAT(edit.AddHit("bar"), - StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - EXPECT_THAT(edit.AddHit("baz"), + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.BufferTerm("baz"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); for (int i = 0; i < query_terms.size(); i += 25) { @@ -737,7 +773,11 @@ TEST_F(IndexTest, FullIndexMerge) { index_->Edit(document_id, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); size_t idx = uniform(random); - status = edit.AddHit(query_terms.at(idx).c_str()); + status = edit.BufferTerm(query_terms.at(idx).c_str()); + if (!status.ok()) { + break; + } + status = edit.IndexAllBufferedTerms(); if (!status.ok()) { break; } @@ -751,11 +791,10 @@ TEST_F(IndexTest, FullIndexMerge) { Index::Editor edit = index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), - StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - EXPECT_THAT(edit.AddHit("bar"), - StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - EXPECT_THAT(edit.AddHit("baz"), + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.BufferTerm("baz"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); EXPECT_THAT(index_->last_added_document_id(), Eq(document_id - 1)); @@ -763,9 +802,10 @@ TEST_F(IndexTest, FullIndexMerge) { ICING_ASSERT_OK(index_->Merge()); edit = index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY, 0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); - EXPECT_THAT(edit.AddHit("bar"), IsOk()); - EXPECT_THAT(edit.AddHit("baz"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.BufferTerm("baz"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, index_->GetIterator("bar", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); @@ -790,8 +830,9 @@ TEST_F(IndexTest, IndexCreateCorruptionFailure) { // Add some content to the index Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); - ASSERT_THAT(edit.AddHit("bar"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // Close the index. index_.reset(); @@ -820,8 +861,9 @@ TEST_F(IndexTest, IndexPersistence) { // Add some content to the index Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); - ASSERT_THAT(edit.AddHit("bar"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); EXPECT_THAT(index_->PersistToDisk(), IsOk()); // Close the index. @@ -847,8 +889,9 @@ TEST_F(IndexTest, IndexPersistenceAfterMerge) { // Add some content to the index Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); - ASSERT_THAT(edit.AddHit("bar"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); EXPECT_THAT(index_->PersistToDisk(), IsOk()); @@ -881,7 +924,8 @@ TEST_F(IndexTest, InvalidHitBufferSize) { TEST_F(IndexTest, FindTermByPrefixShouldReturnEmpty) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0}, /*num_to_return=*/0), @@ -903,8 +947,9 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnEmpty) { TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectResult) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); - EXPECT_THAT(edit.AddHit("bar"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // "b" should only match "bar" but not "foo". EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0}, @@ -923,9 +968,10 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectResult) { TEST_F(IndexTest, FindTermByPrefixShouldRespectNumToReturn) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("fo"), IsOk()); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // We have 3 results but only 2 should be returned. EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, @@ -944,13 +990,15 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInOneNamespace) { Index::Editor edit1 = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit1.AddHit("fo"), IsOk()); - EXPECT_THAT(edit1.AddHit("foo"), IsOk()); + EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); + EXPECT_THAT(edit1.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); Index::Editor edit2 = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/1); - EXPECT_THAT(edit2.AddHit("fool"), IsOk()); + EXPECT_THAT(edit2.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); // namespace with id 0 has 2 results. EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, @@ -982,17 +1030,20 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) { Index::Editor edit1 = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit1.AddHit("fo"), IsOk()); + EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); + EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); Index::Editor edit2 = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/1); - EXPECT_THAT(edit2.AddHit("foo"), IsOk()); + EXPECT_THAT(edit2.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); Index::Editor edit3 = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/2); - EXPECT_THAT(edit3.AddHit("fool"), IsOk()); + EXPECT_THAT(edit3.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk()); // Should return "foo" and "fool" which are in namespaces with ids 1 and 2. EXPECT_THAT( @@ -1015,17 +1066,20 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) { Index::Editor edit1 = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit1.AddHit("fo"), IsOk()); + EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); + EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); Index::Editor edit2 = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/1); - EXPECT_THAT(edit2.AddHit("foo"), IsOk()); + EXPECT_THAT(edit2.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); Index::Editor edit3 = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/2); - EXPECT_THAT(edit3.AddHit("fool"), IsOk()); + EXPECT_THAT(edit3.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk()); // Should return "fo", "foo" and "fool" across all namespaces. EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{}, @@ -1049,13 +1103,15 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) { Index::Editor edit1 = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit1.AddHit("foo"), IsOk()); - EXPECT_THAT(edit1.AddHit("fool"), IsOk()); + EXPECT_THAT(edit1.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit1.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); Index::Editor edit2 = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit2.AddHit("fool"), IsOk()); + EXPECT_THAT(edit2.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); // 'foo' has 1 hit, 'fool' has 2 hits. EXPECT_THAT( @@ -1079,30 +1135,38 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnApproximateHitCountForMain) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId4, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId5, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId6, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId7, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // 'foo' has 1 hit, 'fool' has 8 hits. EXPECT_THAT( @@ -1125,14 +1189,16 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCombinedHitCount) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // 'foo' has 1 hit in the main index, 'fool' has 1 hit in the main index and // 1 hit in the lite index. @@ -1147,13 +1213,15 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsFromBothIndices) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // 'foo' has 1 hit in the main index, 'fool' has 1 hit in the lite index. EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, @@ -1171,7 +1239,8 @@ TEST_F(IndexTest, GetElementsSize) { // Add an element. Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN(size, index_->GetElementsSize()); EXPECT_THAT(size, Gt(0)); @@ -1183,19 +1252,23 @@ TEST_F(IndexTest, GetElementsSize) { TEST_F(IndexTest, ExactResultsFromLiteAndMain) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foot"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foot"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("footer"), IsOk()); + EXPECT_THAT(edit.BufferTerm("footer"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, @@ -1212,19 +1285,23 @@ TEST_F(IndexTest, ExactResultsFromLiteAndMain) { TEST_F(IndexTest, PrefixResultsFromLiteAndMain) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); - EXPECT_THAT(edit.AddHit("fool"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foot"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foot"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("footer"), IsOk()); + EXPECT_THAT(edit.BufferTerm("footer"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, @@ -1244,19 +1321,23 @@ TEST_F(IndexTest, GetDebugInfo) { // then add another doc to the lite index. Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); - ASSERT_THAT(edit.AddHit("fool"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foot"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("footer"), IsOk()); + ASSERT_THAT(edit.BufferTerm("footer"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); std::string out0; index_->GetDebugInfo(/*verbosity=*/0, &out0); @@ -1269,7 +1350,8 @@ TEST_F(IndexTest, GetDebugInfo) { // Add one more doc to the lite index. Debug strings should change. edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("far"), IsOk()); + ASSERT_THAT(edit.BufferTerm("far"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); std::string out2; index_->GetDebugInfo(/*verbosity=*/0, &out2); @@ -1298,13 +1380,16 @@ TEST_F(IndexTest, BackfillingMultipleTermsSucceeds) { // then add another doc to the lite index. Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId0, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("fool"), IsOk()); + ASSERT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foot"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // After this merge the index should have posting lists for // "fool" {(doc0,sec3)}, @@ -1315,7 +1400,8 @@ TEST_F(IndexTest, BackfillingMultipleTermsSucceeds) { // Add one more doc to the lite index. edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("far"), IsOk()); + ASSERT_THAT(edit.BufferTerm("far"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // After this merge the index should add a posting list for "far" and a // backfill branch point for "f". In addition to the posting lists described @@ -1343,11 +1429,13 @@ TEST_F(IndexTest, BackfillingNewTermsSucceeds) { // then add another doc to the lite index. Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); - ASSERT_THAT(edit.AddHit("fool"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foot"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // After this merge the index should have posting lists for // "fool" {(doc0,sec2)}, // "foot" {(doc1,sec3)}, @@ -1356,14 +1444,17 @@ TEST_F(IndexTest, BackfillingNewTermsSucceeds) { edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("footer"), IsOk()); + ASSERT_THAT(edit.BufferTerm("footer"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // Add one more doc to the lite index. Debug strings should change. edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("far"), IsOk()); + ASSERT_THAT(edit.BufferTerm("far"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // After this merge the index should add posting lists for "far" and "footer" // and a backfill branch point for "f". The new posting lists should be @@ -1400,7 +1491,8 @@ TEST_F(IndexTest, TruncateToInvalidDocumentIdHasNoEffect) { // Add one document to the lite index Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // Clipping to invalid should have no effect. ICING_EXPECT_OK(index_->TruncateTo(kInvalidDocumentId)); ICING_ASSERT_OK_AND_ASSIGN( @@ -1420,7 +1512,8 @@ TEST_F(IndexTest, TruncateToInvalidDocumentIdHasNoEffect) { edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foot"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // Clipping to invalid should still have no effect even if both indices have // hits. @@ -1447,7 +1540,8 @@ TEST_F(IndexTest, TruncateToLastAddedDocumentIdHasNoEffect) { // Add one document to the lite index Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_EXPECT_OK(index_->TruncateTo(index_->last_added_document_id())); // Clipping to invalid should have no effect. ICING_ASSERT_OK_AND_ASSIGN( @@ -1467,7 +1561,8 @@ TEST_F(IndexTest, TruncateToLastAddedDocumentIdHasNoEffect) { edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foot"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // Clipping to invalid should still have no effect even if both indices have // hits. @@ -1487,14 +1582,16 @@ TEST_F(IndexTest, TruncateToThrowsOutLiteIndex) { // Add one document to the lite index and merge it into main. Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); // Add another document to the lite index. edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foot"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); EXPECT_THAT(index_->TruncateTo(kDocumentId0), IsOk()); @@ -1513,17 +1610,20 @@ TEST_F(IndexTest, TruncateToThrowsOutBothIndices) { // Add two documents to the lite index and merge them into main. Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foul"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foul"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); // Add another document to the lite index. edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - ASSERT_THAT(edit.AddHit("foot"), IsOk()); + ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); EXPECT_THAT(index_->TruncateTo(kDocumentId0), IsOk()); diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc index 89240ee..ea4bcaf 100644 --- a/icing/index/lite/lite-index.cc +++ b/icing/index/lite/lite-index.cc @@ -448,7 +448,7 @@ uint32_t LiteIndex::Seek(uint32_t term_id) { // Binary search for our term_id. Make sure we get the first // element. Using kBeginSortValue ensures this for the hit value. TermIdHitPair term_id_hit_pair( - term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore)); + term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultHitScore)); const TermIdHitPair::Value* array = hit_buffer_.array_cast<TermIdHitPair::Value>(); diff --git a/icing/index/main/index-block_test.cc b/icing/index/main/index-block_test.cc index 08ba57d..493055f 100644 --- a/icing/index/main/index-block_test.cc +++ b/icing/index/main/index-block_test.cc @@ -105,11 +105,11 @@ TEST(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) { ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize)); std::vector<Hit> test_hits{ - Hit(/*section_id=*/2, /*document_id=*/0, Hit::kMaxHitScore), - Hit(/*section_id=*/1, /*document_id=*/0, Hit::kMaxHitScore), + Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultHitScore), + Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultHitScore), Hit(/*section_id=*/5, /*document_id=*/1, /*score=*/99), Hit(/*section_id=*/3, /*document_id=*/3, /*score=*/17), - Hit(/*section_id=*/10, /*document_id=*/10, Hit::kMaxHitScore), + Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultHitScore), }; PostingListIndex allocated_index; { @@ -152,18 +152,18 @@ TEST(IndexBlockTest, IndexBlockMultiplePostingLists) { ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize)); std::vector<Hit> hits_in_posting_list1{ - Hit(/*section_id=*/2, /*document_id=*/0, Hit::kMaxHitScore), - Hit(/*section_id=*/1, /*document_id=*/0, Hit::kMaxHitScore), + Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultHitScore), + Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultHitScore), Hit(/*section_id=*/5, /*document_id=*/1, /*score=*/99), Hit(/*section_id=*/3, /*document_id=*/3, /*score=*/17), - Hit(/*section_id=*/10, /*document_id=*/10, Hit::kMaxHitScore), + Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultHitScore), }; std::vector<Hit> hits_in_posting_list2{ Hit(/*section_id=*/12, /*document_id=*/220, /*score=*/88), - Hit(/*section_id=*/17, /*document_id=*/265, Hit::kMaxHitScore), + Hit(/*section_id=*/17, /*document_id=*/265, Hit::kDefaultHitScore), Hit(/*section_id=*/0, /*document_id=*/287, /*score=*/2), Hit(/*section_id=*/11, /*document_id=*/306, /*score=*/12), - Hit(/*section_id=*/10, /*document_id=*/306, Hit::kMaxHitScore), + Hit(/*section_id=*/10, /*document_id=*/306, Hit::kDefaultHitScore), }; PostingListIndex allocated_index_1; PostingListIndex allocated_index_2; @@ -242,11 +242,11 @@ TEST(IndexBlockTest, IndexBlockReallocatingPostingLists) { // Add hits to the first posting list. std::vector<Hit> hits_in_posting_list1{ - Hit(/*section_id=*/2, /*document_id=*/0, Hit::kMaxHitScore), - Hit(/*section_id=*/1, /*document_id=*/0, Hit::kMaxHitScore), + Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultHitScore), + Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultHitScore), Hit(/*section_id=*/5, /*document_id=*/1, /*score=*/99), Hit(/*section_id=*/3, /*document_id=*/3, /*score=*/17), - Hit(/*section_id=*/10, /*document_id=*/10, Hit::kMaxHitScore), + Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultHitScore), }; ICING_ASSERT_OK_AND_ASSIGN(PostingListIndex allocated_index_1, block.AllocatePostingList()); @@ -262,10 +262,10 @@ TEST(IndexBlockTest, IndexBlockReallocatingPostingLists) { // Add hits to the second posting list. std::vector<Hit> hits_in_posting_list2{ Hit(/*section_id=*/12, /*document_id=*/220, /*score=*/88), - Hit(/*section_id=*/17, /*document_id=*/265, Hit::kMaxHitScore), + Hit(/*section_id=*/17, /*document_id=*/265, Hit::kDefaultHitScore), Hit(/*section_id=*/0, /*document_id=*/287, /*score=*/2), Hit(/*section_id=*/11, /*document_id=*/306, /*score=*/12), - Hit(/*section_id=*/10, /*document_id=*/306, Hit::kMaxHitScore), + Hit(/*section_id=*/10, /*document_id=*/306, Hit::kDefaultHitScore), }; ICING_ASSERT_OK_AND_ASSIGN(PostingListIndex allocated_index_2, block.AllocatePostingList()); @@ -289,7 +289,7 @@ TEST(IndexBlockTest, IndexBlockReallocatingPostingLists) { std::vector<Hit> hits_in_posting_list3{ Hit(/*section_id=*/12, /*document_id=*/0, /*score=*/88), - Hit(/*section_id=*/17, /*document_id=*/1, Hit::kMaxHitScore), + Hit(/*section_id=*/17, /*document_id=*/1, Hit::kDefaultHitScore), Hit(/*section_id=*/0, /*document_id=*/2, /*score=*/2), }; ICING_ASSERT_OK_AND_ASSIGN(PostingListIndex allocated_index_3, diff --git a/icing/index/main/main-index-merger.cc b/icing/index/main/main-index-merger.cc index 8142b79..500774d 100644 --- a/icing/index/main/main-index-merger.cc +++ b/icing/index/main/main-index-merger.cc @@ -33,8 +33,8 @@ namespace { class HitSelector { public: - // Returns whether or not term_id_hit_pair has the same term_id, document_id and section_id - // as the previously selected hits. + // Returns whether or not term_id_hit_pair has the same term_id, document_id + // and section_id as the previously selected hits. bool IsEquivalentHit(const TermIdHitPair& term_id_hit_pair) { return prev_.term_id() == term_id_hit_pair.term_id() && prev_.hit().document_id() == term_id_hit_pair.hit().document_id() && @@ -56,20 +56,24 @@ class HitSelector { // This function may add between 0-2 hits depending on whether the HitSelector // holds both a valid exact hit and a valid prefix hit, one of those or none. size_t InsertSelectedHits(size_t pos, std::vector<TermIdHitPair>* hits) { - // Given highest scoring prefix/exact hits for a given - // term+docid+sectionid, push needed hits into hits array at offset - // pos. Return new pos. + // Given the prefix/exact hits for a given term+docid+sectionid, push needed + // hits into hits array at offset pos. Return new pos. if (best_prefix_hit_.hit().is_valid() && best_exact_hit_.hit().is_valid()) { - // Output both if scores are unequal. Otherwise only exact hit is - // sufficient because 1) they have the same scores and 2) any prefix query - // will also accept an exact hit. (*hits)[pos++] = best_exact_hit_; - if (best_prefix_hit_.hit().score() != best_exact_hit_.hit().score()) { - (*hits)[pos++] = best_prefix_hit_; - // Ensure sorted. - if (best_prefix_hit_.hit() < best_exact_hit_.hit()) { - std::swap((*hits)[pos - 1], (*hits)[pos - 2]); - } + const Hit& prefix_hit = best_prefix_hit_.hit(); + // The prefix hit has score equal to the sum of the scores, capped at + // kMaxHitScore. + Hit::Score final_score = + std::min(static_cast<int>(Hit::kMaxHitScore), + prefix_hit.score() + best_exact_hit_.hit().score()); + best_prefix_hit_ = TermIdHitPair( + best_prefix_hit_.term_id(), + Hit(prefix_hit.section_id(), prefix_hit.document_id(), final_score, + prefix_hit.is_in_prefix_section(), prefix_hit.is_prefix_hit())); + (*hits)[pos++] = best_prefix_hit_; + // Ensure sorted. + if (best_prefix_hit_.hit() < best_exact_hit_.hit()) { + std::swap((*hits)[pos - 1], (*hits)[pos - 2]); } } else if (best_prefix_hit_.hit().is_valid()) { (*hits)[pos++] = best_prefix_hit_; @@ -88,16 +92,38 @@ class HitSelector { private: void SelectPrefixHitIfBetter(const TermIdHitPair& term_id_hit_pair) { - if (!best_prefix_hit_.hit().is_valid() || - best_prefix_hit_.hit().score() < term_id_hit_pair.hit().score()) { + if (!best_prefix_hit_.hit().is_valid()) { best_prefix_hit_ = term_id_hit_pair; + } else { + const Hit& hit = term_id_hit_pair.hit(); + // Create a new prefix hit with term_frequency as the sum of the term + // frequencies. The term frequency is capped at kMaxHitScore. + Hit::Score final_score = + std::min(static_cast<int>(Hit::kMaxHitScore), + hit.score() + best_prefix_hit_.hit().score()); + best_prefix_hit_ = + TermIdHitPair(term_id_hit_pair.term_id(), + Hit(hit.section_id(), hit.document_id(), final_score, + best_prefix_hit_.hit().is_in_prefix_section(), + best_prefix_hit_.hit().is_prefix_hit())); } } void SelectExactHitIfBetter(const TermIdHitPair& term_id_hit_pair) { - if (!best_exact_hit_.hit().is_valid() || - best_exact_hit_.hit().score() < term_id_hit_pair.hit().score()) { + if (!best_exact_hit_.hit().is_valid()) { best_exact_hit_ = term_id_hit_pair; + } else { + const Hit& hit = term_id_hit_pair.hit(); + // Create a new exact hit with term_frequency as the sum of the term + // frequencies. The term frequency is capped at kMaxHitScore. + Hit::Score final_score = + std::min(static_cast<int>(Hit::kMaxHitScore), + hit.score() + best_exact_hit_.hit().score()); + best_exact_hit_ = + TermIdHitPair(term_id_hit_pair.term_id(), + Hit(hit.section_id(), hit.document_id(), final_score, + best_exact_hit_.hit().is_in_prefix_section(), + best_exact_hit_.hit().is_prefix_hit())); } } @@ -166,10 +192,10 @@ class HitComparator { // {"foot", docid0, sectionid0} // {"fool", docid0, sectionid0} // -// When duplicates are encountered, we prefer the hit with the highest hit -// score. If there is both an exact and prefix hit for the same term, we prefer -// the exact hit, unless they have different scores, in which case we keep both -// them. +// When two or more prefix hits are duplicates, merge into one hit with score as +// the sum of the scores. If there is both an exact and prefix hit for the same +// term, keep the exact hit as it is, update the prefix hit so that its score is +// the sum of the scores. void DedupeHits( std::vector<TermIdHitPair>* hits, const TermIdCodec& term_id_codec, const std::unordered_map<uint32_t, int>& main_tvi_to_block_index) { diff --git a/icing/index/main/main-index-merger_test.cc b/icing/index/main/main-index-merger_test.cc index 59d3e82..93f4576 100644 --- a/icing/index/main/main-index-merger_test.cc +++ b/icing/index/main/main-index-merger_test.cc @@ -89,7 +89,7 @@ TEST_F(MainIndexMergerTest, TranslateTermNotAdded) { Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); - Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore, + Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit)); @@ -128,7 +128,7 @@ TEST_F(MainIndexMergerTest, PrefixExpansion) { Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); - Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore, + Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultHitScore, /*is_in_prefix_section=*/true); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit)); @@ -138,7 +138,8 @@ TEST_F(MainIndexMergerTest, PrefixExpansion) { ICING_ASSERT_OK_AND_ASSIGN( uint32_t foo_term_id, term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN)); - Hit doc1_prefix_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore, + Hit doc1_prefix_hit(/*section_id=*/0, /*document_id=*/1, + Hit::kDefaultHitScore, /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true); uint32_t foot_main_tvi = 5; @@ -190,7 +191,7 @@ TEST_F(MainIndexMergerTest, DedupePrefixAndExactWithDifferentScores) { Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57, /*is_in_prefix_section=*/true); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit)); - Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultHitScore, /*is_in_prefix_section=*/true); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit)); @@ -201,8 +202,8 @@ TEST_F(MainIndexMergerTest, DedupePrefixAndExactWithDifferentScores) { uint32_t foo_main_term_id, term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN)); // The prefix hit for 'foot' should have the same score as the exact hit for - // 'foot'. - Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57, + // 'foot'. The final prefix hit has score equal to 58. + Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/58, /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true); uint32_t foot_main_tvi = 5; @@ -221,8 +222,8 @@ TEST_F(MainIndexMergerTest, DedupePrefixAndExactWithDifferentScores) { // 3. TranslateAndExpand should; // a. Translate lite term ids to main term ids based on the map // b. Expand 'foot' to have a hit for 'foo' - // c. Keep both the exact hit for 'foo' and the prefix hit for 'foot' - // because they have different scores. + // c. Keep both the exact hit for 'foo' and the prefix hit for 'foot', the + // latter with score as the sum of the scores. ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermIdHitPair> expanded_term_id_hit_pairs, MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_, @@ -255,6 +256,10 @@ TEST_F(MainIndexMergerTest, DedupeWithExactSameScores) { Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57, /*is_in_prefix_section=*/true); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit)); + // The prefix hit should take the sum as score - 114. + Hit prefix_foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/114, + /*is_in_prefix_section=*/true, + /*is_prefix_hit=*/true); // 2. Build up a fake LexiconMergeOutputs // This is some made up number that doesn't matter for this test. @@ -279,16 +284,17 @@ TEST_F(MainIndexMergerTest, DedupeWithExactSameScores) { // 3. TranslateAndExpand should; // a. Translate lite term ids to main term ids based on the map // b. Expand 'foot' to have a hit for 'foo' - // c. Keep only the exact hit for 'foo' since they both have the same hit - // score. + // c. Keep both the exact hit for 'foo' and the prefix hit for 'foot', the + // latter with score as the sum of the scores. ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermIdHitPair> expanded_term_id_hit_pairs, MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_, lexicon_outputs)); - EXPECT_THAT( - expanded_term_id_hit_pairs, - UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit), - TermIdHitPair(foo_main_term_id, foo_doc0_hit))); + EXPECT_THAT(expanded_term_id_hit_pairs, + UnorderedElementsAre( + TermIdHitPair(foot_main_term_id, foot_doc0_hit), + TermIdHitPair(foo_main_term_id, foo_doc0_hit), + TermIdHitPair(foo_main_term_id, prefix_foo_doc0_hit))); } TEST_F(MainIndexMergerTest, DedupePrefixExpansion) { @@ -307,10 +313,11 @@ TEST_F(MainIndexMergerTest, DedupePrefixExpansion) { uint32_t fool_term_id, term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE)); - Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57, + Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, + /*score=*/Hit::kMaxHitScore, /*is_in_prefix_section=*/true); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit)); - Hit fool_doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + Hit fool_doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultHitScore, /*is_in_prefix_section=*/true); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, fool_doc0_hit)); @@ -320,9 +327,9 @@ TEST_F(MainIndexMergerTest, DedupePrefixExpansion) { ICING_ASSERT_OK_AND_ASSIGN( uint32_t foo_term_id, term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN)); - // The prefix hit should take the best score - MaxHitScore when merging these - // two. - Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + // The prefix hit should take the sum as score - 256, capped at kMaxHitScore. + Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0, + /*score=*/Hit::kMaxHitScore, /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true); uint32_t foot_main_tvi = 5; @@ -349,7 +356,7 @@ TEST_F(MainIndexMergerTest, DedupePrefixExpansion) { // 3. TranslateAndExpand should; // a. Translate lite term ids to main term ids based on the map // b. Expand 'foot' and 'fool' to have hits for 'foo' - // c. Merge the prefix hits from 'foot' and 'fool', taking the best hit + // c. Merge the prefix hits from 'foot' and 'fool', taking the sum as hit // score. ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermIdHitPair> expanded_term_id_hit_pairs, diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc index abe7181..0f87b09 100644 --- a/icing/index/main/main-index_test.cc +++ b/icing/index/main/main-index_test.cc @@ -145,7 +145,7 @@ TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsValidAccessor) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultHitScore, /*is_in_prefix_section=*/true); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); @@ -182,7 +182,7 @@ TEST_F(MainIndexTest, MainIndexGetAccessorForExactReturnsValidAccessor) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); @@ -219,18 +219,18 @@ TEST_F(MainIndexTest, MergeIndexToEmpty) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t far_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit)); ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit)); - Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore, + Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultHitScore, /*is_in_prefix_section=*/true); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit)); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit)); - Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kMaxHitScore, + Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit)); ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit)); @@ -292,18 +292,18 @@ TEST_F(MainIndexTest, MergeIndexToPreexisting) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t far_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit)); ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit)); - Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore, + Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultHitScore, /*is_in_prefix_section=*/true); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit)); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit)); - Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kMaxHitScore, + Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit)); ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit)); @@ -345,14 +345,14 @@ TEST_F(MainIndexTest, MergeIndexToPreexisting) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t fall_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - Hit doc3_hit(/*section_id=*/0, /*document_id=*/3, Hit::kMaxHitScore, + Hit doc3_hit(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc3_hit)); ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc3_hit)); ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc3_hit)); ICING_ASSERT_OK(lite_index_->AddHit(fall_term_id, doc3_hit)); - Hit doc4_hit(/*section_id=*/0, /*document_id=*/4, Hit::kMaxHitScore, + Hit doc4_hit(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultHitScore, /*is_in_prefix_section=*/true); ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc4_hit)); ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc4_hit)); @@ -404,15 +404,15 @@ TEST_F(MainIndexTest, ExactRetrievedInPrefixSearch) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultHitScore, /*is_in_prefix_section=*/true); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); - Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore, + Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit)); - Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kMaxHitScore, + Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc2_hit)); @@ -453,15 +453,15 @@ TEST_F(MainIndexTest, PrefixNotRetrievedInExactSearch) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultHitScore, /*is_in_prefix_section=*/true); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); - Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore, + Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit)); - Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kMaxHitScore, + Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultHitScore, /*is_in_prefix_section=*/true); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc2_hit)); @@ -500,17 +500,17 @@ TEST_F(MainIndexTest, SearchChainedPostingLists) { for (DocumentId document_id = 0; document_id < 2048; ++document_id) { Hit doc_hit0(/*section_id=*/0, /*document_id=*/document_id, - Hit::kMaxHitScore, + Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit0)); Hit doc_hit1(/*section_id=*/1, /*document_id=*/document_id, - Hit::kMaxHitScore, + Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit1)); Hit doc_hit2(/*section_id=*/2, /*document_id=*/document_id, - Hit::kMaxHitScore, + Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit2)); } @@ -543,7 +543,7 @@ TEST_F(MainIndexTest, MergeIndexBackfilling) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t fool_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultHitScore, /*is_in_prefix_section=*/true); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit)); @@ -570,7 +570,7 @@ TEST_F(MainIndexTest, MergeIndexBackfilling) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore, + Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultHitScore, /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit)); diff --git a/icing/index/main/posting-list-accessor_test.cc b/icing/index/main/posting-list-accessor_test.cc index 8a5ef07..85f6d4a 100644 --- a/icing/index/main/posting-list-accessor_test.cc +++ b/icing/index/main/posting-list-accessor_test.cc @@ -82,7 +82,7 @@ TEST(PostingListAccessorStorageTest, PreexistingPLKeepOnSameBlock) { ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, PostingListAccessor::Create(&flash_index_storage)); // Add a single hit. This will fit in a min-sized posting list. - Hit hit1(/*section_id=*/1, /*document_id=*/0, Hit::kMaxHitScore); + Hit hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultHitScore); ICING_ASSERT_OK(pl_accessor.PrependHit(hit1)); PostingListAccessor::FinalizeResult result1 = PostingListAccessor::Finalize(std::move(pl_accessor)); @@ -324,14 +324,14 @@ TEST(PostingListAccessorStorageTest, HitsNotDecreasingReturnsInvalidArgument) { FlashIndexStorage::Create(file_name, &filesystem)); ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, PostingListAccessor::Create(&flash_index_storage)); - Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kMaxHitScore); + Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultHitScore); ICING_ASSERT_OK(pl_accessor.PrependHit(hit1)); - Hit hit2(/*section_id=*/6, /*document_id=*/1, Hit::kMaxHitScore); + Hit hit2(/*section_id=*/6, /*document_id=*/1, Hit::kDefaultHitScore); EXPECT_THAT(pl_accessor.PrependHit(hit2), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - Hit hit3(/*section_id=*/2, /*document_id=*/0, Hit::kMaxHitScore); + Hit hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultHitScore); EXPECT_THAT(pl_accessor.PrependHit(hit3), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } @@ -364,7 +364,7 @@ TEST(PostingListAccessorStorageTest, PreexistingPostingListNoHitsAdded) { FlashIndexStorage::Create(file_name, &filesystem)); ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, PostingListAccessor::Create(&flash_index_storage)); - Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kMaxHitScore); + Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultHitScore); ICING_ASSERT_OK(pl_accessor.PrependHit(hit1)); PostingListAccessor::FinalizeResult result1 = PostingListAccessor::Finalize(std::move(pl_accessor)); diff --git a/icing/index/main/posting-list-used_test.cc b/icing/index/main/posting-list-used_test.cc index eb62aeb..f6b5898 100644 --- a/icing/index/main/posting-list-used_test.cc +++ b/icing/index/main/posting-list-used_test.cc @@ -80,7 +80,7 @@ TEST(PostingListTest, PostingListUsedPrependHitNotFull) { EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size)); EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit0))); - Hit hit1(/*section_id=*/0, 1, Hit::kMaxHitScore); + Hit hit1(/*section_id=*/0, 1, Hit::kDefaultHitScore); pl_used.PrependHit(hit1); // Size = sizeof(uncompressed hit1) // + sizeof(hit0-hit1) + sizeof(hit0::score) @@ -97,7 +97,7 @@ TEST(PostingListTest, PostingListUsedPrependHitNotFull) { EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size)); EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit2, hit1, hit0))); - Hit hit3(/*section_id=*/0, 3, Hit::kMaxHitScore); + Hit hit3(/*section_id=*/0, 3, Hit::kDefaultHitScore); pl_used.PrependHit(hit3); // Size = sizeof(uncompressed hit3) // + sizeof(hit2-hit3) + sizeof(hit2::score) @@ -122,7 +122,7 @@ TEST(PostingListTest, PostingListUsedPrependHitAlmostFull) { // Adding hit0: EMPTY -> NOT_FULL // Adding hit1: NOT_FULL -> NOT_FULL // Adding hit2: NOT_FULL -> NOT_FULL - Hit hit0(/*section_id=*/0, 0, Hit::kMaxHitScore); + Hit hit0(/*section_id=*/0, 0, Hit::kDefaultHitScore); Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/2); Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2); ICING_EXPECT_OK(pl_used.PrependHit(hit0)); @@ -227,7 +227,7 @@ TEST(PostingListTest, PostingListPrependHitArrayMinSizePostingList) { static_cast<void *>(hits_buf.get()), size)); std::vector<HitElt> hits_in; - hits_in.emplace_back(Hit(1, 0, Hit::kMaxHitScore)); + hits_in.emplace_back(Hit(1, 0, Hit::kDefaultHitScore)); hits_in.emplace_back( CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1)); hits_in.emplace_back( @@ -268,7 +268,7 @@ TEST(PostingListTest, PostingListPrependHitArrayPostingList) { static_cast<void *>(hits_buf.get()), size)); std::vector<HitElt> hits_in; - hits_in.emplace_back(Hit(1, 0, Hit::kMaxHitScore)); + hits_in.emplace_back(Hit(1, 0, Hit::kDefaultHitScore)); hits_in.emplace_back( CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1)); hits_in.emplace_back( diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc index 9dc5c07..eb8b7a4 100644 --- a/icing/query/query-processor_benchmark.cc +++ b/icing/query/query-processor_benchmark.cc @@ -70,7 +70,8 @@ void AddTokenToIndex(Index* index, DocumentId document_id, SectionId section_id, const std::string& token) { Index::Editor editor = index->Edit(document_id, section_id, term_match_type, /*namespace_id=*/0); - ICING_ASSERT_OK(editor.AddHit(token.c_str())); + ICING_ASSERT_OK(editor.BufferTerm(token.c_str())); + ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); } std::unique_ptr<Index> CreateIndex(const IcingFilesystem& icing_filesystem, diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc index 8c46736..7546ae4 100644 --- a/icing/query/query-processor_test.cc +++ b/icing/query/query-processor_test.cc @@ -128,7 +128,8 @@ class QueryProcessorTest : public Test { TermMatchType::Code term_match_type, const std::string& token) { Index::Editor editor = index_->Edit(document_id, section_id, term_match_type, /*namespace_id=*/0); - return editor.AddHit(token.c_str()); + auto status = editor.BufferTerm(token.c_str()); + return status.ok() ? editor.IndexAllBufferedTerms() : status; } void TearDown() override { diff --git a/icing/result/page-result-state.h b/icing/result/page-result-state.h index a26c44e..85f1dd7 100644 --- a/icing/result/page-result-state.h +++ b/icing/result/page-result-state.h @@ -18,6 +18,7 @@ #include <cstdint> #include <vector> +#include "icing/result/projection-tree.h" #include "icing/result/snippet-context.h" #include "icing/scoring/scored-document-hit.h" @@ -29,10 +30,12 @@ struct PageResultState { PageResultState(std::vector<ScoredDocumentHit> scored_document_hits_in, uint64_t next_page_token_in, SnippetContext snippet_context_in, + std::unordered_map<std::string, ProjectionTree> tree_map, int num_previously_returned_in) : scored_document_hits(std::move(scored_document_hits_in)), next_page_token(next_page_token_in), snippet_context(std::move(snippet_context_in)), + projection_tree_map(std::move(tree_map)), num_previously_returned(num_previously_returned_in) {} // Results of one page @@ -44,6 +47,9 @@ struct PageResultState { // Information needed for snippeting. SnippetContext snippet_context; + // Information needed for projection. + std::unordered_map<std::string, ProjectionTree> projection_tree_map; + // Number of results that have been returned in previous pages. int num_previously_returned; }; diff --git a/icing/result/projection-tree.cc b/icing/result/projection-tree.cc new file mode 100644 index 0000000..382fcb4 --- /dev/null +++ b/icing/result/projection-tree.cc @@ -0,0 +1,50 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/result/projection-tree.h" + +#include <algorithm> + +#include "icing/absl_ports/str_join.h" +#include "icing/schema/section-manager.h" + +namespace icing { +namespace lib { + +ProjectionTree::ProjectionTree( + const ResultSpecProto::TypePropertyMask& type_field_mask) { + for (const std::string& field_mask : type_field_mask.paths()) { + Node* current_node = &root_; + for (std::string_view sub_field_mask : + absl_ports::StrSplit(field_mask, kPropertySeparator)) { + current_node = AddChildNode(sub_field_mask, ¤t_node->children); + } + } +} + +ProjectionTree::Node* ProjectionTree::AddChildNode( + std::string_view property_name, std::vector<Node>* current_children) { + auto itr = std::find_if(current_children->begin(), current_children->end(), + [&property_name](const Node& node) { + return node.name == property_name; + }); + if (itr != current_children->end()) { + return &(*itr); + } + current_children->push_back(ProjectionTree::Node(property_name)); + return ¤t_children->back(); +} + +} // namespace lib +} // namespace icing diff --git a/icing/result/projection-tree.h b/icing/result/projection-tree.h new file mode 100644 index 0000000..7ace295 --- /dev/null +++ b/icing/result/projection-tree.h @@ -0,0 +1,53 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_RESULT_PROJECTION_TREE_H_ +#define ICING_RESULT_PROJECTION_TREE_H_ + +#include <string_view> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/proto/search.pb.h" + +namespace icing { +namespace lib { + +class ProjectionTree { + public: + struct Node { + explicit Node(std::string_view name = "") : name(name) {} + + std::string_view name; + std::vector<Node> children; + }; + + explicit ProjectionTree( + const ResultSpecProto::TypePropertyMask& type_field_mask); + + const Node& root() const { return root_; } + + private: + // Add a child node with property_name to current_children and returns a + // pointer to the child node. + Node* AddChildNode(std::string_view property_name, + std::vector<Node>* current_children); + + Node root_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_RESULT_PROJECTION_TREE_H_ diff --git a/icing/result/projection-tree_test.cc b/icing/result/projection-tree_test.cc new file mode 100644 index 0000000..77d1d21 --- /dev/null +++ b/icing/result/projection-tree_test.cc @@ -0,0 +1,102 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/result/projection-tree.h" + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/proto/search.pb.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::Eq; +using ::testing::IsEmpty; +using ::testing::SizeIs; + +TEST(ProjectionTreeTest, CreateEmptyFieldMasks) { + ResultSpecProto::TypePropertyMask type_field_mask; + ProjectionTree tree(type_field_mask); + EXPECT_THAT(tree.root().name, IsEmpty()); + EXPECT_THAT(tree.root().children, IsEmpty()); +} + +TEST(ProjectionTreeTest, CreateTreeTopLevel) { + ResultSpecProto::TypePropertyMask type_field_mask; + type_field_mask.add_paths("subject"); + + ProjectionTree tree(type_field_mask); + EXPECT_THAT(tree.root().name, IsEmpty()); + ASSERT_THAT(tree.root().children, SizeIs(1)); + ASSERT_THAT(tree.root().children.at(0).name, Eq("subject")); + ASSERT_THAT(tree.root().children.at(0).children, IsEmpty()); +} + +TEST(ProjectionTreeTest, CreateTreeMultipleTopLevel) { + ResultSpecProto::TypePropertyMask type_field_mask; + type_field_mask.add_paths("subject"); + type_field_mask.add_paths("body"); + + ProjectionTree tree(type_field_mask); + EXPECT_THAT(tree.root().name, IsEmpty()); + ASSERT_THAT(tree.root().children, SizeIs(2)); + ASSERT_THAT(tree.root().children.at(0).name, Eq("subject")); + ASSERT_THAT(tree.root().children.at(0).children, IsEmpty()); + ASSERT_THAT(tree.root().children.at(1).name, Eq("body")); + ASSERT_THAT(tree.root().children.at(1).children, IsEmpty()); +} + +TEST(ProjectionTreeTest, CreateTreeNested) { + ResultSpecProto::TypePropertyMask type_field_mask; + type_field_mask.add_paths("subject.body"); + type_field_mask.add_paths("body"); + + ProjectionTree tree(type_field_mask); + EXPECT_THAT(tree.root().name, IsEmpty()); + ASSERT_THAT(tree.root().children, SizeIs(2)); + ASSERT_THAT(tree.root().children.at(0).name, Eq("subject")); + ASSERT_THAT(tree.root().children.at(0).children, SizeIs(1)); + ASSERT_THAT(tree.root().children.at(0).children.at(0).name, Eq("body")); + ASSERT_THAT(tree.root().children.at(0).children.at(0).children, IsEmpty()); + ASSERT_THAT(tree.root().children.at(1).name, Eq("body")); + ASSERT_THAT(tree.root().children.at(1).children, IsEmpty()); +} + +TEST(ProjectionTreeTest, CreateTreeNestedSharedNode) { + ResultSpecProto::TypePropertyMask type_field_mask; + type_field_mask.add_paths("sender.name.first"); + type_field_mask.add_paths("sender.emailAddress"); + + ProjectionTree tree(type_field_mask); + EXPECT_THAT(tree.root().name, IsEmpty()); + ASSERT_THAT(tree.root().children, SizeIs(1)); + ASSERT_THAT(tree.root().children.at(0).name, Eq("sender")); + ASSERT_THAT(tree.root().children.at(0).children, SizeIs(2)); + ASSERT_THAT(tree.root().children.at(0).children.at(0).name, Eq("name")); + ASSERT_THAT(tree.root().children.at(0).children.at(0).children, SizeIs(1)); + ASSERT_THAT(tree.root().children.at(0).children.at(0).children.at(0).name, + Eq("first")); + ASSERT_THAT(tree.root().children.at(0).children.at(0).children.at(0).children, + IsEmpty()); + ASSERT_THAT(tree.root().children.at(0).children.at(1).name, + Eq("emailAddress")); + ASSERT_THAT(tree.root().children.at(0).children.at(1).children, IsEmpty()); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/result/result-retriever.cc b/icing/result/result-retriever.cc index f09d834..ff6320b 100644 --- a/icing/result/result-retriever.cc +++ b/icing/result/result-retriever.cc @@ -14,15 +14,56 @@ #include "icing/result/result-retriever.h" +#include <string_view> +#include <utility> + #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/proto/search.pb.h" #include "icing/proto/term.pb.h" #include "icing/result/page-result-state.h" +#include "icing/result/projection-tree.h" #include "icing/result/snippet-context.h" #include "icing/util/status-macros.h" namespace icing { namespace lib { + +namespace { + +void Project(const std::vector<ProjectionTree::Node>& projection_tree, + proto2::RepeatedPtrField<PropertyProto>* properties) { + int num_kept = 0; + for (int cur_pos = 0; cur_pos < properties->size(); ++cur_pos) { + PropertyProto* prop = properties->Mutable(cur_pos); + auto itr = std::find_if(projection_tree.begin(), projection_tree.end(), + [&prop](const ProjectionTree::Node& node) { + return node.name == prop->name(); + }); + if (itr == projection_tree.end()) { + // Property is not present in the projection tree. Just skip it. + continue; + } + // This property should be kept. + properties->SwapElements(num_kept, cur_pos); + ++num_kept; + if (itr->children.empty()) { + // A field mask does refer to this property, but it has no children. So + // we should take the entire property, with all of its + // subproperties/values + continue; + } + // The field mask refers to children of this property. Recurse through the + // document values that this property holds and project the children + // requested by this field mask. + for (DocumentProto& subproperty : *(prop->mutable_document_values())) { + Project(itr->children, subproperty.mutable_properties()); + } + } + properties->DeleteSubrange(num_kept, properties->size() - num_kept); +} + +} // namespace + libtextclassifier3::StatusOr<std::unique_ptr<ResultRetriever>> ResultRetriever::Create(const DocumentStore* doc_store, const SchemaStore* schema_store, @@ -74,6 +115,14 @@ ResultRetriever::RetrieveResults( } } + // Apply projection + auto itr = page_result_state.projection_tree_map.find( + document_or.ValueOrDie().schema()); + if (itr != page_result_state.projection_tree_map.end()) { + Project(itr->second.root().children, + document_or.ValueOrDie().mutable_properties()); + } + SearchResultProto::ResultProto result; // Add the snippet if requested. if (snippet_context.snippet_spec.num_matches_per_property() > 0 && diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc index 1078c5a..82e32ee 100644 --- a/icing/result/result-retriever_test.cc +++ b/icing/result/result-retriever_test.cc @@ -16,6 +16,8 @@ #include <limits> #include <memory> +#include <string_view> +#include <unordered_map> #include "gtest/gtest.h" #include "icing/document-builder.h" @@ -26,6 +28,7 @@ #include "icing/proto/schema.pb.h" #include "icing/proto/search.pb.h" #include "icing/proto/term.pb.h" +#include "icing/result/projection-tree.h" #include "icing/schema/schema-store.h" #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" @@ -74,30 +77,63 @@ class ResultRetrieverTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create( /*max_term_byte_size=*/10000)); + ASSERT_THAT(schema_store_->SetSchema(CreatePersonAndEmailSchema()), IsOk()); + } + + void TearDown() override { + filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); + } + + SchemaProto CreatePersonAndEmailSchema() { SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); - PropertyConfigProto* prop_config = type_config->add_properties(); - prop_config->set_property_name("subject"); - prop_config->set_data_type(PropertyConfigProto::DataType::STRING); - prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - prop_config->mutable_string_indexing_config()->set_term_match_type( + + auto* type = schema.add_types(); + type->set_schema_type("Email"); + + auto* subj = type->add_properties(); + subj->set_property_name("subject"); + subj->set_data_type(PropertyConfigProto::DataType::STRING); + subj->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + subj->mutable_string_indexing_config()->set_term_match_type( TermMatchType::PREFIX); - prop_config->mutable_string_indexing_config()->set_tokenizer_type( + subj->mutable_string_indexing_config()->set_tokenizer_type( StringIndexingConfig::TokenizerType::PLAIN); - prop_config = type_config->add_properties(); - prop_config->set_property_name("body"); - prop_config->set_data_type(PropertyConfigProto::DataType::STRING); - prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - prop_config->mutable_string_indexing_config()->set_term_match_type( + auto* body = type->add_properties(); + body->set_property_name("body"); + body->set_data_type(PropertyConfigProto::DataType::STRING); + body->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + body->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - prop_config->mutable_string_indexing_config()->set_tokenizer_type( + body->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + auto* sender = type->add_properties(); + sender->set_property_name("sender"); + sender->set_schema_type("Person"); + sender->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + sender->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + sender->mutable_document_indexing_config()->set_index_nested_properties( + true); + + auto* person_type = schema.add_types(); + person_type->set_schema_type("Person"); + auto* name = person_type->add_properties(); + name->set_property_name("name"); + name->set_data_type(PropertyConfigProto::DataType::STRING); + name->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + name->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + name->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + auto* address = person_type->add_properties(); + address->set_property_name("emailAddress"); + address->set_data_type(PropertyConfigProto::DataType::STRING); + address->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + address->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + address->mutable_string_indexing_config()->set_tokenizer_type( StringIndexingConfig::TokenizerType::PLAIN); - ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); - } - void TearDown() override { - filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); + return schema; } const Filesystem filesystem_; @@ -118,8 +154,8 @@ ResultSpecProto::SnippetSpecProto CreateSnippetSpec() { DocumentProto CreateDocument(int id) { return DocumentBuilder() - .SetKey("icing", "email/" + std::to_string(id)) - .SetSchema("email") + .SetKey("icing", "Email/" + std::to_string(id)) + .SetSchema("Email") .AddStringProperty("subject", "subject foo " + std::to_string(id)) .AddStringProperty("body", "body bar " + std::to_string(id)) .SetCreationTimestampMs(1574365086666 + id) @@ -169,9 +205,9 @@ TEST_F(ResultRetrieverTest, ShouldRetrieveSimpleResults) { doc_store->Put(CreateDocument(/*id=*/3))); std::vector<ScoredDocumentHit> scored_document_hits = { - {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}}; + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id3, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ResultRetriever> result_retriever, ResultRetriever::Create(doc_store.get(), schema_store_.get(), @@ -190,7 +226,9 @@ TEST_F(ResultRetrieverTest, ShouldRetrieveSimpleResults) { TermMatchType::EXACT_ONLY); PageResultState page_result_state( std::move(scored_document_hits), /*next_page_token_in=*/1, - std::move(snippet_context), /*num_previously_returned_in=*/0); + std::move(snippet_context), + std::unordered_map<std::string, ProjectionTree>(), + /*num_previously_returned_in=*/0); EXPECT_THAT( result_retriever->RetrieveResults(page_result_state), IsOkAndHolds(ElementsAre(EqualsProto(result1), EqualsProto(result2), @@ -212,9 +250,9 @@ TEST_F(ResultRetrieverTest, IgnoreErrors) { DocumentId invalid_document_id = -1; std::vector<ScoredDocumentHit> scored_document_hits = { - {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {invalid_document_id, /*hit_section_id_mask=*/0b00000011, /*score=*/0}}; + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {invalid_document_id, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ResultRetriever> result_retriever, ResultRetriever::Create(doc_store.get(), schema_store_.get(), @@ -232,7 +270,9 @@ TEST_F(ResultRetrieverTest, IgnoreErrors) { TermMatchType::EXACT_ONLY); PageResultState page_result_state( std::move(scored_document_hits), /*next_page_token_in=*/1, - std::move(snippet_context), /*num_previously_returned_in=*/0); + std::move(snippet_context), + std::unordered_map<std::string, ProjectionTree>(), + /*num_previously_returned_in=*/0); EXPECT_THAT( result_retriever->RetrieveResults(page_result_state), IsOkAndHolds(ElementsAre(EqualsProto(result1), EqualsProto(result2)))); @@ -253,9 +293,9 @@ TEST_F(ResultRetrieverTest, NotIgnoreErrors) { DocumentId invalid_document_id = -1; std::vector<ScoredDocumentHit> scored_document_hits = { - {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {invalid_document_id, /*hit_section_id_mask=*/0b00000011, /*score=*/0}}; + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {invalid_document_id, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ResultRetriever> result_retriever, ResultRetriever::Create(doc_store.get(), schema_store_.get(), @@ -268,15 +308,17 @@ TEST_F(ResultRetrieverTest, NotIgnoreErrors) { TermMatchType::EXACT_ONLY); PageResultState page_result_state( std::move(scored_document_hits), /*next_page_token_in=*/1, - std::move(snippet_context), /*num_previously_returned_in=*/0); + std::move(snippet_context), + std::unordered_map<std::string, ProjectionTree>(), + /*num_previously_returned_in=*/0); EXPECT_THAT(result_retriever->RetrieveResults(page_result_state), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); DocumentId non_existing_document_id = 4; page_result_state.scored_document_hits = { - {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {non_existing_document_id, /*hit_section_id_mask=*/0b00000011, + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {non_existing_document_id, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; EXPECT_THAT(result_retriever->RetrieveResults(page_result_state), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); @@ -299,8 +341,8 @@ TEST_F(ResultRetrieverTest, IOErrorShouldReturnInternalError) { doc_store->Put(CreateDocument(/*id=*/2))); std::vector<ScoredDocumentHit> scored_document_hits = { - {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0}}; + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ResultRetriever> result_retriever, @@ -314,7 +356,9 @@ TEST_F(ResultRetrieverTest, IOErrorShouldReturnInternalError) { TermMatchType::EXACT_ONLY); PageResultState page_result_state( std::move(scored_document_hits), /*next_page_token_in=*/1, - std::move(snippet_context), /*num_previously_returned_in=*/0); + std::move(snippet_context), + std::unordered_map<std::string, ProjectionTree>(), + /*num_previously_returned_in=*/0); EXPECT_THAT(result_retriever->RetrieveResults(page_result_state), StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } @@ -335,9 +379,9 @@ TEST_F(ResultRetrieverTest, DefaultSnippetSpecShouldDisableSnippeting) { doc_store->Put(CreateDocument(/*id=*/3))); std::vector<ScoredDocumentHit> scored_document_hits = { - {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}}; + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id3, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ResultRetriever> result_retriever, ResultRetriever::Create(doc_store.get(), schema_store_.get(), @@ -349,7 +393,9 @@ TEST_F(ResultRetrieverTest, DefaultSnippetSpecShouldDisableSnippeting) { TermMatchType::EXACT_ONLY); PageResultState page_result_state( std::move(scored_document_hits), /*next_page_token_in=*/1, - std::move(snippet_context), /*num_previously_returned_in=*/0); + std::move(snippet_context), + std::unordered_map<std::string, ProjectionTree>(), + /*num_previously_returned_in=*/0); ICING_ASSERT_OK_AND_ASSIGN( std::vector<SearchResultProto::ResultProto> results, result_retriever->RetrieveResults(page_result_state)); @@ -378,9 +424,9 @@ TEST_F(ResultRetrieverTest, SimpleSnippeted) { doc_store->Put(CreateDocument(/*id=*/3))); std::vector<ScoredDocumentHit> scored_document_hits = { - {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}}; + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id3, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ResultRetriever> result_retriever, ResultRetriever::Create(doc_store.get(), schema_store_.get(), @@ -391,7 +437,9 @@ TEST_F(ResultRetrieverTest, SimpleSnippeted) { TermMatchType::EXACT_ONLY); PageResultState page_result_state( std::move(scored_document_hits), /*next_page_token_in=*/1, - std::move(snippet_context), /*num_previously_returned_in=*/0); + std::move(snippet_context), + std::unordered_map<std::string, ProjectionTree>(), + /*num_previously_returned_in=*/0); ICING_ASSERT_OK_AND_ASSIGN( std::vector<SearchResultProto::ResultProto> result, result_retriever->RetrieveResults(page_result_state)); @@ -449,9 +497,9 @@ TEST_F(ResultRetrieverTest, OnlyOneDocumentSnippeted) { snippet_spec.set_num_to_snippet(1); std::vector<ScoredDocumentHit> scored_document_hits = { - {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}}; + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id3, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ResultRetriever> result_retriever, ResultRetriever::Create(doc_store.get(), schema_store_.get(), @@ -461,7 +509,9 @@ TEST_F(ResultRetrieverTest, OnlyOneDocumentSnippeted) { snippet_spec, TermMatchType::EXACT_ONLY); PageResultState page_result_state( std::move(scored_document_hits), /*next_page_token_in=*/1, - std::move(snippet_context), /*num_previously_returned_in=*/0); + std::move(snippet_context), + std::unordered_map<std::string, ProjectionTree>(), + /*num_previously_returned_in=*/0); ICING_ASSERT_OK_AND_ASSIGN( std::vector<SearchResultProto::ResultProto> result, result_retriever->RetrieveResults(page_result_state)); @@ -502,9 +552,9 @@ TEST_F(ResultRetrieverTest, ShouldSnippetAllResults) { doc_store->Put(CreateDocument(/*id=*/3))); std::vector<ScoredDocumentHit> scored_document_hits = { - {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}}; + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id3, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ResultRetriever> result_retriever, ResultRetriever::Create(doc_store.get(), schema_store_.get(), @@ -517,7 +567,9 @@ TEST_F(ResultRetrieverTest, ShouldSnippetAllResults) { TermMatchType::EXACT_ONLY); PageResultState page_result_state( std::move(scored_document_hits), /*next_page_token_in=*/1, - std::move(snippet_context), /*num_previously_returned_in=*/0); + std::move(snippet_context), + std::unordered_map<std::string, ProjectionTree>(), + /*num_previously_returned_in=*/0); ICING_ASSERT_OK_AND_ASSIGN( std::vector<SearchResultProto::ResultProto> result, @@ -547,9 +599,9 @@ TEST_F(ResultRetrieverTest, ShouldSnippetSomeResults) { doc_store->Put(CreateDocument(/*id=*/3))); std::vector<ScoredDocumentHit> scored_document_hits = { - {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}}; + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id3, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ResultRetriever> result_retriever, ResultRetriever::Create(doc_store.get(), schema_store_.get(), @@ -562,7 +614,9 @@ TEST_F(ResultRetrieverTest, ShouldSnippetSomeResults) { TermMatchType::EXACT_ONLY); PageResultState page_result_state( std::move(scored_document_hits), /*next_page_token_in=*/1, - std::move(snippet_context), /*num_previously_returned_in=*/3); + std::move(snippet_context), + std::unordered_map<std::string, ProjectionTree>(), + /*num_previously_returned_in=*/3); // num_to_snippet = 5, num_previously_returned_in = 3, // We can return 5 - 3 = 2 snippets. @@ -591,9 +645,9 @@ TEST_F(ResultRetrieverTest, ShouldNotSnippetAnyResults) { doc_store->Put(CreateDocument(/*id=*/3))); std::vector<ScoredDocumentHit> scored_document_hits = { - {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0}, - {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}}; + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id3, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ResultRetriever> result_retriever, ResultRetriever::Create(doc_store.get(), schema_store_.get(), @@ -606,7 +660,9 @@ TEST_F(ResultRetrieverTest, ShouldNotSnippetAnyResults) { TermMatchType::EXACT_ONLY); PageResultState page_result_state( std::move(scored_document_hits), /*next_page_token_in=*/1, - std::move(snippet_context), /*num_previously_returned_in=*/6); + std::move(snippet_context), + std::unordered_map<std::string, ProjectionTree>(), + /*num_previously_returned_in=*/6); // num_to_snippet = 5, num_previously_returned_in = 6, // We can't return any snippets for this page. @@ -619,6 +675,671 @@ TEST_F(ResultRetrieverTest, ShouldNotSnippetAnyResults) { EXPECT_THAT(result[2].snippet().entries(), IsEmpty()); } +TEST_F(ResultRetrieverTest, ProjectionTopLevelLeadNodeFieldPath) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + // 1. Add two Email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + doc_store->Put(document_one)); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + doc_store->Put(document_two)); + + // 2. Setup the scored results. + std::vector<ScoredDocumentHit> scored_document_hits = { + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; + + ResultSpecProto::TypePropertyMask type_property_mask; + type_property_mask.set_schema_type("Email"); + type_property_mask.add_paths("subject"); + std::unordered_map<std::string, ProjectionTree> type_projection_tree_map; + type_projection_tree_map.insert( + {"Email", ProjectionTree(type_property_mask)}); + + SnippetContext snippet_context( + /*query_terms_in=*/{}, + ResultSpecProto::SnippetSpecProto::default_instance(), + TermMatchType::EXACT_ONLY); + PageResultState page_result_state( + std::move(scored_document_hits), /*next_page_token_in=*/1, + std::move(snippet_context), std::move(type_projection_tree_map), + /*num_previously_returned_in=*/0); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<ResultRetriever> result_retriever, + ResultRetriever::Create(doc_store.get(), schema_store_.get(), + language_segmenter_.get(), normalizer_.get())); + + // 3. Verify that the returned results only contain the 'subject' property. + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<SearchResultProto::ResultProto> result, + result_retriever->RetrieveResults(page_result_state)); + ASSERT_THAT(result, SizeIs(2)); + + DocumentProto projected_document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddStringProperty("subject", "Hello World!") + .Build(); + EXPECT_THAT(result[0].document(), EqualsProto(projected_document_one)); + + DocumentProto projected_document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddStringProperty("subject", "Goodnight Moon!") + .Build(); + EXPECT_THAT(result[1].document(), EqualsProto(projected_document_two)); +} + +TEST_F(ResultRetrieverTest, ProjectionNestedLeafNodeFieldPath) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + // 1. Add two Email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "shopgirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + doc_store->Put(document_one)); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + doc_store->Put(document_two)); + + // 2. Setup the scored results. + std::vector<ScoredDocumentHit> scored_document_hits = { + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; + + ResultSpecProto::TypePropertyMask type_property_mask; + type_property_mask.set_schema_type("Email"); + type_property_mask.add_paths("sender.name"); + std::unordered_map<std::string, ProjectionTree> type_projection_tree_map; + type_projection_tree_map.insert( + {"Email", ProjectionTree(type_property_mask)}); + + SnippetContext snippet_context( + /*query_terms_in=*/{}, + ResultSpecProto::SnippetSpecProto::default_instance(), + TermMatchType::EXACT_ONLY); + PageResultState page_result_state( + std::move(scored_document_hits), /*next_page_token_in=*/1, + std::move(snippet_context), std::move(type_projection_tree_map), + /*num_previously_returned_in=*/0); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<ResultRetriever> result_retriever, + ResultRetriever::Create(doc_store.get(), schema_store_.get(), + language_segmenter_.get(), normalizer_.get())); + + // 3. Verify that the returned results only contain the 'sender.name' + // property. + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<SearchResultProto::ResultProto> result, + result_retriever->RetrieveResults(page_result_state)); + ASSERT_THAT(result, SizeIs(2)); + + DocumentProto projected_document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty("sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .Build()) + .Build(); + EXPECT_THAT(result[0].document(), EqualsProto(projected_document_one)); + + DocumentProto projected_document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty("sender", + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .Build()) + .Build(); + EXPECT_THAT(result[1].document(), EqualsProto(projected_document_two)); +} + +TEST_F(ResultRetrieverTest, ProjectionIntermediateNodeFieldPath) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + // 1. Add two Email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "shopgirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + doc_store->Put(document_one)); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + doc_store->Put(document_two)); + + // 2. Setup the scored results. + std::vector<ScoredDocumentHit> scored_document_hits = { + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; + + ResultSpecProto::TypePropertyMask type_property_mask; + type_property_mask.set_schema_type("Email"); + type_property_mask.add_paths("sender"); + std::unordered_map<std::string, ProjectionTree> type_projection_tree_map; + type_projection_tree_map.insert( + {"Email", ProjectionTree(type_property_mask)}); + + SnippetContext snippet_context( + /*query_terms_in=*/{}, + ResultSpecProto::SnippetSpecProto::default_instance(), + TermMatchType::EXACT_ONLY); + PageResultState page_result_state( + std::move(scored_document_hits), /*next_page_token_in=*/1, + std::move(snippet_context), std::move(type_projection_tree_map), + /*num_previously_returned_in=*/0); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<ResultRetriever> result_retriever, + ResultRetriever::Create(doc_store.get(), schema_store_.get(), + language_segmenter_.get(), normalizer_.get())); + + // 3. Verify that the returned results only contain the 'sender' + // property and all of the subproperties of 'sender'. + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<SearchResultProto::ResultProto> result, + result_retriever->RetrieveResults(page_result_state)); + ASSERT_THAT(result, SizeIs(2)); + + DocumentProto projected_document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "shopgirl@aol.com") + .Build()) + .Build(); + EXPECT_THAT(result[0].document(), EqualsProto(projected_document_one)); + + DocumentProto projected_document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .Build(); + EXPECT_THAT(result[1].document(), EqualsProto(projected_document_two)); +} + +TEST_F(ResultRetrieverTest, ProjectionMultipleNestedFieldPaths) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + // 1. Add two Email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "shopgirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + doc_store->Put(document_one)); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + doc_store->Put(document_two)); + + // 2. Setup the scored results. + std::vector<ScoredDocumentHit> scored_document_hits = { + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; + + ResultSpecProto::TypePropertyMask type_property_mask; + type_property_mask.set_schema_type("Email"); + type_property_mask.add_paths("sender.name"); + type_property_mask.add_paths("sender.emailAddress"); + std::unordered_map<std::string, ProjectionTree> type_projection_tree_map; + type_projection_tree_map.insert( + {"Email", ProjectionTree(type_property_mask)}); + + SnippetContext snippet_context( + /*query_terms_in=*/{}, + ResultSpecProto::SnippetSpecProto::default_instance(), + TermMatchType::EXACT_ONLY); + PageResultState page_result_state( + std::move(scored_document_hits), /*next_page_token_in=*/1, + std::move(snippet_context), std::move(type_projection_tree_map), + /*num_previously_returned_in=*/0); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<ResultRetriever> result_retriever, + ResultRetriever::Create(doc_store.get(), schema_store_.get(), + language_segmenter_.get(), normalizer_.get())); + + // 3. Verify that the returned results only contain the 'sender.name' and + // 'sender.address' properties. + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<SearchResultProto::ResultProto> result, + result_retriever->RetrieveResults(page_result_state)); + ASSERT_THAT(result, SizeIs(2)); + + DocumentProto projected_document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "shopgirl@aol.com") + .Build()) + .Build(); + EXPECT_THAT(result[0].document(), EqualsProto(projected_document_one)); + + DocumentProto projected_document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .Build(); + EXPECT_THAT(result[1].document(), EqualsProto(projected_document_two)); +} + +TEST_F(ResultRetrieverTest, ProjectionEmptyFieldPath) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + // 1. Add two Email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + doc_store->Put(document_one)); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + doc_store->Put(document_two)); + + // 2. Setup the scored results. + std::vector<ScoredDocumentHit> scored_document_hits = { + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; + + ResultSpecProto::TypePropertyMask type_property_mask; + type_property_mask.set_schema_type("Email"); + std::unordered_map<std::string, ProjectionTree> type_projection_tree_map; + type_projection_tree_map.insert( + {"Email", ProjectionTree(type_property_mask)}); + + SnippetContext snippet_context( + /*query_terms_in=*/{}, + ResultSpecProto::SnippetSpecProto::default_instance(), + TermMatchType::EXACT_ONLY); + PageResultState page_result_state( + std::move(scored_document_hits), /*next_page_token_in=*/1, + std::move(snippet_context), std::move(type_projection_tree_map), + /*num_previously_returned_in=*/0); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<ResultRetriever> result_retriever, + ResultRetriever::Create(doc_store.get(), schema_store_.get(), + language_segmenter_.get(), normalizer_.get())); + + // 3. Verify that the returned results contain *no* properties. + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<SearchResultProto::ResultProto> result, + result_retriever->RetrieveResults(page_result_state)); + ASSERT_THAT(result, SizeIs(2)); + + DocumentProto projected_document_one = DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .Build(); + EXPECT_THAT(result[0].document(), EqualsProto(projected_document_one)); + + DocumentProto projected_document_two = DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .Build(); + EXPECT_THAT(result[1].document(), EqualsProto(projected_document_two)); +} + +TEST_F(ResultRetrieverTest, ProjectionInvalidFieldPath) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + // 1. Add two Email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + doc_store->Put(document_one)); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + doc_store->Put(document_two)); + + // 2. Setup the scored results. + std::vector<ScoredDocumentHit> scored_document_hits = { + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; + + ResultSpecProto::TypePropertyMask type_property_mask; + type_property_mask.set_schema_type("Email"); + type_property_mask.add_paths("nonExistentProperty"); + std::unordered_map<std::string, ProjectionTree> type_projection_tree_map; + type_projection_tree_map.insert( + {"Email", ProjectionTree(type_property_mask)}); + + SnippetContext snippet_context( + /*query_terms_in=*/{}, + ResultSpecProto::SnippetSpecProto::default_instance(), + TermMatchType::EXACT_ONLY); + PageResultState page_result_state( + std::move(scored_document_hits), /*next_page_token_in=*/1, + std::move(snippet_context), std::move(type_projection_tree_map), + /*num_previously_returned_in=*/0); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<ResultRetriever> result_retriever, + ResultRetriever::Create(doc_store.get(), schema_store_.get(), + language_segmenter_.get(), normalizer_.get())); + + // 3. Verify that the returned results contain *no* properties. + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<SearchResultProto::ResultProto> result, + result_retriever->RetrieveResults(page_result_state)); + ASSERT_THAT(result, SizeIs(2)); + + DocumentProto projected_document_one = DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .Build(); + EXPECT_THAT(result[0].document(), EqualsProto(projected_document_one)); + + DocumentProto projected_document_two = DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .Build(); + EXPECT_THAT(result[1].document(), EqualsProto(projected_document_two)); +} + +TEST_F(ResultRetrieverTest, ProjectionValidAndInvalidFieldPath) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + // 1. Add two Email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + doc_store->Put(document_one)); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + doc_store->Put(document_two)); + + // 2. Setup the scored results. + std::vector<ScoredDocumentHit> scored_document_hits = { + {document_id1, /*hit_section_id_mask=*/0b00001001, /*score=*/0}, + {document_id2, /*hit_section_id_mask=*/0b00001001, /*score=*/0}}; + + ResultSpecProto::TypePropertyMask type_property_mask; + type_property_mask.set_schema_type("Email"); + type_property_mask.add_paths("subject"); + type_property_mask.add_paths("nonExistentProperty"); + std::unordered_map<std::string, ProjectionTree> type_projection_tree_map; + type_projection_tree_map.insert( + {"Email", ProjectionTree(type_property_mask)}); + + SnippetContext snippet_context( + /*query_terms_in=*/{}, + ResultSpecProto::SnippetSpecProto::default_instance(), + TermMatchType::EXACT_ONLY); + PageResultState page_result_state( + std::move(scored_document_hits), /*next_page_token_in=*/1, + std::move(snippet_context), std::move(type_projection_tree_map), + /*num_previously_returned_in=*/0); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<ResultRetriever> result_retriever, + ResultRetriever::Create(doc_store.get(), schema_store_.get(), + language_segmenter_.get(), normalizer_.get())); + + // 3. Verify that the returned results only contain the 'subject' property. + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<SearchResultProto::ResultProto> result, + result_retriever->RetrieveResults(page_result_state)); + ASSERT_THAT(result, SizeIs(2)); + + DocumentProto projected_document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddStringProperty("subject", "Hello World!") + .Build(); + EXPECT_THAT(result[0].document(), EqualsProto(projected_document_one)); + + DocumentProto projected_document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddStringProperty("subject", "Goodnight Moon!") + .Build(); + EXPECT_THAT(result[1].document(), EqualsProto(projected_document_two)); +} + } // namespace } // namespace lib diff --git a/icing/result/result-state-manager.cc b/icing/result/result-state-manager.cc index ff5dbf0..4488409 100644 --- a/icing/result/result-state-manager.cc +++ b/icing/result/result-state-manager.cc @@ -43,21 +43,26 @@ ResultStateManager::RankAndPaginate(ResultState result_state) { std::vector<ScoredDocumentHit> page_result_document_hits = result_state.GetNextPage(); + SnippetContext snippet_context_copy = result_state.snippet_context(); + + std::unordered_map<std::string, ProjectionTree> projection_tree_map_copy = + result_state.projection_tree_map(); if (!result_state.HasMoreResults()) { // No more pages, won't store ResultState, returns directly return PageResultState( std::move(page_result_document_hits), kInvalidNextPageToken, - result_state.snippet_context(), num_previously_returned); + std::move(snippet_context_copy), std::move(projection_tree_map_copy), + num_previously_returned); } absl_ports::unique_lock l(&mutex_); // ResultState has multiple pages, storing it - SnippetContext snippet_context_copy = result_state.snippet_context(); uint64_t next_page_token = Add(std::move(result_state)); return PageResultState(std::move(page_result_document_hits), next_page_token, std::move(snippet_context_copy), + std::move(projection_tree_map_copy), num_previously_returned); } @@ -97,13 +102,17 @@ libtextclassifier3::StatusOr<PageResultState> ResultStateManager::GetNextPage( SnippetContext snippet_context_copy = state_iterator->second.snippet_context(); + std::unordered_map<std::string, ProjectionTree> projection_tree_map_copy = + state_iterator->second.projection_tree_map(); + if (!state_iterator->second.HasMoreResults()) { InternalInvalidateResultState(next_page_token); next_page_token = kInvalidNextPageToken; } return PageResultState(result_of_page, next_page_token, - std::move(snippet_context_copy), num_returned); + std::move(snippet_context_copy), + std::move(projection_tree_map_copy), num_returned); } void ResultStateManager::InvalidateResultState(uint64_t next_page_token) { diff --git a/icing/result/result-state.cc b/icing/result/result-state.cc index bf28f52..f1479b9 100644 --- a/icing/result/result-state.cc +++ b/icing/result/result-state.cc @@ -14,6 +14,7 @@ #include "icing/result/result-state.h" +#include "icing/result/projection-tree.h" #include "icing/scoring/ranker.h" #include "icing/util/logging.h" @@ -46,6 +47,11 @@ ResultState::ResultState(std::vector<ScoredDocumentHit> scored_document_hits, num_returned_(0), scored_document_hit_comparator_(scoring_spec.order_by() == ScoringSpecProto::Order::DESC) { + for (const ResultSpecProto::TypePropertyMask& type_field_mask : + result_spec.type_property_masks()) { + projection_tree_map_.insert( + {type_field_mask.schema_type(), ProjectionTree(type_field_mask)}); + } BuildHeapInPlace(&scored_document_hits_, scored_document_hit_comparator_); } diff --git a/icing/result/result-state.h b/icing/result/result-state.h index 82e783b..de36b40 100644 --- a/icing/result/result-state.h +++ b/icing/result/result-state.h @@ -19,6 +19,7 @@ #include "icing/proto/scoring.pb.h" #include "icing/proto/search.pb.h" +#include "icing/result/projection-tree.h" #include "icing/result/snippet-context.h" #include "icing/scoring/scored-document-hit.h" @@ -52,6 +53,13 @@ class ResultState { // constructor. const SnippetContext& snippet_context() const { return snippet_context_; } + // Returns a vector of TypePropertyMasks generated from the specs passed in + // via constructor. + const std::unordered_map<std::string, ProjectionTree>& projection_tree_map() + const { + return projection_tree_map_; + } + // The number of results that have already been returned. This number is // increased when GetNextPage() is called. int num_returned() const { return num_returned_; } @@ -65,6 +73,9 @@ class ResultState { // Information needed for snippeting. SnippetContext snippet_context_; + // Information needed for projection. + std::unordered_map<std::string, ProjectionTree> projection_tree_map_; + // Number of results to return in each page. int num_per_page_; diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc index 9626a0b..49e7096 100644 --- a/icing/schema/schema-util.cc +++ b/icing/schema/schema-util.cc @@ -107,6 +107,21 @@ libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) { // already. std::unordered_set<std::string_view> known_property_names; + // Tracks which schemas reference other schemas. This is used to detect + // infinite loops between indexed schema references (e.g. A -> B -> C -> A). + // We could get into an infinite loop while trying to assign section ids. + // + // The key is the "child" schema that is being referenced within another + // schema. + // The value is a set of all the direct/indirect "parent" schemas that + // reference the "child" schema. + // + // For example, if A has a nested document property of type B, then A is the + // "parent" and B is the "child" and so schema_references will contain + // schema_references[B] == {A}. + std::unordered_map<std::string_view, std::unordered_set<std::string_view>> + schema_references; + for (const auto& type_config : schema.types()) { std::string_view schema_type(type_config.schema_type()); ICING_RETURN_IF_ERROR(ValidateSchemaType(schema_type)); @@ -120,6 +135,7 @@ libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) { // We only care about properties being unique within one type_config known_property_names.clear(); + for (const auto& property_config : type_config.properties()) { std::string_view property_name(property_config.property_name()); ICING_RETURN_IF_ERROR(ValidatePropertyName(property_name, schema_type)); @@ -149,10 +165,55 @@ libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) { schema_type, ".", property_name, "'")); } + if (property_schema_type == schema_type) { + // The schema refers to itself. This also causes a infinite loop. + // + // TODO(b/171996137): When clients can opt out of indexing document + // properties, then we don't need to do this if the document property + // isn't indexed. We only care about infinite loops while we're trying + // to assign section ids for indexing. + return absl_ports::InvalidArgumentError( + absl_ports::StrCat("Infinite loop detected in type configs. '", + schema_type, "' references itself.")); + } + // Need to make sure we eventually see/validate this schema_type if (known_schema_types.count(property_schema_type) == 0) { unknown_schema_types.insert(property_schema_type); } + + // Start tracking the parent schemas that references this nested schema + // for infinite loop detection. + // + // TODO(b/171996137): When clients can opt out of indexing document + // properties, then we don't need to do this if the document property + // isn't indexed. We only care about infinite loops while we're trying + // to assign section ids for indexing. + std::unordered_set<std::string_view> parent_schemas; + parent_schemas.insert(schema_type); + + for (const auto& parent : parent_schemas) { + // Check for any indirect parents + auto indirect_parents_iter = schema_references.find(parent); + if (indirect_parents_iter == schema_references.end()) { + continue; + } + + // Our "parent" schema has parents as well. They're our indirect + // parents now. + for (const std::string_view& indirect_parent : + indirect_parents_iter->second) { + if (indirect_parent == property_schema_type) { + // We're our own indirect parent! Infinite loop found. + return absl_ports::InvalidArgumentError(absl_ports::StrCat( + "Infinite loop detected in type configs. '", + property_schema_type, "' references itself.")); + } + parent_schemas.insert(indirect_parent); + } + } + + schema_references.insert({property_schema_type, parent_schemas}); } ICING_RETURN_IF_ERROR(ValidateCardinality(property_config.cardinality(), @@ -166,7 +227,7 @@ libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) { } } - // An Document property claimed to be of a schema_type that we never + // A Document property claimed to be of a schema_type that we never // saw/validated if (!unknown_schema_types.empty()) { return absl_ports::UnknownError( diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h index ccb2eea..7b989a8 100644 --- a/icing/schema/schema-util.h +++ b/icing/schema/schema-util.h @@ -81,10 +81,17 @@ class SchemaUtil { // SchemaTypeConfigProto.schema_type // 10. Property names can only be alphanumeric. // 11. Any STRING data types have a valid string_indexing_config + // 12. A SchemaTypeConfigProto cannot have a property whose schema_type is + // itself, thus creating an infinite loop. + // 13. Two SchemaTypeConfigProtos cannot have properties that reference each + // other's schema_type, thus creating an infinite loop. + // + // TODO(b/171996137): Clarify 12 and 13 are only for indexed properties, once + // document properties can be opted out of indexing. // // Returns: // ALREADY_EXISTS for case 1 and 2 - // INVALID_ARGUMENT for 3-11 + // INVALID_ARGUMENT for 3-13 // OK otherwise static libtextclassifier3::Status Validate(const SchemaProto& schema); diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc index ed3bde7..61a861c 100644 --- a/icing/schema/schema-util_test.cc +++ b/icing/schema/schema-util_test.cc @@ -29,6 +29,7 @@ namespace lib { namespace { using ::testing::Eq; +using ::testing::HasSubstr; // Properties/fields in a schema type constexpr char kEmailType[] = "EmailMessage"; @@ -84,7 +85,7 @@ class SchemaUtilTest : public ::testing::Test { } }; -TEST_F(SchemaUtilTest, Valid_Empty) { +TEST_F(SchemaUtilTest, EmptySchemaProtoIsValid) { ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_)); } @@ -98,7 +99,7 @@ TEST_F(SchemaUtilTest, Valid_Nested) { ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_)); } -TEST_F(SchemaUtilTest, Valid_ClearedPropertyConfigs) { +TEST_F(SchemaUtilTest, ClearedPropertyConfigsIsValid) { // No property fields is technically ok, but probably not realistic. auto type = schema_proto_.add_types(); *type = CreateSchemaTypeConfig(kEmailType); @@ -107,7 +108,7 @@ TEST_F(SchemaUtilTest, Valid_ClearedPropertyConfigs) { ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_)); } -TEST_F(SchemaUtilTest, Invalid_ClearedSchemaType) { +TEST_F(SchemaUtilTest, ClearedSchemaTypeIsInvalid) { auto type = schema_proto_.add_types(); *type = CreateSchemaTypeConfig(kEmailType); type->clear_schema_type(); @@ -116,7 +117,7 @@ TEST_F(SchemaUtilTest, Invalid_ClearedSchemaType) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, Invalid_EmptySchemaType) { +TEST_F(SchemaUtilTest, EmptySchemaTypeIsInvalid) { auto type = schema_proto_.add_types(); *type = CreateSchemaTypeConfig(kEmailType); type->set_schema_type(""); @@ -133,7 +134,7 @@ TEST_F(SchemaUtilTest, AnySchemaTypeOk) { ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_)); } -TEST_F(SchemaUtilTest, Invalid_ClearedPropertyName) { +TEST_F(SchemaUtilTest, ClearedPropertyNameIsInvalid) { auto type = schema_proto_.add_types(); *type = CreateSchemaTypeConfig(kEmailType); @@ -146,7 +147,7 @@ TEST_F(SchemaUtilTest, Invalid_ClearedPropertyName) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, Invalid_EmptyPropertyName) { +TEST_F(SchemaUtilTest, EmptyPropertyNameIsInvalid) { auto type = schema_proto_.add_types(); *type = CreateSchemaTypeConfig(kEmailType); @@ -184,7 +185,7 @@ TEST_F(SchemaUtilTest, AlphanumericPropertyNameOk) { ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_)); } -TEST_F(SchemaUtilTest, Invalid_DuplicatePropertyName) { +TEST_F(SchemaUtilTest, DuplicatePropertyNameIsInvalid) { auto type = schema_proto_.add_types(); *type = CreateSchemaTypeConfig(kEmailType); @@ -202,7 +203,7 @@ TEST_F(SchemaUtilTest, Invalid_DuplicatePropertyName) { StatusIs(libtextclassifier3::StatusCode::ALREADY_EXISTS)); } -TEST_F(SchemaUtilTest, Invalid_ClearedDataType) { +TEST_F(SchemaUtilTest, ClearedDataTypeIsInvalid) { auto type = schema_proto_.add_types(); *type = CreateSchemaTypeConfig(kEmailType); @@ -215,7 +216,7 @@ TEST_F(SchemaUtilTest, Invalid_ClearedDataType) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, Invalid_UnknownDataType) { +TEST_F(SchemaUtilTest, UnknownDataTypeIsInvalid) { auto type = schema_proto_.add_types(); *type = CreateSchemaTypeConfig(kEmailType); @@ -228,7 +229,7 @@ TEST_F(SchemaUtilTest, Invalid_UnknownDataType) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, Invalid_ClearedCardinality) { +TEST_F(SchemaUtilTest, ClearedCardinalityIsInvalid) { auto type = schema_proto_.add_types(); *type = CreateSchemaTypeConfig(kEmailType); @@ -241,7 +242,7 @@ TEST_F(SchemaUtilTest, Invalid_ClearedCardinality) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, Invalid_UnknownCardinality) { +TEST_F(SchemaUtilTest, UnknownCardinalityIsInvalid) { auto type = schema_proto_.add_types(); *type = CreateSchemaTypeConfig(kEmailType); @@ -254,7 +255,7 @@ TEST_F(SchemaUtilTest, Invalid_UnknownCardinality) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, Invalid_ClearedPropertySchemaType) { +TEST_F(SchemaUtilTest, ClearedPropertySchemaTypeIsInvalid) { auto type = schema_proto_.add_types(); *type = CreateSchemaTypeConfig(kEmailType); @@ -282,7 +283,7 @@ TEST_F(SchemaUtilTest, Invalid_EmptyPropertySchemaType) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, Invalid_NoMatchingSchemaType) { +TEST_F(SchemaUtilTest, NoMatchingSchemaTypeIsInvalid) { auto type = schema_proto_.add_types(); *type = CreateSchemaTypeConfig(kEmailType); @@ -293,7 +294,8 @@ TEST_F(SchemaUtilTest, Invalid_NoMatchingSchemaType) { property->set_schema_type("NewSchemaType"); ASSERT_THAT(SchemaUtil::Validate(schema_proto_), - StatusIs(libtextclassifier3::StatusCode::UNKNOWN)); + StatusIs(libtextclassifier3::StatusCode::UNKNOWN, + HasSubstr("Undefined 'schema_type'"))); } TEST_F(SchemaUtilTest, NewOptionalPropertyIsCompatible) { @@ -618,6 +620,153 @@ TEST_F(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTokenizer) { EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); } +TEST_F(SchemaUtilTest, MultipleReferencesToSameNestedSchemaOk) { + SchemaProto schema; + + // Create a parent schema + auto type = schema.add_types(); + type->set_schema_type("ParentSchema"); + + // Create multiple references to the same child schema + auto property = type->add_properties(); + property->set_property_name("ChildProperty1"); + property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + property->set_schema_type("ChildSchema"); + property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); + + property = type->add_properties(); + property->set_property_name("ChildProperty2"); + property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + property->set_schema_type("ChildSchema"); + property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); + + // Create a child schema + type = schema.add_types(); + type->set_schema_type("ChildSchema"); + + EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); +} + +TEST_F(SchemaUtilTest, InvalidSelfReference) { + SchemaProto schema; + + // Create a schema with a self-reference cycle in it: OwnSchema -> OwnSchema + auto type = schema.add_types(); + type->set_schema_type("OwnSchema"); + + // Reference a child schema, so far so good + auto property = type->add_properties(); + property->set_property_name("NestedDocument"); + property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + property->set_schema_type("OwnSchema"); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, + HasSubstr("Infinite loop"))); +} + +TEST_F(SchemaUtilTest, InvalidSelfReferenceEvenWithOtherProperties) { + SchemaProto schema; + + // Create a schema with a self-reference cycle in it: OwnSchema -> OwnSchema + auto type = schema.add_types(); + type->set_schema_type("OwnSchema"); + + // Reference a child schema, so far so good + auto property = type->add_properties(); + property->set_property_name("NestedDocument"); + property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + property->set_schema_type("OwnSchema"); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + property = type->add_properties(); + property->set_property_name("SomeString"); + property->set_data_type(PropertyConfigProto::DataType::STRING); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + property->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, + HasSubstr("Infinite loop"))); +} + +TEST_F(SchemaUtilTest, InvalidInfiniteLoopTwoDegrees) { + SchemaProto schema; + + // Create a schema for the parent schema + auto type = schema.add_types(); + type->set_schema_type("A"); + + // Reference schema B, so far so good + auto property = type->add_properties(); + property->set_property_name("NestedDocument"); + property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + property->set_schema_type("B"); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + // Create the child schema + type = schema.add_types(); + type->set_schema_type("B"); + + // Reference the schema A, causing an infinite loop of references. + property = type->add_properties(); + property->set_property_name("NestedDocument"); + property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + property->set_schema_type("A"); + property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); + + // Two degrees of referencing: A -> B -> A + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, + HasSubstr("Infinite loop"))); +} + +TEST_F(SchemaUtilTest, InvalidInfiniteLoopThreeDegrees) { + SchemaProto schema; + + // Create a schema for the parent schema + auto type = schema.add_types(); + type->set_schema_type("A"); + + // Reference schema B , so far so good + auto property = type->add_properties(); + property->set_property_name("NestedDocument"); + property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + property->set_schema_type("B"); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + // Create the child schema + type = schema.add_types(); + type->set_schema_type("B"); + + // Reference schema C, so far so good + property = type->add_properties(); + property->set_property_name("NestedDocument"); + property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + property->set_schema_type("C"); + property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); + + // Create the child schema + type = schema.add_types(); + type->set_schema_type("C"); + + // Reference schema A, no good + property = type->add_properties(); + property->set_property_name("NestedDocument"); + property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + property->set_schema_type("A"); + property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); + + // Three degrees of referencing: A -> B -> C -> A + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, + HasSubstr("Infinite loop"))); +} + } // namespace } // namespace lib diff --git a/icing/schema/section-manager.cc b/icing/schema/section-manager.cc index 73aa947..0285cef 100644 --- a/icing/schema/section-manager.cc +++ b/icing/schema/section-manager.cc @@ -48,39 +48,6 @@ namespace { using TypeSectionMap = std::unordered_map<std::string, const std::vector<SectionMetadata>>; -// This state helps detect infinite loops (e.g. two type configs referencing -// each other) when assigning sections. The combination of 'number of section -// assigned' and 'current schema name' represents a unique state in the -// section-assign process. If the same state is seen the second time, that means -// an infinite loop. -struct SectionAssigningState { - size_t num_sections_assigned; - std::string current_schema_name; - - SectionAssigningState(size_t num_sections_assigned_in, - std::string&& current_schema_name_in) - : num_sections_assigned(num_sections_assigned_in), - current_schema_name(std::move(current_schema_name_in)) {} -}; - -// Provides a hash value of this struct so that it can be stored in a hash -// set. -struct SectionAssigningStateHasher { - size_t operator()(const SectionAssigningState& state) const { - size_t str_hash = std::hash<std::string>()(state.current_schema_name); - size_t int_hash = std::hash<size_t>()(state.num_sections_assigned); - // Combine the two hashes by taking the upper 16-bits of the string hash and - // the lower 16-bits of the int hash. - return (str_hash & 0xFFFF0000) | (int_hash & 0x0000FFFF); - } -}; - -bool operator==(const SectionAssigningState& lhs, - const SectionAssigningState& rhs) { - return lhs.num_sections_assigned == rhs.num_sections_assigned && - lhs.current_schema_name == rhs.current_schema_name; -} - // Helper function to concatenate a path and a property name std::string ConcatenatePath(const std::string& path, const std::string& next_property_name) { @@ -90,28 +57,14 @@ std::string ConcatenatePath(const std::string& path, return absl_ports::StrCat(path, kPropertySeparator, next_property_name); } -// Helper function to recursively identify sections from a type config and add -// them to a section metadata list libtextclassifier3::Status AssignSections( - const SchemaTypeConfigProto& type_config, + const SchemaTypeConfigProto& current_type_config, const std::string& current_section_path, const SchemaUtil::TypeConfigMap& type_config_map, - std::unordered_set<SectionAssigningState, SectionAssigningStateHasher>* - visited_states, std::vector<SectionMetadata>* metadata_list) { - if (!visited_states - ->emplace(metadata_list->size(), - std::string(type_config.schema_type())) - .second) { - // Failed to insert, the same state has been seen before, there's an - // infinite loop in type configs - return absl_ports::InvalidArgumentError( - "Infinite loop detected in type configs"); - } - // Sorts properties by name's alphabetical order so that order doesn't affect // section assigning. - auto sorted_properties = type_config.properties(); + auto sorted_properties = current_type_config.properties(); std::sort(sorted_properties.pointer_begin(), sorted_properties.pointer_end(), [](const PropertyConfigProto* p1, const PropertyConfigProto* p2) { return p1->property_name() < p2->property_name(); @@ -137,7 +90,7 @@ libtextclassifier3::Status AssignSections( AssignSections(nested_type_config, ConcatenatePath(current_section_path, property_config.property_name()), - type_config_map, visited_states, metadata_list)); + type_config_map, metadata_list)); } } @@ -162,6 +115,7 @@ libtextclassifier3::Status AssignSections( "allowed: %d", kMaxSectionId - kMinSectionId + 1)); } + // Creates section metadata from property config metadata_list->emplace_back( new_section_id, @@ -182,17 +136,14 @@ BuildSectionMetadataCache(const SchemaUtil::TypeConfigMap& type_config_map, std::vector<std::vector<SectionMetadata>> section_metadata_cache( schema_type_mapper.num_keys()); - std::unordered_set<SectionAssigningState, SectionAssigningStateHasher> - visited_states; for (const auto& name_and_type : type_config_map) { // Assigns sections for each type config - visited_states.clear(); const std::string& type_config_name = name_and_type.first; const SchemaTypeConfigProto& type_config = name_and_type.second; std::vector<SectionMetadata> metadata_list; - ICING_RETURN_IF_ERROR( - AssignSections(type_config, /*current_section_path*/ "", - type_config_map, &visited_states, &metadata_list)); + ICING_RETURN_IF_ERROR(AssignSections(type_config, + /*current_section_path*/ "", + type_config_map, &metadata_list)); // Insert the section metadata list at the index of the type's SchemaTypeId ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id, diff --git a/icing/schema/section-manager_test.cc b/icing/schema/section-manager_test.cc index 1a4d324..2d995df 100644 --- a/icing/schema/section-manager_test.cc +++ b/icing/schema/section-manager_test.cc @@ -163,67 +163,6 @@ TEST_F(SectionManagerTest, CreationWithNullPointerShouldFail) { StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } -TEST_F(SectionManagerTest, CreationWithSchemaInfiniteLoopShouldFail) { - // Creates 2 type configs that reference each other - SchemaTypeConfigProto type_config1; - type_config1.set_schema_type("type1"); - auto property1 = type_config1.add_properties(); - property1->set_property_name("property1"); - property1->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property1->set_schema_type("type2"); // Here we reference type2 - property1->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - property1->mutable_document_indexing_config()->set_index_nested_properties( - true); - - SchemaTypeConfigProto type_config2; - type_config2.set_schema_type("type2"); - auto property2 = type_config2.add_properties(); - property2->set_property_name("property2"); - property2->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - // Here we reference type1, which references type2 causing the infinite loop - property2->set_schema_type("type1"); - property2->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - property2->mutable_document_indexing_config()->set_index_nested_properties( - true); - - SchemaUtil::TypeConfigMap type_config_map; - type_config_map.emplace("type1", type_config1); - type_config_map.emplace("type2", type_config2); - - EXPECT_THAT( - SectionManager::Create(type_config_map, schema_type_mapper_.get()), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, - HasSubstr("Infinite loop detected"))); -} - -TEST_F(SectionManagerTest, CreationWithSchemaSelfReferenceShouldFail) { - // Creates a type config that has a section and references to self. - SchemaTypeConfigProto type_config; - type_config.set_schema_type("type"); - auto property1 = type_config.add_properties(); - property1->set_property_name("property1"); - property1->set_data_type(PropertyConfigProto::DataType::STRING); - property1->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - property1->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - auto property2 = type_config.add_properties(); - property2->set_property_name("property2"); - property2->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property2->mutable_document_indexing_config()->set_index_nested_properties( - true); - // Here we're referencing our own type, causing an infinite loop - property2->set_schema_type("type"); - property2->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - SchemaUtil::TypeConfigMap type_config_map; - type_config_map.emplace("type", type_config); - - EXPECT_THAT( - SectionManager::Create(type_config_map, schema_type_mapper_.get()), - StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE, - HasSubstr("Too many properties"))); -} - TEST_F(SectionManagerTest, CreationWithTooManyPropertiesShouldFail) { SchemaTypeConfigProto type_config; type_config.set_schema_type("type"); diff --git a/icing/scoring/scorer.cc b/icing/scoring/scorer.cc index 42ec09a..0739532 100644 --- a/icing/scoring/scorer.cc +++ b/icing/scoring/scorer.cc @@ -144,6 +144,9 @@ libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Scorer::Create( case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP: return std::make_unique<UsageScorer>(document_store, rank_by, default_score); + case ScoringSpecProto::RankingStrategy:: + RELEVANCE_SCORE_NONFUNCTIONAL_PLACEHOLDER: + [[fallthrough]]; case ScoringSpecProto::RankingStrategy::NONE: return std::make_unique<NoScorer>(default_score); } diff --git a/icing/testing/common-matchers.h b/icing/testing/common-matchers.h index 31d41fc..225b498 100644 --- a/icing/testing/common-matchers.h +++ b/icing/testing/common-matchers.h @@ -58,6 +58,44 @@ MATCHER_P2(EqualsDocHitInfo, document_id, section_ids, "") { actual.hit_section_ids_mask() == section_mask; } +// Used to match a DocHitInfo +MATCHER_P2(EqualsDocHitInfoWithTermFrequency, document_id, + section_ids_to_term_frequencies_map, "") { + const DocHitInfo& actual = arg; + SectionIdMask section_mask = kSectionIdMaskNone; + + bool term_frequency_as_expected = true; + std::vector<Hit::Score> expected_tfs; + std::vector<Hit::Score> actual_tfs; + for (auto itr = section_ids_to_term_frequencies_map.begin(); + itr != section_ids_to_term_frequencies_map.end(); itr++) { + SectionId section_id = itr->first; + section_mask |= 1U << section_id; + expected_tfs.push_back(itr->second); + actual_tfs.push_back(actual.max_hit_score(section_id)); + if (actual.max_hit_score(section_id) != itr->second) { + term_frequency_as_expected = false; + } + } + std::string actual_term_frequencies = absl_ports::StrCat( + "[", absl_ports::StrJoin(actual_tfs, ",", absl_ports::NumberFormatter()), + "]"); + std::string expected_term_frequencies = absl_ports::StrCat( + "[", + absl_ports::StrJoin(expected_tfs, ",", absl_ports::NumberFormatter()), + "]"); + *result_listener << IcingStringUtil::StringPrintf( + "(actual is {document_id=%d, section_mask=%d, term_frequencies=%s}, but " + "expected was " + "{document_id=%d, section_mask=%d, term_frequencies=%s}.)", + actual.document_id(), actual.hit_section_ids_mask(), + actual_term_frequencies.c_str(), document_id, section_mask, + expected_term_frequencies.c_str()); + return actual.document_id() == document_id && + actual.hit_section_ids_mask() == section_mask && + term_frequency_as_expected; +} + // Used to match a ScoredDocumentHit MATCHER_P(EqualsScoredDocumentHit, expected_scored_document_hit, "") { if (arg.document_id() != expected_scored_document_hit.document_id() || diff --git a/icing/testing/hit-test-utils.cc b/icing/testing/hit-test-utils.cc index 0e2eb2a..eba1dfa 100644 --- a/icing/testing/hit-test-utils.cc +++ b/icing/testing/hit-test-utils.cc @@ -42,8 +42,8 @@ std::vector<Hit> CreateHits(DocumentId start_docid, int num_hits, if (num_hits < 1) { return hits; } - hits.push_back( - Hit(/*section_id=*/1, /*document_id=*/start_docid, Hit::kMaxHitScore)); + hits.push_back(Hit(/*section_id=*/1, /*document_id=*/start_docid, + Hit::kDefaultHitScore)); while (hits.size() < num_hits) { hits.push_back(CreateHit(hits.back(), desired_byte_length)); } diff --git a/icing/testing/icu-i18n-test-utils.cc b/icing/testing/icu-i18n-test-utils.cc index 09878db..50dc26c 100644 --- a/icing/testing/icu-i18n-test-utils.cc +++ b/icing/testing/icu-i18n-test-utils.cc @@ -29,7 +29,7 @@ std::string UCharToString(UChar32 uchar) { uint8_t utf8_buffer[4]; // U8_APPEND writes 0 to 4 bytes int utf8_index = 0; - UBool has_error = FALSE; + UBool has_error = false; // utf8_index is advanced to the end of the contents if successful U8_APPEND(utf8_buffer, utf8_index, sizeof(utf8_buffer), uchar, has_error); diff --git a/icing/util/document-validator.cc b/icing/util/document-validator.cc index 4d86913..fb1fc4b 100644 --- a/icing/util/document-validator.cc +++ b/icing/util/document-validator.cc @@ -96,7 +96,8 @@ libtextclassifier3::Status DocumentValidator::Validate( if (property_iter == parsed_property_configs.property_config_map.end()) { return absl_ports::NotFoundError(absl_ports::StrCat( "Property config '", property.name(), "' not found for key: (", - document.namespace_(), ", ", document.uri(), ").")); + document.namespace_(), ", ", document.uri(), + ") of type: ", document.schema(), ".")); } const PropertyConfigProto& property_config = *property_iter->second; diff --git a/icing/util/document-validator.h b/icing/util/document-validator.h index 34a3217..036d1fa 100644 --- a/icing/util/document-validator.h +++ b/icing/util/document-validator.h @@ -56,8 +56,6 @@ class DocumentValidator { // In addition, all nested DocumentProto will also be validated towards the // requirements above. // - // DocumentProto.custom_properties are not validated. - // // Returns: // OK on success // FAILED_PRECONDITION if no schema is set yet diff --git a/icing/util/document-validator_test.cc b/icing/util/document-validator_test.cc index 6067162..ad5a93e 100644 --- a/icing/util/document-validator_test.cc +++ b/icing/util/document-validator_test.cc @@ -195,18 +195,6 @@ TEST_F(DocumentValidatorTest, ValidateNonexistentPropertyNotFound) { HasSubstr("'WrongPropertyName' not found"))); } -TEST_F(DocumentValidatorTest, ValidateAllCustomPropertyOk) { - DocumentProto email = - SimpleEmailBuilder() - // A nonexistent property, would've triggered a NotFound message - .AddCustomStringProperty("WrongPropertyName", kDefaultString) - // 'subject' property should've been a string according to the schema - .AddCustomBooleanProperty(kPropertySubject, false, true) - .Build(); - - EXPECT_THAT(document_validator_->Validate(email), IsOk()); -} - TEST_F(DocumentValidatorTest, ValidateExactlyOneRequiredValueOk) { // Required property should have exactly 1 value DocumentProto email = diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc index d6754d5..cd0a227 100644 --- a/icing/util/i18n-utils.cc +++ b/icing/util/i18n-utils.cc @@ -156,7 +156,7 @@ void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar) { uint8_t utf8_buffer[4]; // U8_APPEND writes 0 to 4 bytes int utf8_index = 0; - UBool has_error = FALSE; + UBool has_error = false; // utf8_index is advanced to the end of the contents if successful U8_APPEND(utf8_buffer, utf8_index, sizeof(utf8_buffer), uchar, has_error); diff --git a/proto/icing/proto/document.proto b/proto/icing/proto/document.proto index ff215bd..ecbd5ce 100644 --- a/proto/icing/proto/document.proto +++ b/proto/icing/proto/document.proto @@ -16,8 +16,8 @@ syntax = "proto2"; package icing.lib; -import "icing/proto/status.proto"; import "icing/proto/logging.proto"; +import "icing/proto/status.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; @@ -50,11 +50,6 @@ message DocumentProto { // already defined in the schema for this Document's schema_type. repeated PropertyProto properties = 5; - // OPTIONAL: Properties that will not be validated against the schema, - // indexed, or be searchable. The properties will be stored in the Documents, - // but never looked at by Icing. - repeated PropertyProto custom_properties = 6; - // OPTIONAL: Score of the document which could be used during search result // ranking. Negative values will lead to validation errors. The default is the // lowest score 0. @@ -69,6 +64,8 @@ message DocumentProto { // TODO(cassiewang): Benchmark if fixed64 or some other proto type is better // in terms of space/time efficiency. Both for ttl_ms and timestamp fields optional int64 ttl_ms = 8 [default = 0]; + + reserved 6; } // Holds a property field of the Document. diff --git a/proto/icing/proto/scoring.proto b/proto/icing/proto/scoring.proto index 3a99b09..bfa7aec 100644 --- a/proto/icing/proto/scoring.proto +++ b/proto/icing/proto/scoring.proto @@ -18,7 +18,6 @@ package icing.lib; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; - option objc_class_prefix = "ICNG"; // Encapsulates the configurations on how Icing should score and rank the search @@ -64,6 +63,12 @@ message ScoringSpecProto { // Ranked by last used timestamp with usage type 3. The timestamps are // compared in seconds. USAGE_TYPE3_LAST_USED_TIMESTAMP = 8; + + // Placeholder for ranking by relevance score, currently computed as BM25F + // score. + // TODO(b/173156803): one the implementation is ready, rename to + // RELEVANCE_SCORE. + RELEVANCE_SCORE_NONFUNCTIONAL_PLACEHOLDER = 9; } } optional RankingStrategy.Code rank_by = 1; diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto index abbfc32..6cb590f 100644 --- a/proto/icing/proto/search.proto +++ b/proto/icing/proto/search.proto @@ -64,7 +64,7 @@ message SearchSpecProto { // Client-supplied specifications on what to include/how to format the search // results. -// Next tag: 4 +// Next tag: 5 message ResultSpecProto { // The results will be returned in pages, and num_per_page specifies the // number of documents in one page. @@ -96,6 +96,22 @@ message ResultSpecProto { optional int32 max_window_bytes = 3; } optional SnippetSpecProto snippet_spec = 3; + + // How to specify a subset of properties to retrieve. If no type property mask + // has been specified for a schema type, then *all* properties of that schema + // type will be retrieved. + // Next tag: 3 + message TypePropertyMask { + // The schema type to which these property masks should apply. + optional string schema_type = 1; + + // The property masks specifying the property to be retrieved. Property + // masks must be composed only of property names, property separators (the + // '.' character). For example, "subject", "recipients.name". Specifying no + // property masks will result in *no* properties being retrieved. + repeated string paths = 2; + } + repeated TypePropertyMask type_property_masks = 4; } // The representation of a single match within a DocumentProto property. |