diff options
author | Tim Barron <tjbarron@google.com> | 2023-05-11 06:22:44 +0000 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2023-05-11 15:45:06 +0000 |
commit | fc9e6aac9c62d4546cb25548e1bbb317b7a4fd9a (patch) | |
tree | 061f4d194144ae4d5f81e6b56c148ddb5495e694 /icing/index | |
parent | a7d57e98ea7168d66cf01ace85598e33d5e9e5db (diff) | |
download | icing-fc9e6aac9c62d4546cb25548e1bbb317b7a4fd9a.tar.gz |
Update Icing from upstream.
Descriptions:
========================================================================
Modify the definition of propertyDefined
========================================================================
Remove default args in SchemaStore::SetSchema and fix calls
========================================================================
Add allow_circular_schema_definitions flag
========================================================================
Onboard version detection to Icing
========================================================================
Add version util to help read/write version info
========================================================================
Add support for the overlay schema.
========================================================================
Allow cycles in schema-property-iterator
========================================================================
Add joinable properties into schema definition cycle restrictions.
========================================================================
Loosen circular references restriction for Schema Definitions.
========================================================================
Implement BackupSchemaProducer to generate a backup schema
========================================================================
Minor fix: remove a redundant log
========================================================================
Allow schema types to inherit from more than one parent
========================================================================
Allow nested document properties to accept documents of a subtype
========================================================================
Support polymorphism for Icing projection in Search and Get API
========================================================================
Add max_joined_child_per_parent into ResultSpec and change behavior
========================================================================
Support Icing schema type polymorphism for the search filter API
========================================================================
Verify that every child type's property set includes all compatible properties from its parent types
========================================================================
Add individual type index latency
========================================================================
Build the iterator node for the propertyDefined() custom function
========================================================================
For the same bucket iterator, advance all hits with the same doc id and merge their sections once
========================================================================
Introduce DocHitInfoIteratorPropertyInSchema for property existence check
========================================================================
Add SchemaUtil::BuildTransitiveInheritanceGraph to build an inheritance map from schema
========================================================================
Introduce a lookup method for a property defined in a schema
========================================================================
Rollback of: Allow LanguageSegmenter::Iterators to declare AccessType.
========================================================================
Adds join info to QueryStatsProto
========================================================================
Bug:280698419
Bug:280698125
Bug:280698121
Bug:280697513
Bug:276349029
Bug:272145329
Bug:270102295
Bug:269295094
Bug:268680462
Bug:265304217
Bug:259744228
Bug:259743562
Bug:256022027
Change-Id: I54cd1d22121c314f8c238d2d49f0809165dc0ca3
Diffstat (limited to 'icing/index')
19 files changed, 837 insertions, 51 deletions
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc index ee43364..1cbe00d 100644 --- a/icing/index/index-processor_benchmark.cc +++ b/icing/index/index-processor_benchmark.cc @@ -172,7 +172,9 @@ std::unique_ptr<SchemaStore> CreateSchemaStore(const Filesystem& filesystem, SchemaProto schema; CreateFakeTypeConfig(schema.add_types()); - auto set_schema_status = schema_store->SetSchema(schema); + auto set_schema_status = schema_store->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false); if (!set_schema_status.ok()) { ICING_LOG(ERROR) << set_schema_status.status().error_message(); diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index 9453e58..ed9e856 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -40,7 +40,7 @@ #include "icing/index/numeric/numeric-index.h" #include "icing/index/string-section-indexing-handler.h" #include "icing/index/term-property-id.h" -#include "icing/join/qualified-id-joinable-property-indexing-handler.h" +#include "icing/join/qualified-id-join-indexing-handler.h" #include "icing/join/qualified-id-type-joinable-index.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mock-filesystem.h" @@ -267,7 +267,9 @@ class IndexProcessorTest : public Test { TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); - ICING_ASSERT_OK(schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str())); ICING_ASSERT_OK_AND_ASSIGN( @@ -291,10 +293,10 @@ class IndexProcessorTest : public Test { IntegerSectionIndexingHandler::Create( &fake_clock_, integer_index_.get())); ICING_ASSERT_OK_AND_ASSIGN( - 
std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler> + std::unique_ptr<QualifiedIdJoinIndexingHandler> qualified_id_joinable_property_indexing_handler, - QualifiedIdJoinablePropertyIndexingHandler::Create( - &fake_clock_, qualified_id_join_index_.get())); + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, + qualified_id_join_index_.get())); std::vector<std::unique_ptr<DataIndexingHandler>> handlers; handlers.push_back(std::move(string_section_indexing_handler)); handlers.push_back(std::move(integer_section_indexing_handler)); @@ -823,10 +825,10 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) { IntegerSectionIndexingHandler::Create( &fake_clock_, integer_index_.get())); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler> + std::unique_ptr<QualifiedIdJoinIndexingHandler> qualified_id_joinable_property_indexing_handler, - QualifiedIdJoinablePropertyIndexingHandler::Create( - &fake_clock_, qualified_id_join_index_.get())); + QualifiedIdJoinIndexingHandler::Create(&fake_clock_, + qualified_id_join_index_.get())); std::vector<std::unique_ptr<DataIndexingHandler>> handlers; handlers.push_back(std::move(string_section_indexing_handler)); handlers.push_back(std::move(integer_section_indexing_handler)); diff --git a/icing/index/index.cc b/icing/index/index.cc index 5cfcd27..19edbb6 100644 --- a/icing/index/index.cc +++ b/icing/index/index.cc @@ -163,6 +163,12 @@ libtextclassifier3::StatusOr<std::unique_ptr<Index>> Index::Create( std::move(main_index), filesystem)); } +/* static */ libtextclassifier3::StatusOr<int> Index::ReadFlashIndexMagic( + const Filesystem* filesystem, const std::string& base_dir) { + return MainIndex::ReadFlashIndexMagic(filesystem, + MakeMainIndexFilepath(base_dir)); +} + libtextclassifier3::Status Index::TruncateTo(DocumentId document_id) { if (lite_index_->last_added_document_id() != kInvalidDocumentId && lite_index_->last_added_document_id() > document_id) { diff --git 
a/icing/index/index.h b/icing/index/index.h index 3200d70..c170278 100644 --- a/icing/index/index.h +++ b/icing/index/index.h @@ -86,6 +86,16 @@ class Index { const Options& options, const Filesystem* filesystem, const IcingFilesystem* icing_filesystem); + // Reads magic from existing flash (main) index file header. We need this + // during Icing initialization phase to determine the version. + // + // Returns + // Valid magic on success + // NOT_FOUND if the lite index doesn't exist + // INTERNAL on I/O error + static libtextclassifier3::StatusOr<int> ReadFlashIndexMagic( + const Filesystem* filesystem, const std::string& base_dir); + // Clears all files created by the index. Returns OK if all files were // cleared. libtextclassifier3::Status Reset() { diff --git a/icing/index/integer-section-indexing-handler.cc b/icing/index/integer-section-indexing-handler.cc index 584f028..63b09df 100644 --- a/icing/index/integer-section-indexing-handler.cc +++ b/icing/index/integer-section-indexing-handler.cc @@ -16,12 +16,19 @@ #include <cstdint> #include <memory> +#include <utility> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/index/numeric/numeric-index.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/proto/logging.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/util/clock.h" #include "icing/util/logging.h" +#include "icing/util/status-macros.h" #include "icing/util/tokenized-document.h" namespace icing { @@ -41,7 +48,7 @@ IntegerSectionIndexingHandler::Create(const Clock* clock, libtextclassifier3::Status IntegerSectionIndexingHandler::Handle( const TokenizedDocument& tokenized_document, DocumentId document_id, bool recovery_mode, PutDocumentStatsProto* put_document_stats) { - // TODO(b/259744228): set integer indexing latency and other stats + std::unique_ptr<Timer> 
index_timer = clock_.GetNewTimer(); if (!IsDocumentIdValid(document_id)) { return absl_ports::InvalidArgumentError( @@ -93,6 +100,11 @@ libtextclassifier3::Status IntegerSectionIndexingHandler::Handle( } } + if (put_document_stats != nullptr) { + put_document_stats->set_integer_index_latency_ms( + index_timer->GetElapsedMilliseconds()); + } + return status; } diff --git a/icing/index/integer-section-indexing-handler_test.cc b/icing/index/integer-section-indexing-handler_test.cc index 895fe57..706856c 100644 --- a/icing/index/integer-section-indexing-handler_test.cc +++ b/icing/index/integer-section-indexing-handler_test.cc @@ -156,7 +156,9 @@ class IntegerSectionIndexingHandlerTest : public ::testing::Test { TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); - ICING_ASSERT_OK(schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); ASSERT_TRUE( filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str())); diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.cc b/icing/index/iterator/doc-hit-info-iterator-filter.cc index 83a73a4..2c0c2c2 100644 --- a/icing/index/iterator/doc-hit-info-iterator-filter.cc +++ b/icing/index/iterator/doc-hit-info-iterator-filter.cc @@ -55,11 +55,16 @@ DocHitInfoIteratorFilter::DocHitInfoIteratorFilter( // Precompute all the SchemaTypeIds for (std::string_view schema_type : options_.schema_types) { - auto schema_type_id_or = schema_store_.GetSchemaTypeId(schema_type); + libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*> + schema_type_ids_or = + schema_store_.GetSchemaTypeIdsWithChildren(schema_type); // If we can't find the SchemaTypeId, just throw it away - if (schema_type_id_or.ok()) { - target_schema_type_ids_.emplace(schema_type_id_or.ValueOrDie()); + if (schema_type_ids_or.ok()) { + const std::unordered_set<SchemaTypeId>* schema_type_ids = + 
schema_type_ids_or.ValueOrDie(); + target_schema_type_ids_.insert(schema_type_ids->begin(), + schema_type_ids->end()); } } } diff --git a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc index 0900e1f..4b86cae 100644 --- a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc @@ -17,6 +17,7 @@ #include <limits> #include <memory> #include <string> +#include <string_view> #include <utility> #include <vector> @@ -80,7 +81,9 @@ class DocHitInfoIteratorDeletedFilterTest : public ::testing::Test { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - ICING_ASSERT_OK(schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, @@ -247,7 +250,9 @@ class DocHitInfoIteratorNamespaceFilterTest : public ::testing::Test { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - ICING_ASSERT_OK(schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, @@ -379,30 +384,52 @@ TEST_F(DocHitInfoIteratorNamespaceFilterTest, FilterForMultipleNamespacesOk) { class DocHitInfoIteratorSchemaTypeFilterTest : public ::testing::Test { protected: + static constexpr std::string_view kSchema1 = "email"; + static constexpr std::string_view kSchema2 = "message"; + static constexpr std::string_view kSchema3 = "person"; + static constexpr std::string_view kSchema4 = "artist"; + static constexpr std::string_view kSchema5 = "emailMessage"; + DocHitInfoIteratorSchemaTypeFilterTest() : 
test_dir_(GetTestTempDir() + "/icing") {} void SetUp() override { filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); - document1_schema1_ = - DocumentBuilder().SetKey("namespace", "1").SetSchema(schema1_).Build(); - document2_schema2_ = - DocumentBuilder().SetKey("namespace", "2").SetSchema(schema2_).Build(); - document3_schema3_ = - DocumentBuilder().SetKey("namespace", "3").SetSchema(schema3_).Build(); - document4_schema1_ = - DocumentBuilder().SetKey("namespace", "4").SetSchema(schema1_).Build(); + document1_schema1_ = DocumentBuilder() + .SetKey("namespace", "1") + .SetSchema(std::string(kSchema1)) + .Build(); + document2_schema2_ = DocumentBuilder() + .SetKey("namespace", "2") + .SetSchema(std::string(kSchema2)) + .Build(); + document3_schema3_ = DocumentBuilder() + .SetKey("namespace", "3") + .SetSchema(std::string(kSchema3)) + .Build(); + document4_schema1_ = DocumentBuilder() + .SetKey("namespace", "4") + .SetSchema(std::string(kSchema1)) + .Build(); SchemaProto schema = SchemaBuilder() - .AddType(SchemaTypeConfigBuilder().SetType(schema1_)) - .AddType(SchemaTypeConfigBuilder().SetType(schema2_)) - .AddType(SchemaTypeConfigBuilder().SetType(schema3_)) + .AddType(SchemaTypeConfigBuilder().SetType(kSchema1)) + .AddType(SchemaTypeConfigBuilder().SetType(kSchema2)) + .AddType(SchemaTypeConfigBuilder().SetType(kSchema3)) + .AddType(SchemaTypeConfigBuilder().SetType(kSchema4).AddParentType( + kSchema3)) + .AddType(SchemaTypeConfigBuilder() + .SetType(std::string(kSchema5)) + .AddParentType(kSchema1) + .AddParentType(kSchema2)) .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - ICING_ASSERT_OK(schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, @@ -424,9 +451,6 @@ class 
DocHitInfoIteratorSchemaTypeFilterTest : public ::testing::Test { FakeClock fake_clock_; const Filesystem filesystem_; const std::string test_dir_; - const std::string schema1_ = "email"; - const std::string schema2_ = "message"; - const std::string schema3_ = "person"; DocumentProto document1_schema1_; DocumentProto document2_schema2_; DocumentProto document3_schema3_; @@ -495,7 +519,7 @@ TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - options_.schema_types = std::vector<std::string_view>{schema1_}; + options_.schema_types = std::vector<std::string_view>{kSchema1}; DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), document_store_.get(), schema_store_.get(), options_); @@ -518,7 +542,7 @@ TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, FilterForMultipleSchemaTypesOk) { std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - options_.schema_types = std::vector<std::string_view>{schema2_, schema3_}; + options_.schema_types = std::vector<std::string_view>{kSchema2, kSchema3}; DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), document_store_.get(), schema_store_.get(), options_); @@ -527,6 +551,110 @@ TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, FilterForMultipleSchemaTypesOk) { ElementsAre(document_id2, document_id3)); } +TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, + FilterForSchemaTypePolymorphismOk) { + // Add some irrelevant documents. + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + document_store_->Put(document1_schema1_)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + document_store_->Put(document2_schema2_)); + + // Create a person document and an artist document, where the artist should be + // able to be interpreted as a person by polymorphism. 
+ ICING_ASSERT_OK_AND_ASSIGN( + DocumentId person_document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("person") + .Build())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId artist_document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "artist") + .SetSchema("artist") + .Build())); + + std::vector<DocHitInfo> doc_hit_infos = { + DocHitInfo(document_id1), DocHitInfo(document_id2), + DocHitInfo(person_document_id), DocHitInfo(artist_document_id)}; + + // Filters for the "person" type should also include the "artist" type. + std::unique_ptr<DocHitInfoIterator> original_iterator = + std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); + options_.schema_types = {"person"}; + DocHitInfoIteratorFilter filtered_iterator_1(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); + EXPECT_THAT(GetDocumentIds(&filtered_iterator_1), + ElementsAre(person_document_id, artist_document_id)); + + // Filters for the "artist" type should not include the "person" type. + original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); + options_.schema_types = {"artist"}; + DocHitInfoIteratorFilter filtered_iterator_2(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); + EXPECT_THAT(GetDocumentIds(&filtered_iterator_2), + ElementsAre(artist_document_id)); +} + +TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, + FilterForSchemaTypeMultipleParentPolymorphismOk) { + // Create an email and a message document. 
+ ICING_ASSERT_OK_AND_ASSIGN( + DocumentId email_document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "email") + .SetSchema("email") + .Build())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId message_document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "message") + .SetSchema("message") + .Build())); + + // Create a emailMessage document, which the should be able to be interpreted + // as both an email and a message by polymorphism. + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId email_message_document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "emailMessage") + .SetSchema("emailMessage") + .Build())); + + std::vector<DocHitInfo> doc_hit_infos = { + DocHitInfo(email_document_id), DocHitInfo(message_document_id), + DocHitInfo(email_message_document_id)}; + + // Filters for the "email" type should also include the "emailMessage" type. + std::unique_ptr<DocHitInfoIterator> original_iterator = + std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); + options_.schema_types = std::vector<std::string_view>{"email"}; + DocHitInfoIteratorFilter filtered_iterator_1(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); + EXPECT_THAT(GetDocumentIds(&filtered_iterator_1), + ElementsAre(email_document_id, email_message_document_id)); + + // Filters for the "message" type should also include the "emailMessage" type. + original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); + options_.schema_types = std::vector<std::string_view>{"message"}; + DocHitInfoIteratorFilter filtered_iterator_2(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); + EXPECT_THAT(GetDocumentIds(&filtered_iterator_2), + ElementsAre(message_document_id, email_message_document_id)); + + // Filters for a irrelevant type should return nothing. 
+ original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); + options_.schema_types = std::vector<std::string_view>{"person"}; + DocHitInfoIteratorFilter filtered_iterator_3(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); + EXPECT_THAT(GetDocumentIds(&filtered_iterator_3), IsEmpty()); +} + class DocHitInfoIteratorExpirationFilterTest : public ::testing::Test { protected: DocHitInfoIteratorExpirationFilterTest() @@ -542,7 +670,9 @@ class DocHitInfoIteratorExpirationFilterTest : public ::testing::Test { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - ICING_ASSERT_OK(schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, @@ -742,7 +872,9 @@ class DocHitInfoIteratorFilterTest : public ::testing::Test { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - ICING_ASSERT_OK(schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, diff --git a/icing/index/iterator/doc-hit-info-iterator-property-in-schema.cc b/icing/index/iterator/doc-hit-info-iterator-property-in-schema.cc new file mode 100644 index 0000000..5f260a8 --- /dev/null +++ b/icing/index/iterator/doc-hit-info-iterator-property-in-schema.cc @@ -0,0 +1,114 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/iterator/doc-hit-info-iterator-property-in-schema.h" + +#include <cstdint> +#include <memory> +#include <string> +#include <string_view> +#include <utility> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/schema/schema-store.h" +#include "icing/store/document-id.h" +#include "icing/store/document-store.h" + +namespace icing { +namespace lib { + +DocHitInfoIteratorPropertyInSchema::DocHitInfoIteratorPropertyInSchema( + std::unique_ptr<DocHitInfoIterator> delegate, + const DocumentStore* document_store, const SchemaStore* schema_store, + std::set<std::string> target_sections) + : delegate_(std::move(delegate)), + document_store_(*document_store), + schema_store_(*schema_store), + target_properties_(std::move(target_sections)) {} + +libtextclassifier3::Status DocHitInfoIteratorPropertyInSchema::Advance() { + doc_hit_info_ = DocHitInfo(kInvalidDocumentId); + hit_intersect_section_ids_mask_ = kSectionIdMaskNone; + + // Maps from SchemaTypeId to a bool indicating whether or not the type has + // the requested property. 
+ std::unordered_map<SchemaTypeId, bool> property_defined_types; + while (delegate_->Advance().ok()) { + DocumentId document_id = delegate_->doc_hit_info().document_id(); + auto data_optional = + document_store_.GetAliveDocumentFilterData(document_id); + if (!data_optional) { + // Ran into some error retrieving information on this hit, skip + continue; + } + + // Guaranteed that the DocumentFilterData exists at this point + SchemaTypeId schema_type_id = data_optional.value().schema_type_id(); + bool valid_match = false; + auto itr = property_defined_types.find(schema_type_id); + if (itr != property_defined_types.end()) { + valid_match = itr->second; + } else { + for (const auto& property : target_properties_) { + if (schema_store_.IsPropertyDefinedInSchema(schema_type_id, property)) { + valid_match = true; + break; + } + } + property_defined_types[schema_type_id] = valid_match; + } + + if (valid_match) { + doc_hit_info_ = delegate_->doc_hit_info(); + hit_intersect_section_ids_mask_ = + delegate_->hit_intersect_section_ids_mask(); + doc_hit_info_.set_hit_section_ids_mask(hit_intersect_section_ids_mask_); + return libtextclassifier3::Status::OK; + } + + // The document's schema does not define any properties listed in + // target_properties_. Continue. + } + + // Didn't find anything on the delegate iterator. + return absl_ports::ResourceExhaustedError("No more DocHitInfos in iterator"); +} + +libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode> +DocHitInfoIteratorPropertyInSchema::TrimRightMostNode() && { + // Don't generate suggestion if the last operator is this custom function. 
+ return absl_ports::InvalidArgumentError( + "Cannot generate suggestion if the last term is hasPropertyDefined()."); +} + +int32_t DocHitInfoIteratorPropertyInSchema::GetNumBlocksInspected() const { + return delegate_->GetNumBlocksInspected(); +} + +int32_t DocHitInfoIteratorPropertyInSchema::GetNumLeafAdvanceCalls() const { + return delegate_->GetNumLeafAdvanceCalls(); +} + +std::string DocHitInfoIteratorPropertyInSchema::ToString() const { + return absl_ports::StrCat("(", absl_ports::StrJoin(target_properties_, ","), + "): ", delegate_->ToString()); +} + +} // namespace lib +} // namespace icing diff --git a/icing/index/iterator/doc-hit-info-iterator-property-in-schema.h b/icing/index/iterator/doc-hit-info-iterator-property-in-schema.h new file mode 100644 index 0000000..35b87e1 --- /dev/null +++ b/icing/index/iterator/doc-hit-info-iterator-property-in-schema.h @@ -0,0 +1,76 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_PROPERTY_IN_SCHEMA_H_ +#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_PROPERTY_IN_SCHEMA_H_ + +#include <cstdint> +#include <memory> +#include <string> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/schema/schema-store.h" +#include "icing/store/document-store.h" + +namespace icing { +namespace lib { + +// An iterator that helps filter for DocHitInfos whose schemas define the +// properties named in target_properties_. +class DocHitInfoIteratorPropertyInSchema : public DocHitInfoIterator { + public: + // Does not take any ownership, and all pointers must refer to valid objects + // that outlive the one constructed. The delegate should be at minimum be + // a DocHitInfoIteratorAllDocumentId, but other optimizations are possible, + // cf. go/icing-property-in-schema-existence. + explicit DocHitInfoIteratorPropertyInSchema( + std::unique_ptr<DocHitInfoIterator> delegate, + const DocumentStore* document_store, const SchemaStore* schema_store, + std::set<std::string> target_sections); + + libtextclassifier3::Status Advance() override; + + libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override; + + int32_t GetNumBlocksInspected() const override; + + int32_t GetNumLeafAdvanceCalls() const override; + + std::string ToString() const override; + + void PopulateMatchedTermsStats( + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { + if (doc_hit_info_.document_id() == kInvalidDocumentId) { + // Current hit isn't valid, return. 
+ return; + } + delegate_->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); + } + + private: + std::unique_ptr<DocHitInfoIterator> delegate_; + const DocumentStore& document_store_; + const SchemaStore& schema_store_; + + std::set<std::string> target_properties_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_PROPERTY_IN_SCHEMA_H_ diff --git a/icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc b/icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc new file mode 100644 index 0000000..9bffeeb --- /dev/null +++ b/icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc @@ -0,0 +1,263 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/index/iterator/doc-hit-info-iterator-property-in-schema.h" + +#include <memory> +#include <string> +#include <utility> +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/document-builder.h" +#include "icing/file/filesystem.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/iterator/doc-hit-info-iterator-all-document-id.h" +#include "icing/index/iterator/doc-hit-info-iterator-test-util.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/proto/document.pb.h" +#include "icing/proto/schema.pb.h" +#include "icing/schema-builder.h" +#include "icing/schema/schema-store.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" +#include "icing/store/document-store.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/fake-clock.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::IsEmpty; + +class DocHitInfoIteratorPropertyInSchemaTest : public ::testing::Test { + protected: + DocHitInfoIteratorPropertyInSchemaTest() + : test_dir_(GetTestTempDir() + "/icing") {} + + void SetUp() override { + filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); + document1_ = DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("email") + .Build(); + document2_ = + DocumentBuilder().SetKey("namespace", "uri2").SetSchema("note").Build(); + + indexed_section_0 = "indexedSection0"; + unindexed_section_1 = "unindexedSection1"; + not_defined_section_2 = "notDefinedSection2"; + + schema_ = + SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType("email") + // Add an indexed property so we generate section + // metadata on it + .AddProperty(PropertyConfigBuilder() + .SetName(indexed_section_0) + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + 
.AddProperty(PropertyConfigBuilder() + .SetName(unindexed_section_1) + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("note").AddProperty( + PropertyConfigBuilder() + .SetName(unindexed_section_1) + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + schema_store_, + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema_, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); + document_store_ = std::move(create_result.document_store); + } + + void TearDown() override { + document_store_.reset(); + schema_store_.reset(); + filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); + } + + std::unique_ptr<SchemaStore> schema_store_; + std::unique_ptr<DocumentStore> document_store_; + const Filesystem filesystem_; + const std::string test_dir_; + std::string indexed_section_0; + std::string unindexed_section_1; + std::string not_defined_section_2; + SchemaProto schema_; + DocumentProto document1_; + DocumentProto document2_; + FakeClock fake_clock_; +}; + +TEST_F(DocHitInfoIteratorPropertyInSchemaTest, + AdvanceToDocumentWithIndexedProperty) { + // Populate the DocumentStore's FilterCache with this document's data + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, + document_store_->Put(document1_)); + + auto original_iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>( + document_store_->num_documents()); + + DocHitInfoIteratorPropertyInSchema property_defined_iterator( + 
std::move(original_iterator), document_store_.get(), schema_store_.get(), + /*target_sections=*/{indexed_section_0}); + + EXPECT_THAT(GetDocumentIds(&property_defined_iterator), + ElementsAre(document_id)); + + EXPECT_FALSE(property_defined_iterator.Advance().ok()); +} + +TEST_F(DocHitInfoIteratorPropertyInSchemaTest, + AdvanceToDocumentWithUnindexedProperty) { + // Populate the DocumentStore's FilterCache with this document's data + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, + document_store_->Put(document1_)); + + auto original_iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>( + document_store_->num_documents()); + + DocHitInfoIteratorPropertyInSchema property_defined_iterator( + std::move(original_iterator), document_store_.get(), schema_store_.get(), + /*target_sections=*/{unindexed_section_1}); + + EXPECT_THAT(GetDocumentIds(&property_defined_iterator), + ElementsAre(document_id)); + + EXPECT_FALSE(property_defined_iterator.Advance().ok()); +} + +TEST_F(DocHitInfoIteratorPropertyInSchemaTest, NoMatchWithUndefinedProperty) { + ICING_EXPECT_OK(document_store_->Put(document1_)); + + auto original_iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>( + document_store_->num_documents()); + + DocHitInfoIteratorPropertyInSchema property_defined_iterator( + std::move(original_iterator), document_store_.get(), schema_store_.get(), + /*target_sections=*/{not_defined_section_2}); + EXPECT_FALSE(property_defined_iterator.Advance().ok()); +} + +TEST_F(DocHitInfoIteratorPropertyInSchemaTest, + CorrectlySetsSectionIdMasksAndPopulatesTermMatchInfo) { + // Populate the DocumentStore's FilterCache with this document's data + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, + document_store_->Put(document1_)); + + // Arbitrary section ids for the documents in the DocHitInfoIterators. + // Created to test correct section_id_mask behavior. 
+ SectionIdMask original_section_id_mask = 0b00000101; // hits in sections 0, 2 + + DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(document_id); + doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1); + doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2); + + // Create a hit that was found in the indexed section + std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos = {doc_hit_info1}; + + auto original_iterator = + std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "hi"); + original_iterator->set_hit_intersect_section_ids_mask( + original_section_id_mask); + + DocHitInfoIteratorPropertyInSchema property_defined_iterator( + std::move(original_iterator), document_store_.get(), schema_store_.get(), + /*target_sections=*/{indexed_section_0}); + + std::vector<TermMatchInfo> matched_terms_stats; + property_defined_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); + + ICING_EXPECT_OK(property_defined_iterator.Advance()); + EXPECT_THAT(property_defined_iterator.doc_hit_info().document_id(), + Eq(document_id)); + + // The expected mask is the same as the original mask, since the iterator + // should treat it as a pass-through. 
+ SectionIdMask expected_section_id_mask = original_section_id_mask; + EXPECT_EQ(property_defined_iterator.hit_intersect_section_ids_mask(), + expected_section_id_mask); + + property_defined_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + std::unordered_map<SectionId, Hit::TermFrequency> + expected_section_ids_tf_map = {{0, 1}, {2, 2}}; + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + "hi", expected_section_ids_tf_map))); + + EXPECT_FALSE(property_defined_iterator.Advance().ok()); +} + +TEST_F(DocHitInfoIteratorPropertyInSchemaTest, + TrimRightMostNodeResultsInError) { + auto original_iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>( + document_store_->num_documents()); + + DocHitInfoIteratorPropertyInSchema property_defined_iterator( + std::move(original_iterator), document_store_.get(), schema_store_.get(), + /*target_sections=*/{indexed_section_0}); + + EXPECT_THAT(std::move(property_defined_iterator).TrimRightMostNode(), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(DocHitInfoIteratorPropertyInSchemaTest, + FindPropertyDefinedByMultipleTypes) { + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + document_store_->Put(document1_)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + document_store_->Put(document2_)); + auto original_iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>( + document_store_->num_documents()); + + DocHitInfoIteratorPropertyInSchema property_defined_iterator( + std::move(original_iterator), document_store_.get(), schema_store_.get(), + /*target_sections=*/{unindexed_section_1}); + + EXPECT_THAT(GetDocumentIds(&property_defined_iterator), + ElementsAre(document_id2, document_id1)); + + EXPECT_FALSE(property_defined_iterator.Advance().ok()); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc 
b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc index 60b9a12..78f4d34 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc @@ -95,7 +95,9 @@ class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - ICING_ASSERT_OK(schema_store_->SetSchema(schema_)); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema_, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, diff --git a/icing/index/iterator/doc-hit-info-iterator.h b/icing/index/iterator/doc-hit-info-iterator.h index e1f06d0..d8cd3ad 100644 --- a/icing/index/iterator/doc-hit-info-iterator.h +++ b/icing/index/iterator/doc-hit-info-iterator.h @@ -85,10 +85,11 @@ class DocHitInfoIterator { unnormalized_term_length_(unnormalized_term_length) {} }; - // Trim the right-most itertor of the itertor tree. - // This is to support search suggestion for the last terms which is the + // Trim the rightmost iterator of the iterator tree. + // This is to support search suggestions for the last term which is the // right-most node of the root iterator tree. Only support trim the right-most - // node on the AND, AND_NARY, OR, OR_NARY, OR_LEAF and Filter itertor. + // node on the AND, AND_NARY, OR, OR_NARY, OR_LEAF, Filter, and the + // property-in-schema-check iterator. // // After calling this method, this iterator is no longer usable. Please use // the returned iterator. 
diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc index 7df137c..d5e9d57 100644 --- a/icing/index/main/main-index.cc +++ b/icing/index/main/main-index.cc @@ -22,6 +22,7 @@ #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" #include "icing/file/destructible-directory.h" +#include "icing/file/posting_list/flash-index-storage.h" #include "icing/file/posting_list/posting-list-common.h" #include "icing/index/main/posting-list-hit-serializer.h" #include "icing/index/term-id-codec.h" @@ -90,6 +91,10 @@ FindTermResult FindShortestValidTermWithPrefixHits( return result; } +std::string MakeFlashIndexFilename(const std::string& base_dir) { + return base_dir + "/main_index"; +} + } // namespace MainIndex::MainIndex(const std::string& index_directory, @@ -112,12 +117,18 @@ libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> MainIndex::Create( return main_index; } +/* static */ libtextclassifier3::StatusOr<int> MainIndex::ReadFlashIndexMagic( + const Filesystem* filesystem, const std::string& index_directory) { + return FlashIndexStorage::ReadHeaderMagic( + filesystem, MakeFlashIndexFilename(index_directory)); +} + // TODO(b/139087650) : Migrate off of IcingFilesystem. 
libtextclassifier3::Status MainIndex::Init() { if (!filesystem_->CreateDirectoryRecursively(base_dir_.c_str())) { return absl_ports::InternalError("Unable to create main index directory."); } - std::string flash_index_file = base_dir_ + "/main_index"; + std::string flash_index_file = MakeFlashIndexFilename(base_dir_); ICING_ASSIGN_OR_RETURN( FlashIndexStorage flash_index, FlashIndexStorage::Create(flash_index_file, filesystem_, diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h index e181330..9e570d5 100644 --- a/icing/index/main/main-index.h +++ b/icing/index/main/main-index.h @@ -48,6 +48,16 @@ class MainIndex { const std::string& index_directory, const Filesystem* filesystem, const IcingFilesystem* icing_filesystem); + // Reads the magic value from the header of an existing flash index storage + // file. We need this during Icing initialization to determine the version. + // + // RETURNS: + // - On success, a valid magic value. + // - NOT_FOUND if the flash index doesn't exist. + // - INTERNAL on I/O error. + static libtextclassifier3::StatusOr<int> ReadFlashIndexMagic( + const Filesystem* filesystem, const std::string& index_directory); + // Get a PostingListHitAccessor that holds the posting list chain for 'term'. 
// // RETURNS: diff --git a/icing/index/numeric/integer-index-storage.cc b/icing/index/numeric/integer-index-storage.cc index f3901e1..5165040 100644 --- a/icing/index/numeric/integer-index-storage.cc +++ b/icing/index/numeric/integer-index-storage.cc @@ -292,12 +292,17 @@ libtextclassifier3::Status IntegerIndexStorageIterator::Advance() { // Merge sections with same document_id into a single DocHitInfo while (!pq_.empty() && pq_.top()->GetCurrentBasicHit().document_id() == document_id) { - doc_hit_info_.UpdateSection(pq_.top()->GetCurrentBasicHit().section_id()); - BucketPostingListIterator* bucket_itr = pq_.top(); pq_.pop(); - if (bucket_itr->AdvanceAndFilter(key_lower_, key_upper_).ok()) { + libtextclassifier3::Status advance_status; + do { + doc_hit_info_.UpdateSection( + bucket_itr->GetCurrentBasicHit().section_id()); + advance_status = bucket_itr->AdvanceAndFilter(key_lower_, key_upper_); + } while (advance_status.ok() && + bucket_itr->GetCurrentBasicHit().document_id() == document_id); + if (advance_status.ok()) { pq_.push(bucket_itr); } } diff --git a/icing/index/numeric/integer-index-storage_benchmark.cc b/icing/index/numeric/integer-index-storage_benchmark.cc index 54b19c3..27f35d9 100644 --- a/icing/index/numeric/integer-index-storage_benchmark.cc +++ b/icing/index/numeric/integer-index-storage_benchmark.cc @@ -12,22 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include <algorithm> #include <cstdint> +#include <limits> #include <memory> #include <string> #include <unordered_map> +#include <utility> #include <vector> +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "testing/base/public/benchmark.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/file/destructible-directory.h" #include "icing/file/filesystem.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/numeric/integer-index-storage.h" #include "icing/index/numeric/posting-list-integer-index-serializer.h" +#include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/numeric/normal-distribution-number-generator.h" #include "icing/testing/numeric/number-generator.h" #include "icing/testing/numeric/uniform-distribution-integer-generator.h" #include "icing/testing/tmp-directory.h" @@ -65,6 +73,7 @@ static constexpr int kDefaultSeed = 12345; enum DistributionTypeEnum { kUniformDistribution, + kNormalDistribution, }; class IntegerIndexStorageBenchmark { @@ -103,6 +112,19 @@ CreateIntegerGenerator(DistributionTypeEnum distribution_type, int seed, return std::make_unique<UniformDistributionIntegerGenerator<int64_t>>( seed, /*range_lower=*/0, /*range_upper=*/static_cast<int64_t>(num_keys) * 10 - 1); + case DistributionTypeEnum::kNormalDistribution: + // Normal distribution with mean = 0 and stddev = num_keys / 1024. 
+ // - keys in range [-1 * stddev, 1 * stddev]: 68.2% + // - keys in range [-2 * stddev, 2 * stddev]: 95.4% + // - keys in range [-3 * stddev, 3 * stddev]: 99.7% + // + // - When generating num_keys integers, 68.2% of them will be in range + // [-num_keys / 1024, num_keys / 1024] + // - Each number in this range will be sampled (num_keys * 0.682) / + // ((num_keys / 1024) * 2) = 349 times on average and become + // "single-range bucket". + return std::make_unique<NormalDistributionNumberGenerator<int64_t>>( + seed, /*mean=*/0.0, /*stddev=*/num_keys / 1024.0); default: return absl_ports::InvalidArgumentError("Unknown type"); } @@ -155,7 +177,18 @@ BENCHMARK(BM_Index) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19) - ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20); + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20); void BM_BatchIndex(benchmark::State& state) { DistributionTypeEnum distribution_type = @@ -203,7 +236,18 @@ BENCHMARK(BM_BatchIndex) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18) 
->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19) - ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20); + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20); void BM_ExactQuery(benchmark::State& state) { DistributionTypeEnum distribution_type = @@ -269,7 +313,81 @@ BENCHMARK(BM_ExactQuery) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19) - ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20); + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19) + 
->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20); + +void BM_RangeQueryAll(benchmark::State& state) { + DistributionTypeEnum distribution_type = + static_cast<DistributionTypeEnum>(state.range(0)); + int num_keys = state.range(1); + + IntegerIndexStorageBenchmark benchmark; + benchmark.filesystem.DeleteDirectoryRecursively( + benchmark.working_path.c_str()); + DestructibleDirectory ddir(&benchmark.filesystem, benchmark.working_path); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(benchmark.filesystem, benchmark.working_path, + IntegerIndexStorage::Options(), + &benchmark.posting_list_serializer)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<NumberGenerator<int64_t>> generator, + CreateIntegerGenerator(distribution_type, kDefaultSeed, num_keys)); + for (int i = 0; i < num_keys; ++i) { + ICING_ASSERT_OK(storage->AddKeys(static_cast<DocumentId>(i), + kDefaultSectionId, + {generator->Generate()})); + } + ICING_ASSERT_OK(storage->PersistToDisk()); + + for (auto _ : state) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> iterator, + storage->GetIterator( + /*query_key_lower=*/std::numeric_limits<int64_t>::min(), + /*query_key_upper=*/std::numeric_limits<int64_t>::max())); + std::vector<DocHitInfo> data; + while (iterator->Advance().ok()) { + data.push_back(iterator->doc_hit_info()); + } + + ASSERT_THAT(data, SizeIs(num_keys)); + } +} +BENCHMARK(BM_RangeQueryAll) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 10) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 11) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 12) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 13) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 14) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 15) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 16) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17) 
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20); } // namespace diff --git a/icing/index/numeric/integer-index_test.cc b/icing/index/numeric/integer-index_test.cc index ec7f55b..92433e1 100644 --- a/icing/index/numeric/integer-index_test.cc +++ b/icing/index/numeric/integer-index_test.cc @@ -389,7 +389,10 @@ TYPED_TEST(NumericIndexIntegerTest, WildcardStorageQuery) { .AddProperty(PropertyConfigBuilder(int_property_config) .SetName("desiredProperty"))) .Build(); - ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(this->schema_store_->SetSchema( + schema, + /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); // Put 11 docs of "TypeA" into the document store. 
DocumentProto doc = @@ -1492,7 +1495,10 @@ TEST_F(IntegerIndexTest, WildcardStoragePersistenceQuery) { .AddProperty(PropertyConfigBuilder(int_property_config) .SetName("desiredProperty"))) .Build(); - ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(this->schema_store_->SetSchema( + schema, + /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); // Ids are assigned alphabetically, so the property ids are: // TypeA.desiredProperty = 0 @@ -1862,7 +1868,10 @@ TEST_F(IntegerIndexTest, WildcardStorageWorksAfterOptimize) { .AddProperty(PropertyConfigBuilder(int_property_config) .SetName("desiredProperty"))) .Build(); - ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(this->schema_store_->SetSchema( + schema, + /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); // Ids are assigned alphabetically, so the property ids are: // TypeA.desiredProperty = 0 @@ -2145,7 +2154,10 @@ TEST_F(IntegerIndexTest, WildcardStorageAvailableIndicesAfterOptimize) { .AddProperty(PropertyConfigBuilder(int_property_config) .SetName("undesiredProperty"))) .Build(); - ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(this->schema_store_->SetSchema( + schema, + /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); // Ids are assigned alphabetically, so the property ids are: // TypeA.desiredProperty = 0 diff --git a/icing/index/string-section-indexing-handler.cc b/icing/index/string-section-indexing-handler.cc index a992568..69b8889 100644 --- a/icing/index/string-section-indexing-handler.cc +++ b/icing/index/string-section-indexing-handler.cc @@ -30,6 +30,8 @@ #include "icing/store/document-id.h" #include "icing/transform/normalizer.h" #include "icing/util/clock.h" +#include "icing/util/logging.h" +#include "icing/util/status-macros.h" #include "icing/util/tokenized-document.h" namespace icing { @@ 
-121,7 +123,8 @@ libtextclassifier3::Status StringSectionIndexingHandler::Handle( } if (put_document_stats != nullptr) { - // TODO(b/259744228): set term index latency. + put_document_stats->set_term_index_latency_ms( + index_timer->GetElapsedMilliseconds()); put_document_stats->mutable_tokenization_stats()->set_num_tokens_indexed( num_tokens); } |