diff options
author | Tim Barron <tjbarron@google.com> | 2023-05-11 06:22:44 +0000 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2023-05-11 15:45:06 +0000 |
commit | fc9e6aac9c62d4546cb25548e1bbb317b7a4fd9a (patch) | |
tree | 061f4d194144ae4d5f81e6b56c148ddb5495e694 /icing/index/numeric | |
parent | a7d57e98ea7168d66cf01ace85598e33d5e9e5db (diff) | |
download | icing-fc9e6aac9c62d4546cb25548e1bbb317b7a4fd9a.tar.gz |
Update Icing from upstream.
Descriptions:
========================================================================
Modify the definition of propertyDefined:
========================================================================
Remove default args in SchemaStore::SetSchema and fix calls
========================================================================
Add allow_circular_schema_definitions flag
========================================================================
Onboard version detection to Icing
========================================================================
Add version util to help read/write version info
========================================================================
Add support for the overlay schema.
========================================================================
Allow cycles in schema-property-iterator
========================================================================
Add joinable properties into schema definition cycle restrictions.
========================================================================
Loosen circular references restriction for Schema Definitions.
========================================================================
Implement BackupSchemaProducer to generate a backup schema
========================================================================
Minor fix: remove a redundant log
========================================================================
Allow schema types to inherit from more than one parent
========================================================================
allow nested document properties to accept documents of subtype
========================================================================
Support polymorphism for Icing projection in Search and Get API
========================================================================
Add max_joined_child_per_parent into ResultSpec and change behavior
========================================================================
Support Icing schema type polymorphism for the search filter API
========================================================================
Verify that every child type's property set has included all compatible properties from parent types
========================================================================
Add individual type index latency
========================================================================
Build the iterator node for the propertyDefined() custom function
========================================================================
Advance all hits with same doc id from and merge sections once for the same bucket iter
========================================================================
Introduce DocHitInfoIteratorPropertyInSchema for property existence check
========================================================================
Add SchemaUtil::BuildTransitiveInheritanceGraph to build an inheritance map from schema
========================================================================
Introduce a lookup method for a property defined in a schema
========================================================================
Rollback of: Allow LanguageSegmenter::Iterators to declare AccessType.
========================================================================
Adds join info to QueryStatsProto
========================================================================
Bug:280698419
Bug:280698125
Bug:280698121
Bug:280697513
Bug:276349029
Bug:272145329
Bug:270102295
Bug:269295094
Bug:268680462
Bug:265304217
Bug:259744228
Bug:259743562
Bug:256022027
Change-Id: I54cd1d22121c314f8c238d2d49f0809165dc0ca3
Diffstat (limited to 'icing/index/numeric')
-rw-r--r-- | icing/index/numeric/integer-index-storage.cc | 11 | ||||
-rw-r--r-- | icing/index/numeric/integer-index-storage_benchmark.cc | 124 | ||||
-rw-r--r-- | icing/index/numeric/integer-index_test.cc | 20 |
3 files changed, 145 insertions, 10 deletions
diff --git a/icing/index/numeric/integer-index-storage.cc b/icing/index/numeric/integer-index-storage.cc index f3901e1..5165040 100644 --- a/icing/index/numeric/integer-index-storage.cc +++ b/icing/index/numeric/integer-index-storage.cc @@ -292,12 +292,17 @@ libtextclassifier3::Status IntegerIndexStorageIterator::Advance() { // Merge sections with same document_id into a single DocHitInfo while (!pq_.empty() && pq_.top()->GetCurrentBasicHit().document_id() == document_id) { - doc_hit_info_.UpdateSection(pq_.top()->GetCurrentBasicHit().section_id()); - BucketPostingListIterator* bucket_itr = pq_.top(); pq_.pop(); - if (bucket_itr->AdvanceAndFilter(key_lower_, key_upper_).ok()) { + libtextclassifier3::Status advance_status; + do { + doc_hit_info_.UpdateSection( + bucket_itr->GetCurrentBasicHit().section_id()); + advance_status = bucket_itr->AdvanceAndFilter(key_lower_, key_upper_); + } while (advance_status.ok() && + bucket_itr->GetCurrentBasicHit().document_id() == document_id); + if (advance_status.ok()) { pq_.push(bucket_itr); } } diff --git a/icing/index/numeric/integer-index-storage_benchmark.cc b/icing/index/numeric/integer-index-storage_benchmark.cc index 54b19c3..27f35d9 100644 --- a/icing/index/numeric/integer-index-storage_benchmark.cc +++ b/icing/index/numeric/integer-index-storage_benchmark.cc @@ -12,22 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include <algorithm> #include <cstdint> +#include <limits> #include <memory> #include <string> #include <unordered_map> +#include <utility> #include <vector> +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "testing/base/public/benchmark.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/file/destructible-directory.h" #include "icing/file/filesystem.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/numeric/integer-index-storage.h" #include "icing/index/numeric/posting-list-integer-index-serializer.h" +#include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/numeric/normal-distribution-number-generator.h" #include "icing/testing/numeric/number-generator.h" #include "icing/testing/numeric/uniform-distribution-integer-generator.h" #include "icing/testing/tmp-directory.h" @@ -65,6 +73,7 @@ static constexpr int kDefaultSeed = 12345; enum DistributionTypeEnum { kUniformDistribution, + kNormalDistribution, }; class IntegerIndexStorageBenchmark { @@ -103,6 +112,19 @@ CreateIntegerGenerator(DistributionTypeEnum distribution_type, int seed, return std::make_unique<UniformDistributionIntegerGenerator<int64_t>>( seed, /*range_lower=*/0, /*range_upper=*/static_cast<int64_t>(num_keys) * 10 - 1); + case DistributionTypeEnum::kNormalDistribution: + // Normal distribution with mean = 0 and stddev = num_keys / 1024. + // - keys in range [-1 * stddev, 1 * stddev]: 68.2% + // - keys in range [-2 * stddev, 2 * stddev]: 95.4% + // - keys in range [-3 * stddev, 3 * stddev]: 99.7% + // + // - When generating num_keys integers, 68.2% of them will be in range + // [-num_keys / 1024, num_keys / 1024] + // - Each number in this range will be sampled (num_keys * 0.682) / + // ((num_keys / 1024) * 2) = 349 times on average and become + // "single-range bucket". + return std::make_unique<NormalDistributionNumberGenerator<int64_t>>( + seed, /*mean=*/0.0, /*stddev=*/num_keys / 1024.0); default: return absl_ports::InvalidArgumentError("Unknown type"); } @@ -155,7 +177,18 @@ BENCHMARK(BM_Index) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19) - ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20); + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20); void BM_BatchIndex(benchmark::State& state) { DistributionTypeEnum distribution_type = @@ -203,7 +236,18 @@ BENCHMARK(BM_BatchIndex) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19) - ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20); + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20); void BM_ExactQuery(benchmark::State& state) { DistributionTypeEnum distribution_type = @@ -269,7 +313,81 @@ BENCHMARK(BM_ExactQuery) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18) ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19) - ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20); + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20); + +void BM_RangeQueryAll(benchmark::State& state) { + DistributionTypeEnum distribution_type = + static_cast<DistributionTypeEnum>(state.range(0)); + int num_keys = state.range(1); + + IntegerIndexStorageBenchmark benchmark; + benchmark.filesystem.DeleteDirectoryRecursively( + benchmark.working_path.c_str()); + DestructibleDirectory ddir(&benchmark.filesystem, benchmark.working_path); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(benchmark.filesystem, benchmark.working_path, + IntegerIndexStorage::Options(), + &benchmark.posting_list_serializer)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<NumberGenerator<int64_t>> generator, + CreateIntegerGenerator(distribution_type, kDefaultSeed, num_keys)); + for (int i = 0; i < num_keys; ++i) { + ICING_ASSERT_OK(storage->AddKeys(static_cast<DocumentId>(i), + kDefaultSectionId, + {generator->Generate()})); + } + ICING_ASSERT_OK(storage->PersistToDisk()); + + for (auto _ : state) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> iterator, + storage->GetIterator( + /*query_key_lower=*/std::numeric_limits<int64_t>::min(), + /*query_key_upper=*/std::numeric_limits<int64_t>::max())); + std::vector<DocHitInfo> data; + while (iterator->Advance().ok()) { + data.push_back(iterator->doc_hit_info()); + } + + ASSERT_THAT(data, SizeIs(num_keys)); + } +} +BENCHMARK(BM_RangeQueryAll) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 10) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 11) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 12) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 13) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 14) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 15) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 16) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19) + ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19) + ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20); } // namespace diff --git a/icing/index/numeric/integer-index_test.cc b/icing/index/numeric/integer-index_test.cc index ec7f55b..92433e1 100644 --- a/icing/index/numeric/integer-index_test.cc +++ b/icing/index/numeric/integer-index_test.cc @@ -389,7 +389,10 @@ TYPED_TEST(NumericIndexIntegerTest, WildcardStorageQuery) { .AddProperty(PropertyConfigBuilder(int_property_config) .SetName("desiredProperty"))) .Build(); - ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(this->schema_store_->SetSchema( + schema, + /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); // Put 11 docs of "TypeA" into the document store. DocumentProto doc = @@ -1492,7 +1495,10 @@ TEST_F(IntegerIndexTest, WildcardStoragePersistenceQuery) { .AddProperty(PropertyConfigBuilder(int_property_config) .SetName("desiredProperty"))) .Build(); - ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(this->schema_store_->SetSchema( + schema, + /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); // Ids are assigned alphabetically, so the property ids are: // TypeA.desiredProperty = 0 @@ -1862,7 +1868,10 @@ TEST_F(IntegerIndexTest, WildcardStorageWorksAfterOptimize) { .AddProperty(PropertyConfigBuilder(int_property_config) .SetName("desiredProperty"))) .Build(); - ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(this->schema_store_->SetSchema( + schema, + /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); // Ids are assigned alphabetically, so the property ids are: // TypeA.desiredProperty = 0 @@ -2145,7 +2154,10 @@ TEST_F(IntegerIndexTest, WildcardStorageAvailableIndicesAfterOptimize) { .AddProperty(PropertyConfigBuilder(int_property_config) .SetName("undesiredProperty"))) .Build(); - ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + ICING_ASSERT_OK(this->schema_store_->SetSchema( + schema, + /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); // Ids are assigned alphabetically, so the property ids are: // TypeA.desiredProperty = 0 |