aboutsummaryrefslogtreecommitdiff
path: root/icing/index/numeric
diff options
context:
space:
mode:
authorTim Barron <tjbarron@google.com>2023-05-11 06:22:44 +0000
committerTim Barron <tjbarron@google.com>2023-05-11 15:45:06 +0000
commitfc9e6aac9c62d4546cb25548e1bbb317b7a4fd9a (patch)
tree061f4d194144ae4d5f81e6b56c148ddb5495e694 /icing/index/numeric
parenta7d57e98ea7168d66cf01ace85598e33d5e9e5db (diff)
downloadicing-fc9e6aac9c62d4546cb25548e1bbb317b7a4fd9a.tar.gz
Update Icing from upstream.
Descriptions: ======================================================================== Modify the definition of propertyDefined: ======================================================================== Remove default args in SchemaStore::SetSchema and fix calls ======================================================================== Add allow_circular_schema_definitions flag ======================================================================== Onboard version detection to Icing ======================================================================== Add version util to help read/write version info ======================================================================== Add support for the overlay schema. ======================================================================== Allow cycles in schema-property-iterator ======================================================================== Add joinable properties into schema definition cycle restrictions. ======================================================================== Loosen circular references restriction for Schema Definitions. ======================================================================== Implement BackupSchemaProducer to generate a backup schema ======================================================================== Minor fix: remove a redundant log ======================================================================== Allow schema types to inherit from more than one parent ======================================================================== allow nested document properties to accept documents of subtype ======================================================================== Support polymorphism for Icing projection in Search and Get API ======================================================================== Add max_joined_child_per_parent into ResultSpec and change behavior ======================================================================== Support Icing schema type polymorphism for the search filter API ======================================================================== Verify that every child type's property set has included all compatible properties from parent types ======================================================================== Add individual type index latency ======================================================================== Build the iterator node for the propertyDefined() custom function ======================================================================== Advance all hits with same doc id from and merge sections once for the same bucket iter ======================================================================== Introduce DocHitInfoIteratorPropertyInSchema for property existence check ======================================================================== Add SchemaUtil::BuildTransitiveInheritanceGraph to build an inheritance map from schema ======================================================================== Introduce a lookup method for a property defined in a schema ======================================================================== Rollback of: Allow LanguageSegmenter::Iterators to declare AccessType. ======================================================================== Adds join info to QueryStatsProto ======================================================================== Bug:280698419 Bug:280698125 Bug:280698121 Bug:280697513 Bug:276349029 Bug:272145329 Bug:270102295 Bug:269295094 Bug:268680462 Bug:265304217 Bug:259744228 Bug:259743562 Bug:256022027 Change-Id: I54cd1d22121c314f8c238d2d49f0809165dc0ca3
Diffstat (limited to 'icing/index/numeric')
-rw-r--r--icing/index/numeric/integer-index-storage.cc11
-rw-r--r--icing/index/numeric/integer-index-storage_benchmark.cc124
-rw-r--r--icing/index/numeric/integer-index_test.cc20
3 files changed, 145 insertions, 10 deletions
diff --git a/icing/index/numeric/integer-index-storage.cc b/icing/index/numeric/integer-index-storage.cc
index f3901e1..5165040 100644
--- a/icing/index/numeric/integer-index-storage.cc
+++ b/icing/index/numeric/integer-index-storage.cc
@@ -292,12 +292,17 @@ libtextclassifier3::Status IntegerIndexStorageIterator::Advance() {
// Merge sections with same document_id into a single DocHitInfo
while (!pq_.empty() &&
pq_.top()->GetCurrentBasicHit().document_id() == document_id) {
- doc_hit_info_.UpdateSection(pq_.top()->GetCurrentBasicHit().section_id());
-
BucketPostingListIterator* bucket_itr = pq_.top();
pq_.pop();
- if (bucket_itr->AdvanceAndFilter(key_lower_, key_upper_).ok()) {
+ libtextclassifier3::Status advance_status;
+ do {
+ doc_hit_info_.UpdateSection(
+ bucket_itr->GetCurrentBasicHit().section_id());
+ advance_status = bucket_itr->AdvanceAndFilter(key_lower_, key_upper_);
+ } while (advance_status.ok() &&
+ bucket_itr->GetCurrentBasicHit().document_id() == document_id);
+ if (advance_status.ok()) {
pq_.push(bucket_itr);
}
}
diff --git a/icing/index/numeric/integer-index-storage_benchmark.cc b/icing/index/numeric/integer-index-storage_benchmark.cc
index 54b19c3..27f35d9 100644
--- a/icing/index/numeric/integer-index-storage_benchmark.cc
+++ b/icing/index/numeric/integer-index-storage_benchmark.cc
@@ -12,22 +12,30 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include <algorithm>
#include <cstdint>
+#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
+#include <utility>
#include <vector>
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/destructible-directory.h"
#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/index/numeric/integer-index-storage.h"
#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/numeric/normal-distribution-number-generator.h"
#include "icing/testing/numeric/number-generator.h"
#include "icing/testing/numeric/uniform-distribution-integer-generator.h"
#include "icing/testing/tmp-directory.h"
@@ -65,6 +73,7 @@ static constexpr int kDefaultSeed = 12345;
enum DistributionTypeEnum {
kUniformDistribution,
+ kNormalDistribution,
};
class IntegerIndexStorageBenchmark {
@@ -103,6 +112,19 @@ CreateIntegerGenerator(DistributionTypeEnum distribution_type, int seed,
return std::make_unique<UniformDistributionIntegerGenerator<int64_t>>(
seed, /*range_lower=*/0,
/*range_upper=*/static_cast<int64_t>(num_keys) * 10 - 1);
+ case DistributionTypeEnum::kNormalDistribution:
+ // Normal distribution with mean = 0 and stddev = num_keys / 1024.
+ // - keys in range [-1 * stddev, 1 * stddev]: 68.2%
+ // - keys in range [-2 * stddev, 2 * stddev]: 95.4%
+ // - keys in range [-3 * stddev, 3 * stddev]: 99.7%
+ //
+ // - When generating num_keys integers, 68.2% of them will be in range
+ // [-num_keys / 1024, num_keys / 1024]
+ // - Each number in this range will be sampled (num_keys * 0.682) /
+ // ((num_keys / 1024) * 2) = 349 times on average and become
+ // "single-range bucket".
+ return std::make_unique<NormalDistributionNumberGenerator<int64_t>>(
+ seed, /*mean=*/0.0, /*stddev=*/num_keys / 1024.0);
default:
return absl_ports::InvalidArgumentError("Unknown type");
}
@@ -155,7 +177,18 @@ BENCHMARK(BM_Index)
->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17)
->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18)
->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19)
- ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20);
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20);
void BM_BatchIndex(benchmark::State& state) {
DistributionTypeEnum distribution_type =
@@ -203,7 +236,18 @@ BENCHMARK(BM_BatchIndex)
->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17)
->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18)
->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19)
- ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20);
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20);
void BM_ExactQuery(benchmark::State& state) {
DistributionTypeEnum distribution_type =
@@ -269,7 +313,81 @@ BENCHMARK(BM_ExactQuery)
->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17)
->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18)
->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19)
- ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20);
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20);
+
+void BM_RangeQueryAll(benchmark::State& state) {
+ DistributionTypeEnum distribution_type =
+ static_cast<DistributionTypeEnum>(state.range(0));
+ int num_keys = state.range(1);
+
+ IntegerIndexStorageBenchmark benchmark;
+ benchmark.filesystem.DeleteDirectoryRecursively(
+ benchmark.working_path.c_str());
+ DestructibleDirectory ddir(&benchmark.filesystem, benchmark.working_path);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(benchmark.filesystem, benchmark.working_path,
+ IntegerIndexStorage::Options(),
+ &benchmark.posting_list_serializer));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumberGenerator<int64_t>> generator,
+ CreateIntegerGenerator(distribution_type, kDefaultSeed, num_keys));
+ for (int i = 0; i < num_keys; ++i) {
+ ICING_ASSERT_OK(storage->AddKeys(static_cast<DocumentId>(i),
+ kDefaultSectionId,
+ {generator->Generate()}));
+ }
+ ICING_ASSERT_OK(storage->PersistToDisk());
+
+ for (auto _ : state) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> iterator,
+ storage->GetIterator(
+ /*query_key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*query_key_upper=*/std::numeric_limits<int64_t>::max()));
+ std::vector<DocHitInfo> data;
+ while (iterator->Advance().ok()) {
+ data.push_back(iterator->doc_hit_info());
+ }
+
+ ASSERT_THAT(data, SizeIs(num_keys));
+ }
+}
+BENCHMARK(BM_RangeQueryAll)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 10)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 11)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 12)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 13)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 14)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 15)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 16)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20);
} // namespace
diff --git a/icing/index/numeric/integer-index_test.cc b/icing/index/numeric/integer-index_test.cc
index ec7f55b..92433e1 100644
--- a/icing/index/numeric/integer-index_test.cc
+++ b/icing/index/numeric/integer-index_test.cc
@@ -389,7 +389,10 @@ TYPED_TEST(NumericIndexIntegerTest, WildcardStorageQuery) {
.AddProperty(PropertyConfigBuilder(int_property_config)
.SetName("desiredProperty")))
.Build();
- ICING_ASSERT_OK(this->schema_store_->SetSchema(schema));
+ ICING_ASSERT_OK(this->schema_store_->SetSchema(
+ schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
// Put 11 docs of "TypeA" into the document store.
DocumentProto doc =
@@ -1492,7 +1495,10 @@ TEST_F(IntegerIndexTest, WildcardStoragePersistenceQuery) {
.AddProperty(PropertyConfigBuilder(int_property_config)
.SetName("desiredProperty")))
.Build();
- ICING_ASSERT_OK(this->schema_store_->SetSchema(schema));
+ ICING_ASSERT_OK(this->schema_store_->SetSchema(
+ schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
// Ids are assigned alphabetically, so the property ids are:
// TypeA.desiredProperty = 0
@@ -1862,7 +1868,10 @@ TEST_F(IntegerIndexTest, WildcardStorageWorksAfterOptimize) {
.AddProperty(PropertyConfigBuilder(int_property_config)
.SetName("desiredProperty")))
.Build();
- ICING_ASSERT_OK(this->schema_store_->SetSchema(schema));
+ ICING_ASSERT_OK(this->schema_store_->SetSchema(
+ schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
// Ids are assigned alphabetically, so the property ids are:
// TypeA.desiredProperty = 0
@@ -2145,7 +2154,10 @@ TEST_F(IntegerIndexTest, WildcardStorageAvailableIndicesAfterOptimize) {
.AddProperty(PropertyConfigBuilder(int_property_config)
.SetName("undesiredProperty")))
.Build();
- ICING_ASSERT_OK(this->schema_store_->SetSchema(schema));
+ ICING_ASSERT_OK(this->schema_store_->SetSchema(
+ schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
// Ids are assigned alphabetically, so the property ids are:
// TypeA.desiredProperty = 0