diff options
Diffstat (limited to 'icing/index')
20 files changed, 3548 insertions, 256 deletions
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index 3a9b4ee..47baabe 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -40,6 +40,8 @@ #include "icing/index/numeric/numeric-index.h" #include "icing/index/string-section-indexing-handler.h" #include "icing/index/term-property-id.h" +#include "icing/join/qualified-id-joinable-property-indexing-handler.h" +#include "icing/join/qualified-id-type-joinable-index.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mock-filesystem.h" #include "icing/portable/platform.h" @@ -51,6 +53,7 @@ #include "icing/schema/schema-util.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/store/document-store.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" #include "icing/testing/icu-data-file-helper.h" @@ -160,7 +163,9 @@ class IndexProcessorTest : public Test { index_dir_ = base_dir_ + "/index"; integer_index_dir_ = base_dir_ + "/integer_index"; + qualified_id_join_index_dir_ = base_dir_ + "/qualified_id_join_index"; schema_store_dir_ = base_dir_ + "/schema_store"; + doc_store_dir_ = base_dir_ + "/doc_store"; Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); ICING_ASSERT_OK_AND_ASSIGN( @@ -169,6 +174,10 @@ class IndexProcessorTest : public Test { ICING_ASSERT_OK_AND_ASSIGN( integer_index_, IntegerIndex::Create(filesystem_, integer_index_dir_)); + ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_, + QualifiedIdTypeJoinableIndex::Create( + filesystem_, qualified_id_join_index_dir_)); + language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); ICING_ASSERT_OK_AND_ASSIGN( lang_segmenter_, @@ -260,6 +269,13 @@ class IndexProcessorTest : public Test { .Build(); ICING_ASSERT_OK(schema_store_->SetSchema(schema)); + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_, + schema_store_.get())); + doc_store_ = std::move(create_result.document_store); + ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<StringSectionIndexingHandler> string_section_indexing_handler, @@ -269,9 +285,16 @@ class IndexProcessorTest : public Test { integer_section_indexing_handler, IntegerSectionIndexingHandler::Create( &fake_clock_, integer_index_.get())); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler> + qualified_id_joinable_property_indexing_handler, + QualifiedIdJoinablePropertyIndexingHandler::Create( + &fake_clock_, qualified_id_join_index_.get())); std::vector<std::unique_ptr<DataIndexingHandler>> handlers; handlers.push_back(std::move(string_section_indexing_handler)); handlers.push_back(std::move(integer_section_indexing_handler)); + handlers.push_back( + std::move(qualified_id_joinable_property_indexing_handler)); index_processor_ = std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_); @@ -281,9 +304,11 @@ class IndexProcessorTest : public Test { void TearDown() override { index_processor_.reset(); + doc_store_.reset(); schema_store_.reset(); normalizer_.reset(); lang_segmenter_.reset(); + qualified_id_join_index_.reset(); integer_index_.reset(); index_.reset(); @@ -298,13 +323,17 @@ class IndexProcessorTest : public Test { std::string base_dir_; std::string index_dir_; std::string integer_index_dir_; + std::string qualified_id_join_index_dir_; std::string schema_store_dir_; + std::string doc_store_dir_; std::unique_ptr<Index> index_; std::unique_ptr<NumericIndex<int64_t>> integer_index_; + std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index_; std::unique_ptr<LanguageSegmenter> lang_segmenter_; std::unique_ptr<Normalizer> normalizer_; std::unique_ptr<SchemaStore> schema_store_; + std::unique_ptr<DocumentStore> doc_store_; std::unique_ptr<IndexProcessor> index_processor_; }; @@ -788,9 +817,16 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) { integer_section_indexing_handler, IntegerSectionIndexingHandler::Create( &fake_clock_, integer_index_.get())); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler> + qualified_id_joinable_property_indexing_handler, + QualifiedIdJoinablePropertyIndexingHandler::Create( + &fake_clock_, qualified_id_join_index_.get())); std::vector<std::unique_ptr<DataIndexingHandler>> handlers; handlers.push_back(std::move(string_section_indexing_handler)); handlers.push_back(std::move(integer_section_indexing_handler)); + handlers.push_back( + std::move(qualified_id_joinable_property_indexing_handler)); IndexProcessor index_processor(std::move(handlers), &fake_clock_, /*recovery_mode=*/true); @@ -1506,10 +1542,10 @@ TEST_F(IndexProcessorTest, IndexableIntegerProperty) { EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DocHitInfoIterator> itr, - integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/1, - /*key_upper=*/5)); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, + integer_index_->GetIterator( + kIndexableIntegerProperty, /*key_lower=*/1, + /*key_upper=*/5, *doc_store_, *schema_store_)); EXPECT_THAT( GetHits(std::move(itr)), @@ -1535,10 +1571,10 @@ TEST_F(IndexProcessorTest, IndexableIntegerPropertyNoMatch) { EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DocHitInfoIterator> itr, - integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/-1, - /*key_upper=*/0)); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, + integer_index_->GetIterator( + kIndexableIntegerProperty, /*key_lower=*/-1, + /*key_upper=*/0, *doc_store_, *schema_store_)); EXPECT_THAT(GetHits(std::move(itr)), IsEmpty()); } diff --git a/icing/index/iterator/doc-hit-info-iterator-not.cc b/icing/index/iterator/doc-hit-info-iterator-not.cc index 1818f08..38b1ded 100644 --- a/icing/index/iterator/doc-hit-info-iterator-not.cc +++ b/icing/index/iterator/doc-hit-info-iterator-not.cc @@ -63,8 +63,8 @@ libtextclassifier3::Status DocHitInfoIteratorNot::Advance() { libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode> DocHitInfoIteratorNot::TrimRightMostNode() && { // Don't generate suggestion if the last operator is NOT. - return absl_ports::UnimplementedError( - "Cannot trim right most node in NOT operator."); + return absl_ports::InvalidArgumentError( + "Cannot generate suggestion if the last term is NOT operator."); } int32_t DocHitInfoIteratorNot::GetNumBlocksInspected() const { diff --git a/icing/index/iterator/doc-hit-info-iterator-not_test.cc b/icing/index/iterator/doc-hit-info-iterator-not_test.cc index 54d6c36..5a8ce2c 100644 --- a/icing/index/iterator/doc-hit-info-iterator-not_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-not_test.cc @@ -163,7 +163,7 @@ TEST(DocHitInfoIteratorNotTest, TrimNotIterator) { DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator), /*document_id_limit=*/5); EXPECT_THAT(std::move(not_iterator).TrimRightMostNode(), - StatusIs(libtextclassifier3::StatusCode::UNIMPLEMENTED)); + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } } // namespace diff --git a/icing/index/numeric/doc-hit-info-iterator-numeric.h b/icing/index/numeric/doc-hit-info-iterator-numeric.h index bf990d1..fc66a1d 100644 --- a/icing/index/numeric/doc-hit-info-iterator-numeric.h +++ b/icing/index/numeric/doc-hit-info-iterator-numeric.h @@ -49,8 +49,8 @@ class DocHitInfoIteratorNumeric : public DocHitInfoIterator { } libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override { - return absl_ports::UnimplementedError( - "Cannot trim right most node in numeric operator."); + return absl_ports::InvalidArgumentError( + "Cannot generate suggestion if the last term is numeric operator."); } int32_t GetNumBlocksInspected() const override { return 0; } diff --git a/icing/index/numeric/dummy-numeric-index.h b/icing/index/numeric/dummy-numeric-index.h index 164866c..7cfb102 100644 --- a/icing/index/numeric/dummy-numeric-index.h +++ b/icing/index/numeric/dummy-numeric-index.h @@ -70,7 +70,8 @@ class DummyNumericIndex : public NumericIndex<T> { } libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator( - std::string_view property_path, T key_lower, T key_upper) const override; + std::string_view property_path, T key_lower, T key_upper, + const DocumentStore&, const SchemaStore&) const override; libtextclassifier3::Status Optimize( const std::vector<DocumentId>& document_id_old_to_new, @@ -93,6 +94,8 @@ class DummyNumericIndex : public NumericIndex<T> { } } + int num_property_indices() const override { return storage_.size(); } + private: class Editor : public NumericIndex<T>::Editor { public: @@ -176,7 +179,6 @@ class DummyNumericIndex : public NumericIndex<T> { DocHitInfo doc_hit_info_; }; - private: explicit DummyNumericIndex(const Filesystem& filesystem, std::string&& working_path) : NumericIndex<T>(filesystem, std::move(working_path), @@ -265,7 +267,8 @@ libtextclassifier3::Status DummyNumericIndex<T>::Iterator::Advance() { template <typename T> libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> DummyNumericIndex<T>::GetIterator(std::string_view property_path, T key_lower, - T key_upper) const { + T key_upper, const DocumentStore&, + const SchemaStore&) const { if (key_lower > key_upper) { return absl_ports::InvalidArgumentError( "key_lower should not be greater than key_upper"); diff --git a/icing/index/numeric/integer-index-bucket-util.cc b/icing/index/numeric/integer-index-bucket-util.cc new file mode 100644 index 0000000..a05baab --- /dev/null +++ b/icing/index/numeric/integer-index-bucket-util.cc @@ -0,0 +1,205 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/numeric/integer-index-bucket-util.h" + +#include <algorithm> +#include <cstdint> +#include <iterator> +#include <limits> +#include <utility> +#include <vector> + +#include "icing/index/numeric/integer-index-data.h" + +namespace icing { +namespace lib { + +namespace integer_index_bucket_util { + +namespace { + +// Helper function to determine if data slice [start, end) forms a "full +// single-range bucket". +// +// Full single-range bucket: keys of all data are identical and # of them exceed +// num_data_threshold. +// +// REQUIRES: data slice [start, end) are sorted by key. +inline bool WouldBeFullSingleRangeBucket( + const std::vector<IntegerIndexData>::iterator& start, + const std::vector<IntegerIndexData>::iterator& end, + int32_t num_data_threshold) { + return std::distance(start, end) > num_data_threshold && + start->key() == (end - 1)->key(); +} + +// Helper function to determine if a bucket is full single-range. +// +// REQUIRES: +// bucket.key_lower <= [bucket.start, bucket.end)->key() <= bucket.key_upper +inline bool IsFullSingleRangeBucket(const DataRangeAndBucketInfo& bucket, + int32_t num_data_threshold) { + return bucket.key_lower == bucket.key_upper && + WouldBeFullSingleRangeBucket(bucket.start, bucket.end, + num_data_threshold); +} + +// Helper function to append new bucket(s) with corresponding data slice for +// range [curr_key_lower, last_key] where last_key = (it_end - 1)->key(). +// +// Also it handles an edge case: +// If data slice [it_start, it_end) forms a "full single-range bucket" (see +// WouldBeFullSingleRangeBucket for definition), then we have to put them into a +// single range bucket [last_key, last_key] instead of [curr_key_lower, +// last_key]. Also we have to deal with range [curr_key_lower, last_key - 1]: +// - If the previous bucket exists and it is not a "full single-range bucket", +// then merge [curr_key_lower, last_key - 1] into the previous bucket, i.e. +// change the previous bucket's key_upper to (last_key - 1). Then we will end +// up having: +// - [prev_bucket.key_lower, last_key - 1] +// - [last_key, last_key] +// - Otherwise, we have to create [curr_key_lower, last_key - 1] with +// empty data. Then we will end up having (Note: prev_bucket.key_upper == +// curr_key_lower - 1): +// - [prev_bucket.key_lower, curr_key_lower - 1] +// - [curr_key_lower, last_key - 1] +// - [last_key, last_key] +// This will avoid split bucket being called too frequently. +// For example, original_key_lower = 0, original_key_upper = 50. If we have +// (num_data_threshold + 1) data with key = 20 and another data with key = 40: +// - Without this part, we will split them into [[0, 20], [21, 50]]. Then when +// adding data with key = 10 next round, we will invoke split again and split +// [0, 20] to [[0, 10], [11, 20]]. +// - With this part, we will split them into [[0, 19], [20, 20], [21, 50]], +// which will avoid splitting in the next round for key = 20. +// +// REQUIRES: it_start < it_end +void AppendNewBuckets(const std::vector<IntegerIndexData>::iterator& it_start, + const std::vector<IntegerIndexData>::iterator& it_end, + int64_t curr_key_lower, int32_t num_data_threshold, + std::vector<DataRangeAndBucketInfo>& results) { + int64_t last_key = (it_end - 1)->key(); + if (curr_key_lower < last_key && + WouldBeFullSingleRangeBucket(it_start, it_end, num_data_threshold)) { + if (!results.empty() && + !IsFullSingleRangeBucket(results.back(), num_data_threshold)) { + // Previous bucket is not full single-range, so merge it to now hold the + // range [prev_bucket.key_lower, last_key - 1]. + results.back().key_upper = last_key - 1; + } else { + // There is either no previous bucket or the previous bucket is full + // single-range. So add an empty bucket for the range [curr_key_lower, + // last_key - 1]. + results.push_back(DataRangeAndBucketInfo(it_start, it_start, + curr_key_lower, last_key - 1)); + } + curr_key_lower = last_key; + } + results.push_back( + DataRangeAndBucketInfo(it_start, it_end, curr_key_lower, last_key)); +} + +} // namespace + +std::vector<DataRangeAndBucketInfo> Split(std::vector<IntegerIndexData>& data, + int64_t original_key_lower, + int64_t original_key_upper, + int32_t num_data_threshold) { + // Early return if there is no need to split. + if (data.size() <= num_data_threshold) { + return {DataRangeAndBucketInfo(data.begin(), data.end(), original_key_lower, + original_key_upper)}; + } + + // Sort data by key. + std::sort( + data.begin(), data.end(), + [](const IntegerIndexData& lhs, const IntegerIndexData& rhs) -> bool { + return lhs.key() < rhs.key(); + }); + + std::vector<DataRangeAndBucketInfo> results; + int64_t curr_key_lower = original_key_lower; + // Sliding window [it_start, it_end) to separate data into different buckets. + auto it_start = data.begin(); + auto it_end = data.begin(); + while (it_end != data.end()) { + // Attempt to extend it_end by 1, but we have to include all data with the + // same key since they cannot be separated into different buckets. Also use + // extend_it_end to avoid modifying it_end directly. For some edge cases, + // the extension in a single round is extremely large (i.e. a lot of data + // have the same key), and we want to separate them. For example: + // - key = 0: 5 data + // - key = 1: num_data_threshold - 1 data + // In the second round, # of data in the sliding window will exceed the + // threshold. We want to separate all data with key = 0 into a single bucket + // instead of putting key = 0 and key = 1 together. Therefore, using + // extend_it_end allow us to preserve it_end of the previous round and be + // able to deal with this case. + auto extend_it_end = it_end + 1; + while (extend_it_end != data.end() && + it_end->key() == extend_it_end->key()) { + ++extend_it_end; + } + + if (std::distance(it_start, extend_it_end) > num_data_threshold && + it_start != it_end) { + // Split data between [it_start, it_end) into range [curr_key_lower, + // (it_end - 1)->key()]. + AppendNewBuckets(it_start, it_end, curr_key_lower, num_data_threshold, + results); + + // it_end at this moment won't be data.end(), so the last element of the + // new bucket can't have key == INT64_MAX. Therefore, it is safe to set + // curr_key_lower as ((it_end - 1)->key() + 1). + curr_key_lower = (it_end - 1)->key() + 1; + it_start = it_end; + } + it_end = extend_it_end; + } + + // Handle the final range [curr_key_lower, original_key_upper]. + if (curr_key_lower <= original_key_upper) { + if (it_start != it_end) { + AppendNewBuckets(it_start, it_end, curr_key_lower, num_data_threshold, + results); + + // AppendNewBuckets only handles range [curr_key_lower, (it_end - + // 1)->key()], so we have to handle range [(it_end - 1)->key() + 1, + // original_key_upper] if needed. + int64_t last_key = (it_end - 1)->key(); + if (last_key != std::numeric_limits<int64_t>::max() && + last_key + 1 <= original_key_upper) { + if (!results.empty() && + !IsFullSingleRangeBucket(results.back(), num_data_threshold)) { + results.back().key_upper = original_key_upper; + } else { + results.push_back(DataRangeAndBucketInfo( + it_start, it_start, last_key + 1, original_key_upper)); + } + } + } else { + results.push_back(DataRangeAndBucketInfo(it_start, it_end, curr_key_lower, + original_key_upper)); + } + } + + return results; +} + +} // namespace integer_index_bucket_util + +} // namespace lib +} // namespace icing diff --git a/icing/index/numeric/integer-index-bucket-util.h b/icing/index/numeric/integer-index-bucket-util.h new file mode 100644 index 0000000..863bd01 --- /dev/null +++ b/icing/index/numeric/integer-index-bucket-util.h @@ -0,0 +1,81 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_ +#define ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_ + +#include <cstdint> +#include <utility> +#include <vector> + +#include "icing/index/numeric/integer-index-data.h" + +namespace icing { +namespace lib { + +namespace integer_index_bucket_util { + +// A wrapper struct that contains information of a bucket. +// - The bucket contains data within the iterator [start, end). +// - Bucket range is [key_lower, key_upper], and all data within [start, end) +// should have keys in the bucket range. +// +// Note: the caller should make sure the lifecycle of data vector is longer than +// instances of this wrapper struct. +struct DataRangeAndBucketInfo { + std::vector<IntegerIndexData>::iterator start; + std::vector<IntegerIndexData>::iterator end; + int64_t key_lower; + int64_t key_upper; + + explicit DataRangeAndBucketInfo( + std::vector<IntegerIndexData>::iterator start_in, + std::vector<IntegerIndexData>::iterator end_in, int64_t key_lower_in, + int64_t key_upper_in) + : start(std::move(start_in)), + end(std::move(end_in)), + key_lower(key_lower_in), + key_upper(key_upper_in) {} +}; + +// Helper function to split data (that are originally in a bucket with range +// [original_key_lower, original_key_upper]) into different buckets according to +// num_data_threshold. +// - The input vector `data` will be sorted by key in ascending order (unless +// there's no need to split in which case data is returned unmodified) +// - Data with the same key will be in the same bucket even if # of them exceed +// num_data_threshold. +// - Range of all buckets will be disjoint, and the range union will be +// [original_key_lower, original_key_upper]. +// - Data slice (i.e. [start, end)) can be empty. +// +// REQUIRES: +// - original_key_lower <= original_key_upper +// - num_data_threshold > 0 +// - Keys of all data are in range [original_key_lower, original_key_upper] +// +// Returns: a vector of DataRangeAndBucketInfo that contain all bucket info +// after splitting. Also the returned vector should contain at least one +// bucket, otherwise it is considered an error. +std::vector<DataRangeAndBucketInfo> Split(std::vector<IntegerIndexData>& data, + int64_t original_key_lower, + int64_t original_key_upper, + int32_t num_data_threshold); + +} // namespace integer_index_bucket_util + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_ diff --git a/icing/index/numeric/integer-index-bucket-util_test.cc b/icing/index/numeric/integer-index-bucket-util_test.cc new file mode 100644 index 0000000..82c593e --- /dev/null +++ b/icing/index/numeric/integer-index-bucket-util_test.cc @@ -0,0 +1,1112 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/numeric/integer-index-bucket-util.h" + +#include <limits> +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/index/numeric/integer-index-data.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" + +namespace icing { +namespace lib { +namespace integer_index_bucket_util { + +namespace { + +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::IsEmpty; +using ::testing::Ne; +using ::testing::SizeIs; + +static constexpr DocumentId kDefaultDocumentId = 123; +static constexpr SectionId kDefaultSectionId = 31; + +TEST(IntegerIndexBucketUtilTest, Split_numDataNotDivisibleByThreshold) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)}; + int64_t key_lower = -10; + int64_t key_upper = 10; + int32_t num_data_threshold = 3; + ASSERT_THAT(data.size() % num_data_threshold, Ne(0)); + + // Keys = [-10, -3, -2, 0, 1, 2, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, key_lower, key_upper, num_data_threshold); + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = -10, key upper = -2, keys = [-10, -3, -2]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2))); + // Bucket 1: key lower = -1, key upper = 2, keys = [0, 1, 2]. + EXPECT_THAT(results[1].key_lower, Eq(-1)); + EXPECT_THAT(results[1].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 2: key lower = 3, key upper = 10, keys = [10]. + EXPECT_THAT(results[2].key_lower, Eq(3)); + EXPECT_THAT(results[2].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, Split_numDataDivisibleByThreshold) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)}; + int64_t key_lower = -10; + int64_t key_upper = 10; + int32_t num_data_threshold = 3; + ASSERT_THAT(data.size() % num_data_threshold, Eq(0)); + + // Keys = [-10, -3, -2, 0, 2, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, key_lower, key_upper, num_data_threshold); + ASSERT_THAT(results, SizeIs(2)); + // Bucket 0: key lower = -10, key upper = -2, keys = [-10, -3, -2]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2))); + // Bucket 1: key lower = -1, key upper = 2, keys = [0, 2, 10]. + EXPECT_THAT(results[1].key_lower, Eq(-1)); + EXPECT_THAT(results[1].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, Split_shouldIncludeOriginalKeyRange) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)}; + int64_t key_lower = -1000; + int64_t key_upper = 1000; + int32_t num_data_threshold = 3; + + // Keys = [-10, -3, -2, 0, 1, 2, 10]. + // Split should include the original key_lower and key_upper even if there is + // no key at boundary. + std::vector<DataRangeAndBucketInfo> results = + Split(data, key_lower, key_upper, num_data_threshold); + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = -1000, key upper = -2, keys = [-10, -3, -2]. + EXPECT_THAT(results[0].key_lower, Eq(-1000)); + EXPECT_THAT(results[0].key_upper, Eq(-2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2))); + // Bucket 1: key lower = -1, key upper = 2, keys = [0, 1, 2]. + EXPECT_THAT(results[1].key_lower, Eq(-1)); + EXPECT_THAT(results[1].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 2: key lower = 3, key upper = 1000, keys = [10]. + EXPECT_THAT(results[2].key_lower, Eq(3)); + EXPECT_THAT(results[2].key_upper, Eq(1000)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, Split_singleBucketWithoutSplitting) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)}; + int64_t key_lower = -1000; + int64_t key_upper = 1000; + int32_t num_data_threshold = 100; + + // Keys = [-10, -3, -2, 0, 1, 2, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, key_lower, key_upper, num_data_threshold); + ASSERT_THAT(results, SizeIs(1)); + // Bucket 0: key lower = -1000, key upper = 1000, keys = [-10, -3, -2, 0, 1, + // 2, 10]. Since # of data <= threshold, data vector won't be sorted and thus + // [start, end) will have data with the original order. + EXPECT_THAT(results[0].key_lower, Eq(-1000)); + EXPECT_THAT(results[0].key_upper, Eq(1000)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2))); +} + +TEST(IntegerIndexBucketUtilTest, Split_emptyData) { + std::vector<IntegerIndexData> empty_data; + std::vector<DataRangeAndBucketInfo> results = + Split(empty_data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + ASSERT_THAT(results, SizeIs(1)); + // Bucket 0: key lower = -10, key upper = 10, keys = []. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(10)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end), + IsEmpty()); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_firstBucket_keyEqualsKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-10, -10, -10, -10, -10, 0, 3, 5, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = -10 exceeds the threshold, they should + // still be in the same bucket. + // - They should be separated from key = 0, 3, .... + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = -10, key upper = -10, keys = [-10, -10, -10, -10, + // -10]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre( + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10))); + // Bucket 1: key lower = -9, key upper = 5, keys = [0, 3, 5]. + EXPECT_THAT(results[1].key_lower, Eq(-9)); + EXPECT_THAT(results[1].key_upper, Eq(5)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5))); + // Bucket 2: key lower = 6, key upper = 10, keys = [10]. + EXPECT_THAT(results[2].key_lower, Eq(6)); + EXPECT_THAT(results[2].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_firstBucket_keyGreaterThanKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-7, -7, -7, -7, -7, 0, 3, 5, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = -7 exceeds the threshold, they should + // still be in the same bucket. + // - They should be separated from key = 0, 3, .... + // - They should be in a single range bucket [-7, -7], and another bucket + // [-10, -8] with empty data should be created before it. + ASSERT_THAT(results, SizeIs(4)); + // Bucket 0: key lower = -10, key upper = -8, keys = []. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-8)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end), + IsEmpty()); + // Bucket 1: key lower = -7, key upper = -7, keys = [-7, -7, -7, -7, -7]. + EXPECT_THAT(results[1].key_lower, Eq(-7)); + EXPECT_THAT(results[1].key_upper, Eq(-7)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7))); + // Bucket 2: key lower = -6, key upper = 5, keys = [0, 3, 5]. + EXPECT_THAT(results[2].key_lower, Eq(-6)); + EXPECT_THAT(results[2].key_upper, Eq(5)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5))); + // Bucket 3: key lower = 6, key upper = 10, keys = [10]. + EXPECT_THAT(results[3].key_lower, Eq(6)); + EXPECT_THAT(results[3].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[3].start, results[3].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_midBucket_keyEqualsKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-10, -5, -4, -4, -4, -4, -4, 5, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = -4 exceeds the threshold, they should + // still be in the same bucket. + // - They should be separated from key = -10, -5, 5, 10. + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = -10, key upper = -5, keys = [-10, -5]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-5)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5))); + // Bucket 1: key lower = -4, key upper = -4, keys = [-4, -4, -4, -4, -4]. + EXPECT_THAT(results[1].key_lower, Eq(-4)); + EXPECT_THAT(results[1].key_upper, Eq(-4)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4))); + // Bucket 2: key lower = -3, key upper = 10, keys = [5, 10]. + EXPECT_THAT(results[2].key_lower, Eq(-3)); + EXPECT_THAT(results[2].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_midBucket_keyGreaterThanKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-10, -5, -1, -1, -1, -1, -1, 5, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = -1 exceeds the threshold, they should + // still be in the same bucket. + // - They should be separated from key = -10, -5, 5, 10. + // - They should be in a single range bucket [-1, -1], and range [-4, -2] + // should be merged into the previous bucket. + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = -10, key upper = -2, keys = [-10, -5]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5))); + // Bucket 1: key lower = -1, key upper = -1, keys = [-1, -1, -1, -1, -1]. + EXPECT_THAT(results[1].key_lower, Eq(-1)); + EXPECT_THAT(results[1].key_upper, Eq(-1)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1))); + // Bucket 2: key lower = 0, key upper = 10, keys = [5, 10]. + EXPECT_THAT(results[2].key_lower, Eq(0)); + EXPECT_THAT(results[2].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_lastBucket_keyEqualsKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3)}; + + // Keys = [-10, -3, 0, 2, 3, 3, 3, 3, 3]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = 3 exceeds the threshold, they should + // still be in the same bucket. + // - They should be separated from key = -10, -3, 0, 2. + // - They should be in a single range bucket [3, 3], and another bucket + // [4, 10] with empty data should be created after it. + ASSERT_THAT(results, SizeIs(4)); + // Bucket 0: key lower = -10, key upper = 0, keys = [-10, -3, 0]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(0)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0))); + // Bucket 1: key lower = 1, key upper = 2, keys = [2]. + EXPECT_THAT(results[1].key_lower, Eq(1)); + EXPECT_THAT(results[1].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 2: key lower = 3, key upper = 10, keys = [3, 3, 3, 3, 3]. + EXPECT_THAT(results[2].key_lower, Eq(3)); + EXPECT_THAT(results[2].key_upper, Eq(3)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3))); + // Bucket 3: key lower = 4, key upper = 10, keys = []. + EXPECT_THAT(results[3].key_lower, Eq(4)); + EXPECT_THAT(results[3].key_upper, Eq(10)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[3].start, results[3].end), + IsEmpty()); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_lastBucket_keyWithinKeyLowerAndUpper) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6)}; + + // Keys = [-10, -3, 0, 2, 6, 6, 6, 6, 6]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = 6 exceeds the threshold, they should + // still be in the same bucket. + // - They should be separated from key = -10, -3, 0, 2. + // - They should be in a single range bucket [6, 6]. Range [3, 5] should be + // merged into the previous bucket. and another bucket [7, 10] with empty + // data should be created after it. + ASSERT_THAT(results, SizeIs(4)); + // Bucket 0: key lower = -10, key upper = 0, keys = [-10, -3, 0]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(0)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0))); + // Bucket 1: key lower = 1, key upper = 5, keys = [2]. + EXPECT_THAT(results[1].key_lower, Eq(1)); + EXPECT_THAT(results[1].key_upper, Eq(5)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 2: key lower = 6, key upper = 6, keys = [6, 6, 6, 6, 6]. + EXPECT_THAT(results[2].key_lower, Eq(6)); + EXPECT_THAT(results[2].key_upper, Eq(6)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6))); + // Bucket 3: key lower = 7, key upper = 10, keys = []. + EXPECT_THAT(results[3].key_lower, Eq(7)); + EXPECT_THAT(results[3].key_upper, Eq(10)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[3].start, results[3].end), + IsEmpty()); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_lastBucket_keyEqualsKeyUpper) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-10, -3, 0, 2, 10, 10, 10, 10, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = 10 exceeds the threshold, they should + // still be in the same bucket. + // - They should be separated from key = -10, -3, 0, 2. + // - They should be in a single range bucket [10, 10], and range [3, 9] should + // be merged into the previous bucket. + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = -10, key upper = 0, keys = [-10, -3, 0]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(0)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0))); + // Bucket 1: key lower = 1, key upper = 9, keys = [2]. + EXPECT_THAT(results[1].key_lower, Eq(1)); + EXPECT_THAT(results[1].key_upper, Eq(9)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 2: key lower = 10, key upper = 10, keys = [10, 10, 10, 10, 10]. + EXPECT_THAT(results[2].key_lower, Eq(10)); + EXPECT_THAT(results[2].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_shouldNotMergeIntoPreviousBucket) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-10, -2, -2, -2, -2, -2, 5, 5, 5, 5, 5, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Data with key = -2 and 5 should be put into a single bucket respectively. + // - When dealing with key = 5, range [-1, 4] should not be merged into the + // previous bucket [-2, -2] because [-2, -2] also contains single key data + // exceeding the threshold. Instead, we should create bucket [-1, 4] with + // empty data. + ASSERT_THAT(results, SizeIs(5)); + // Bucket 0: key lower = -10, key upper = -3, keys = [-10]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-3)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, + kDefaultDocumentId, -10))); + // Bucket 1: key lower = -2, key upper = -2, keys = [-2, -2, -2, -2, -2]. + EXPECT_THAT(results[1].key_lower, Eq(-2)); + EXPECT_THAT(results[1].key_upper, Eq(-2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2))); + // Bucket 2: key lower = -1, key upper = 4, keys = []. + EXPECT_THAT(results[2].key_lower, Eq(-1)); + EXPECT_THAT(results[2].key_upper, Eq(4)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[2].start, results[2].end), + IsEmpty()); + // Bucket 3: key lower = 5, key upper = 5, keys = [5, 5, 5, 5, 5]. + EXPECT_THAT(results[3].key_lower, Eq(5)); + EXPECT_THAT(results[3].key_upper, Eq(5)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[3].start, results[3].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5))); + // Bucket 4: key lower = 6, key upper = 10, keys = [10]. + EXPECT_THAT(results[4].key_lower, Eq(6)); + EXPECT_THAT(results[4].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[4].start, results[4].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_shouldMergeIntoPreviousBucket) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -8), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-10, -8, -3, -2, -2, -2, 5, 5, 5, 5, 5, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Data with key = 5 should be put into a single bucket. + // - When dealing with key = 5, range [-1, 4] should be merged into the + // previous bucket [-2, -2] because # of data in [-2, -2] doesn't exceed the + // threshold. + ASSERT_THAT(results, SizeIs(4)); + // Bucket 0: key lower = -10, key upper = -3, keys = [-10, -8, -3]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-3)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -8), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3))); + // Bucket 1: key lower = -2, key upper = 4, keys = [-2, -2, -2]. + EXPECT_THAT(results[1].key_lower, Eq(-2)); + EXPECT_THAT(results[1].key_upper, Eq(4)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2))); + // Bucket 2: key lower = 5, key upper = 5, keys = [5, 5, 5, 5, 5]. + EXPECT_THAT(results[2].key_lower, Eq(5)); + EXPECT_THAT(results[2].key_upper, Eq(5)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5))); + // Bucket 3: key lower = 6, key upper = 10, keys = [10]. + EXPECT_THAT(results[3].key_lower, Eq(6)); + EXPECT_THAT(results[3].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[3].start, results[3].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_singleBucket_keyEqualsKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10)}; + + // Keys = [-10, -10, -10, -10, -10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = -10 exceeds the threshold, they should + // still be in the same bucket. + // - They should be in a single range bucket [-10, -10], and another bucket + // [-9, 10] with empty data should be created after it. + ASSERT_THAT(results, SizeIs(2)); + // Bucket 0: key lower = -10, key upper = -10, keys = [-10, -10, -10, -10, + // -10]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre( + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10))); + // Bucket 1: key lower = -9, key upper = 10, keys = []. + EXPECT_THAT(results[1].key_lower, Eq(-9)); + EXPECT_THAT(results[1].key_upper, Eq(10)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[1].start, results[1].end), + IsEmpty()); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_singleBucket_keyWithinKeyLowerAndUpper) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0)}; + + // Keys = [0, 0, 0, 0, 0]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = 0 exceeds the threshold, they should + // still be in the same bucket. + // - They should be in a single range bucket [0, 0]. Another bucket [-10, -1] + // with empty data should be created before it, and another bucket [1, 10] + // with empty data should be created after it. + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = -10, key upper = -1, keys = []. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-1)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end), + IsEmpty()); + // Bucket 1: key lower = 0, key upper = 0, keys = [0, 0, 0, 0, 0]. + EXPECT_THAT(results[1].key_lower, Eq(0)); + EXPECT_THAT(results[1].key_upper, Eq(0)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0))); + // Bucket 2: key lower = 1, key upper = 10, keys = []. + EXPECT_THAT(results[2].key_lower, Eq(1)); + EXPECT_THAT(results[2].key_upper, Eq(10)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[2].start, results[2].end), + IsEmpty()); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_singleBucket_keyEqualsKeyUpper) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [10, 10, 10, 10, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = 10 exceeds the threshold, they should + // still be in the same bucket. + // - They should be in a single range bucket [10, 10], and another bucket + // [-10, 9] with empty data should be created before it. + ASSERT_THAT(results, SizeIs(2)); + // Bucket 0: key lower = -10, key upper = 9, keys = []. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(9)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end), + IsEmpty()); + // Bucket 1: key lower = -10, key upper = 10, keys = [10, 10, 10, 10, 10]. + EXPECT_THAT(results[1].key_lower, Eq(10)); + EXPECT_THAT(results[1].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_adjacentKeysTotalNumDataExceedThreshold) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-10, -10, -1, -1, 2, 2, 10, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // Even though # of data with the same key is within the threshold, since + // total # of data of adjacent keys exceed the threshold, they should be + // separated into different buckets. + ASSERT_THAT(results, SizeIs(4)); + // Bucket 0: key lower = -10, key upper = -10, keys = [-10, -10]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre( + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10))); + // Bucket 1: key lower = -9, key upper = -1, keys = [-1, -1]. + EXPECT_THAT(results[1].key_lower, Eq(-9)); + EXPECT_THAT(results[1].key_upper, Eq(-1)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1))); + // Bucket 2: key lower = 0, key upper = 2, keys = [2, 2]. + EXPECT_THAT(results[2].key_lower, Eq(0)); + EXPECT_THAT(results[2].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 3: key lower = 3, key upper = 10, keys = [10, 10]. + EXPECT_THAT(results[3].key_lower, Eq(3)); + EXPECT_THAT(results[3].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[3].start, results[3].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_keyLowerEqualsIntMin_smallestKeyGreaterThanKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min() + 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [INT64_MIN + 1, -10, -1, 2, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(), + /*original_key_upper=*/std::numeric_limits<int64_t>::max(), + /*num_data_threshold=*/3); + ASSERT_THAT(results, SizeIs(2)); + // Bucket 0: key lower = INT64_MIN, key upper = -1, keys = [INT64_MIN + 1, + // -10, -1]. + EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(results[0].key_upper, Eq(-1)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min() + 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1))); + // Bucket 1: key lower = 0, key upper = INT64_MAX, keys = [2, 10]. + EXPECT_THAT(results[1].key_lower, Eq(0)); + EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_keyLowerEqualsIntMin_smallestKeyEqualsKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [INT64_MIN, -10, -1, 2, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(), + /*original_key_upper=*/std::numeric_limits<int64_t>::max(), + /*num_data_threshold=*/3); + ASSERT_THAT(results, SizeIs(2)); + // Bucket 0: key lower = INT64_MIN, key upper = -1, keys = [INT64_MIN, -10, + // -1]. + EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(results[0].key_upper, Eq(-1)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1))); + // Bucket 1: key lower = 0, key upper = INT64_MAX, keys = [2, 10]. + EXPECT_THAT(results[1].key_lower, Eq(0)); + EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_keyLowerEqualsIntMin_keyIntMinExceedingThreshold) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN, -10, -1, 2, + // 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(), + /*original_key_upper=*/std::numeric_limits<int64_t>::max(), + /*num_data_threshold=*/3); + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = INT64_MIN, key upper = INT64_MIN, keys = [INT64_MIN, + // INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN]. + EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(results[0].key_upper, Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()))); + // Bucket 1: key lower = INT64_MIN + 1, key upper = 2, keys = [-10, -1, 2]. + EXPECT_THAT(results[1].key_lower, + Eq(std::numeric_limits<int64_t>::min() + 1)); + EXPECT_THAT(results[1].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 2: key lower = 3, key upper = INT64_MAX, keys = [10]. + EXPECT_THAT(results[2].key_lower, Eq(3)); + EXPECT_THAT(results[2].key_upper, Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_keyUpperEqualsIntMax_largestKeySmallerThanKeyUpper) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max() - 1), + }; + + // Keys = [-10, -1, 2, 10, INT64_MAX - 1]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(), + /*original_key_upper=*/std::numeric_limits<int64_t>::max(), + /*num_data_threshold=*/3); + ASSERT_THAT(results, SizeIs(2)); + // Bucket 0: key lower = INT64_MIN, key upper = 2, keys = [-10, -1, 2]. + EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(results[0].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 1: key lower = 3, key upper = INT64_MAX, keys = [10, INT64_MAX - 1]. + EXPECT_THAT(results[1].key_lower, Eq(3)); + EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max() - 1))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_keyUpperEqualsIntMax_largestKeyEqualsKeyUpper) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + }; + + // Keys = [-10, -1, 2, 10, INT64_MAX]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(), + /*original_key_upper=*/std::numeric_limits<int64_t>::max(), + /*num_data_threshold=*/3); + ASSERT_THAT(results, SizeIs(2)); + // Bucket 0: key lower = INT64_MIN, key upper = 2, keys = [-10, -1, 2]. + EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(results[0].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 1: key lower = 3, key upper = INT64_MAX, keys = [10, INT64_MAX]. + EXPECT_THAT(results[1].key_lower, Eq(3)); + EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_keyUpperEqualsIntMax_keyIntMaxExceedingThreshold) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max())}; + + // Keys = [-10, -1, 2, 10, INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX, + // INT64_MAX]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(), + /*original_key_upper=*/std::numeric_limits<int64_t>::max(), + /*num_data_threshold=*/3); + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = INT64_MIN, key upper = 2, keys = [-10, -1, 2]. + EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(results[0].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 1: key lower = 3, key upper = INT_MAX - 1, keys = [10]. + EXPECT_THAT(results[1].key_lower, Eq(3)); + EXPECT_THAT(results[1].key_upper, + Eq(std::numeric_limits<int64_t>::max() - 1)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); + // Bucket 2: key lower = INT64_MAX, key upper = INT64_MAX, keys = [INT64_MAX, + // INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX]. + EXPECT_THAT(results[2].key_lower, Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT(results[2].key_upper, Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()))); +} + +} // namespace + +} // namespace integer_index_bucket_util +} // namespace lib +} // namespace icing diff --git a/icing/index/numeric/integer-index-storage.cc b/icing/index/numeric/integer-index-storage.cc index 22ef8bd..db1983c 100644 --- a/icing/index/numeric/integer-index-storage.cc +++ b/icing/index/numeric/integer-index-storage.cc @@ -17,6 +17,7 @@ #include <algorithm> #include <cstdint> #include <functional> +#include <iterator> #include <limits> #include <memory> #include <queue> @@ -37,6 +38,7 @@ #include "icing/index/hit/doc-hit-info.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/numeric/doc-hit-info-iterator-numeric.h" +#include "icing/index/numeric/integer-index-bucket-util.h" #include "icing/index/numeric/integer-index-data.h" #include "icing/index/numeric/numeric-index.h" #include "icing/index/numeric/posting-list-integer-index-accessor.h" @@ -50,6 +52,41 @@ namespace lib { namespace { +// Helper function to flush data between [it_start, it_end) into posting list(s) +// and return posting list id. +// Note: it will sort data between [it_start, it_end) by basic hit value, so the +// caller should be aware that the data order will be changed after calling this +// function. +libtextclassifier3::StatusOr<PostingListIdentifier> FlushDataIntoPostingLists( + FlashIndexStorage* flash_index_storage, + PostingListIntegerIndexSerializer* posting_list_serializer, + const std::vector<IntegerIndexData>::iterator& it_start, + const std::vector<IntegerIndexData>::iterator& it_end) { + if (it_start == it_end) { + return PostingListIdentifier::kInvalid; + } + + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<PostingListIntegerIndexAccessor> new_pl_accessor, + PostingListIntegerIndexAccessor::Create(flash_index_storage, + posting_list_serializer)); + + std::sort(it_start, it_end); + for (auto it = it_end - 1; it >= it_start; --it) { + ICING_RETURN_IF_ERROR(new_pl_accessor->PrependData(*it)); + } + + PostingListAccessor::FinalizeResult result = + std::move(*new_pl_accessor).Finalize(); + if (!result.status.ok()) { + return result.status; + } + if (!result.id.is_valid()) { + return absl_ports::InternalError("Fail to flush data into posting list(s)"); + } + return result.id; +} + // The following 4 methods are helper functions to get the correct file path of // metadata/sorted_buckets/unsorted_buckets/flash_index_storage, according to // the given working directory. @@ -510,9 +547,12 @@ libtextclassifier3::Status IntegerIndexStorage::AddKeys( mutable_new_arr.SetArray(/*idx=*/0, new_buckets.data(), new_buckets.size()); } - // Step 4: merge the unsorted bucket array into the sorted bucket array if the - // length of the unsorted bucket array exceeds the threshold. - // TODO(b/259743562): [Optimization 1] implement merge + // Step 4: sort and merge the unsorted bucket array into the sorted bucket + // array if the length of the unsorted bucket array exceeds the + // threshold. + if (unsorted_buckets_->num_elements() > kUnsortedBucketsLengthThreshold) { + ICING_RETURN_IF_ERROR(SortBuckets()); + } info().num_data += new_keys.size(); @@ -679,29 +719,23 @@ IntegerIndexStorage::InitializeNewFiles( absl_ports::StrCat("Failed to create directory: ", working_path)); } - // TODO(b/259743562): [Optimization 1] decide max # buckets, unsorted buckets - // threshold // Initialize sorted_buckets int32_t pre_mapping_mmap_size = sizeof(Bucket) * (1 << 10); - int32_t max_file_size = - pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize; ICING_ASSIGN_OR_RETURN( std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets, FileBackedVector<Bucket>::Create( filesystem, GetSortedBucketsFilePath(working_path), - MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size, - pre_mapping_mmap_size)); + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + FileBackedVector<Bucket>::kMaxFileSize, pre_mapping_mmap_size)); // Initialize unsorted_buckets - pre_mapping_mmap_size = sizeof(Bucket) * 100; - max_file_size = - pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize; + pre_mapping_mmap_size = sizeof(Bucket) * kUnsortedBucketsLengthThreshold; ICING_ASSIGN_OR_RETURN( std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets, FileBackedVector<Bucket>::Create( filesystem, GetUnsortedBucketsFilePath(working_path), - MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size, - pre_mapping_mmap_size)); + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + FileBackedVector<Bucket>::kMaxFileSize, pre_mapping_mmap_size)); // Initialize flash_index_storage ICING_ASSIGN_OR_RETURN( @@ -785,29 +819,23 @@ IntegerIndexStorage::InitializeExistingFiles( /*pre_mapping_file_offset=*/0, /*pre_mapping_mmap_size=*/kMetadataFileSize)); - // TODO(b/259743562): [Optimization 1] decide max # buckets, unsorted buckets - // threshold // Initialize sorted_buckets int32_t pre_mapping_mmap_size = sizeof(Bucket) * (1 << 10); - int32_t max_file_size = - pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize; ICING_ASSIGN_OR_RETURN( std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets, FileBackedVector<Bucket>::Create( filesystem, GetSortedBucketsFilePath(working_path), - MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size, - pre_mapping_mmap_size)); + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + FileBackedVector<Bucket>::kMaxFileSize, pre_mapping_mmap_size)); // Initialize unsorted_buckets - pre_mapping_mmap_size = sizeof(Bucket) * 100; - max_file_size = - pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize; + pre_mapping_mmap_size = sizeof(Bucket) * kUnsortedBucketsLengthThreshold; ICING_ASSIGN_OR_RETURN( std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets, FileBackedVector<Bucket>::Create( filesystem, GetUnsortedBucketsFilePath(working_path), - MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size, - pre_mapping_mmap_size)); + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + FileBackedVector<Bucket>::kMaxFileSize, pre_mapping_mmap_size)); // Initialize flash_index_storage ICING_ASSIGN_OR_RETURN( @@ -845,28 +873,13 @@ IntegerIndexStorage::FlushDataIntoNewSortedBucket( } ICING_ASSIGN_OR_RETURN( - std::unique_ptr<PostingListIntegerIndexAccessor> new_pl_accessor, - PostingListIntegerIndexAccessor::Create( - storage->flash_index_storage_.get(), - storage->posting_list_serializer_)); - - std::sort(data.begin(), data.end()); - for (auto itr = data.rbegin(); itr != data.rend(); ++itr) { - ICING_RETURN_IF_ERROR(new_pl_accessor->PrependData(*itr)); - } - - PostingListAccessor::FinalizeResult result = - std::move(*new_pl_accessor).Finalize(); - if (!result.status.ok()) { - return result.status; - } - if (!result.id.is_valid()) { - return absl_ports::InternalError("Fail to flush data into posting list"); - } + PostingListIdentifier pl_id, + FlushDataIntoPostingLists(storage->flash_index_storage_.get(), + storage->posting_list_serializer_, data.begin(), + data.end())); storage->info().num_data += data.size(); - return storage->sorted_buckets_->Append( - Bucket(key_lower, key_upper, result.id)); + return storage->sorted_buckets_->Append(Bucket(key_lower, key_upper, pl_id)); } libtextclassifier3::Status IntegerIndexStorage::PersistStoragesToDisk() { @@ -921,21 +934,80 @@ IntegerIndexStorage::AddKeysIntoBucketAndSplitIfNecessary( } for (auto it = it_start; it != it_end; ++it) { - // TODO(b/259743562): [Optimization 1] implement split bucket if pl is full - // and the bucket is splittable + if (mutable_bucket.Get().key_lower() < mutable_bucket.Get().key_upper() && + pl_accessor->WantsSplit()) { + // If the bucket needs split (max size and full) and is splittable, then + // we perform bucket splitting. + + // 1. Finalize the current posting list accessor. + PostingListAccessor::FinalizeResult result = + std::move(*pl_accessor).Finalize(); + if (!result.status.ok()) { + return result.status; + } + + // 2. Create another posting list accessor instance. Read all data and + // free all posting lists. + ICING_ASSIGN_OR_RETURN( + pl_accessor, + PostingListIntegerIndexAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_serializer_, result.id)); + ICING_ASSIGN_OR_RETURN(std::vector<IntegerIndexData> all_data, + pl_accessor->GetAllDataAndFree()); + + // 3. Append all remaining new data. + all_data.reserve(all_data.size() + std::distance(it, it_end)); + for (; it != it_end; ++it) { + all_data.push_back(IntegerIndexData(section_id, document_id, *it)); + } + + // 4. Run bucket splitting algorithm to decide new buckets and dispatch + // data. + std::vector<integer_index_bucket_util::DataRangeAndBucketInfo> + new_bucket_infos = integer_index_bucket_util::Split( + all_data, mutable_bucket.Get().key_lower(), + mutable_bucket.Get().key_upper(), + kNumDataThresholdForBucketSplit); + if (new_bucket_infos.empty()) { + ICING_LOG(WARNING) + << "No buckets after splitting. This should not happen."; + return absl_ports::InternalError("Split error"); + } + + // 5. Flush data. + std::vector<Bucket> new_buckets; + for (int i = 0; i < new_bucket_infos.size(); ++i) { + ICING_ASSIGN_OR_RETURN( + PostingListIdentifier pl_id, + FlushDataIntoPostingLists( + flash_index_storage_.get(), posting_list_serializer_, + new_bucket_infos[i].start, new_bucket_infos[i].end)); + if (i == 0) { + // Reuse mutable_bucket + mutable_bucket.Get().set_key_lower(new_bucket_infos[i].key_lower); + mutable_bucket.Get().set_key_upper(new_bucket_infos[i].key_upper); + mutable_bucket.Get().set_posting_list_identifier(pl_id); + } else { + new_buckets.push_back(Bucket(new_bucket_infos[i].key_lower, + new_bucket_infos[i].key_upper, pl_id)); + } + } + + return new_buckets; + } + ICING_RETURN_IF_ERROR(pl_accessor->PrependData( IntegerIndexData(section_id, document_id, *it))); } - // TODO(b/259743562): [Optimization 1] implement split and return new buckets. - // We will change the original bucket (mutable_bucket) - // in-place to one of the new buckets, and the rest will - // be returned and added into unsorted buckets in AddKeys. PostingListAccessor::FinalizeResult result = std::move(*pl_accessor).Finalize(); if (!result.status.ok()) { return result.status; } + if (!result.id.is_valid()) { + return absl_ports::InternalError("Fail to flush data into posting list(s)"); + } mutable_bucket.Get().set_posting_list_identifier(result.id); diff --git a/icing/index/numeric/integer-index-storage.h b/icing/index/numeric/integer-index-storage.h index be0add9..ddd9231 100644 --- a/icing/index/numeric/integer-index-storage.h +++ b/icing/index/numeric/integer-index-storage.h @@ -30,6 +30,7 @@ #include "icing/file/posting_list/flash-index-storage.h" #include "icing/file/posting_list/posting-list-identifier.h" #include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/numeric/integer-index-data.h" #include "icing/index/numeric/posting-list-integer-index-serializer.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" @@ -117,6 +118,10 @@ class IntegerIndexStorage : public PersistentStorage { int64_t key_upper() const { return key_upper_; } + void set_key_lower(int64_t key_lower) { key_lower_ = key_lower; } + + void set_key_upper(int64_t key_upper) { key_upper_ = key_upper; } + PostingListIdentifier posting_list_identifier() const { return posting_list_identifier_; } @@ -176,14 +181,29 @@ class IntegerIndexStorage : public PersistentStorage { WorkingPathType::kDirectory; static constexpr std::string_view kFilePrefix = "integer_index_storage"; - // # of data threshold for bucket merging. If total # data of adjacent buckets - // exceed this value, then flush the accumulated data. Otherwise merge - // buckets and their data. + // # of data threshold for bucket merging during optimization (TransferIndex). + // If total # data of adjacent buckets exceed this value, then flush the + // accumulated data. Otherwise merge buckets and their data. // // Calculated by: 0.7 * (kMaxPostingListSize / sizeof(IntegerIndexData)), // where kMaxPostingListSize = (kPageSize - sizeof(IndexBlock::BlockHeader)). static constexpr int32_t kNumDataThresholdForBucketMerge = 240; + // # of data threshold for bucket splitting during indexing (AddKeys). + // When the posting list of a bucket is full, we will try to split data into + // multiple buckets according to their keys. In order to achieve good + // (amortized) time complexity, we want # of data in new buckets to be at most + // half # of elements in a full posting list. + // + // Calculated by: 0.5 * (kMaxPostingListSize / sizeof(IntegerIndexData)), + // where kMaxPostingListSize = (kPageSize - sizeof(IndexBlock::BlockHeader)). + static constexpr int32_t kNumDataThresholdForBucketSplit = 170; + + // Length threshold to sort and merge unsorted buckets into sorted buckets. If + // the length of unsorted_buckets exceed the threshold, then call + // SortBuckets(). + static constexpr int32_t kUnsortedBucketsLengthThreshold = 50; + // Creates a new IntegerIndexStorage instance to index integers (for a single // property). If any of the underlying file is missing, then delete the whole // working_path and (re)initialize with new ones. Otherwise initialize and @@ -370,7 +390,6 @@ class IntegerIndexStorage : public PersistentStorage { // into several new buckets with new ranges, and split the data (according // to their keys and the range of new buckets) of the original posting // list into several new posting lists. - // TODO(b/259743562): [Optimization 1] implement split // - Otherwise, just simply add a new key into it, and PostingListAccessor // mechanism will automatically create a new max size posting list and // chain them. diff --git a/icing/index/numeric/integer-index-storage_benchmark.cc b/icing/index/numeric/integer-index-storage_benchmark.cc index d150f2d..54b19c3 100644 --- a/icing/index/numeric/integer-index-storage_benchmark.cc +++ b/icing/index/numeric/integer-index-storage_benchmark.cc @@ -57,6 +57,7 @@ namespace lib { namespace { using ::testing::Eq; +using ::testing::IsEmpty; using ::testing::SizeIs; static constexpr SectionId kDefaultSectionId = 12; @@ -237,18 +238,24 @@ void BM_ExactQuery(benchmark::State& state) { std::unique_ptr<DocHitInfoIterator> iterator, storage->GetIterator(/*query_key_lower=*/exact_query_key, /*query_key_upper=*/exact_query_key)); - int cnt = 0; + std::vector<DocHitInfo> data; while (iterator->Advance().ok()) { - benchmark::DoNotOptimize(iterator->doc_hit_info()); - ++cnt; + data.push_back(iterator->doc_hit_info()); } + state.PauseTiming(); const auto it = keys.find(exact_query_key); if (it == keys.end()) { - ASSERT_THAT(cnt, Eq(0)); + ASSERT_THAT(data, IsEmpty()); } else { - ASSERT_THAT(it->second, SizeIs(cnt)); + ASSERT_THAT(data, SizeIs(it->second.size())); + std::reverse(data.begin(), data.end()); + for (int i = 0; i < data.size(); ++i) { + ASSERT_THAT(data[i].document_id(), Eq(it->second[i])); + ASSERT_THAT(data[i].hit_section_ids_mask(), Eq(1 << kDefaultSectionId)); + } } + state.ResumeTiming(); } } BENCHMARK(BM_ExactQuery) diff --git a/icing/index/numeric/integer-index-storage_test.cc b/icing/index/numeric/integer-index-storage_test.cc index 9d6864c..ed7d5db 100644 --- a/icing/index/numeric/integer-index-storage_test.cc +++ b/icing/index/numeric/integer-index-storage_test.cc @@ -14,6 +14,8 @@ #include "icing/index/numeric/integer-index-storage.h" +#include <unistd.h> + #include <cstdint> #include <limits> #include <memory> @@ -26,7 +28,10 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/file/file-backed-vector.h" +#include "icing/file/filesystem.h" #include "icing/file/persistent-storage.h" +#include "icing/file/posting_list/flash-index-storage.h" +#include "icing/file/posting_list/index-block.h" #include "icing/file/posting_list/posting-list-identifier.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -42,14 +47,17 @@ namespace lib { namespace { +using ::testing::Contains; using ::testing::ElementsAre; using ::testing::ElementsAreArray; using ::testing::Eq; +using ::testing::Ge; using ::testing::Gt; using ::testing::HasSubstr; using ::testing::IsEmpty; using ::testing::IsFalse; using ::testing::IsTrue; +using ::testing::Key; using ::testing::Le; using ::testing::Ne; using ::testing::Not; @@ -1186,6 +1194,150 @@ TEST_F(IntegerIndexStorageTest, EqualsDocHitInfo(kDefaultDocumentId, expected_sections)))); } +TEST_F(IntegerIndexStorageTest, SplitBuckets) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(filesystem_, working_path_, Options(), + serializer_.get())); + + uint32_t block_size = FlashIndexStorage::SelectBlockSize(); + uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( + block_size, serializer_->GetDataTypeBytes()); + uint32_t max_num_data_before_split = + max_posting_list_bytes / serializer_->GetDataTypeBytes(); + + // Add max_num_data_before_split + 1 keys to invoke bucket splitting. + // Keys: max_num_data_before_split to 0 + // Document ids: 0 to max_num_data_before_split + std::unordered_map<int64_t, DocumentId> data; + int64_t key = max_num_data_before_split; + DocumentId document_id = 0; + for (int i = 0; i < max_num_data_before_split + 1; ++i) { + data[key] = document_id; + ICING_ASSERT_OK( + storage->AddKeys(document_id, kDefaultSectionId, /*new_keys=*/{key})); + ++document_id; + --key; + } + ICING_ASSERT_OK(storage->PersistToDisk()); + + // Manually check sorted and unsorted buckets. + { + // Check sorted buckets. + const std::string sorted_buckets_file_path = absl_ports::StrCat( + working_path_, "/", IntegerIndexStorage::kFilePrefix, ".s"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem_, sorted_buckets_file_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + + EXPECT_THAT(sorted_buckets->num_elements(), Eq(1)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bucket1, + sorted_buckets->Get(/*idx=*/0)); + EXPECT_THAT(bucket1->key_lower(), Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(bucket1->key_upper(), Ne(std::numeric_limits<int64_t>::max())); + + int64_t sorted_bucket_key_upper = bucket1->key_upper(); + + // Check unsorted buckets. + const std::string unsorted_buckets_file_path = absl_ports::StrCat( + working_path_, "/", IntegerIndexStorage::kFilePrefix, ".u"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem_, unsorted_buckets_file_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + + EXPECT_THAT(unsorted_buckets->num_elements(), Ge(1)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bucket2, + unsorted_buckets->Get(/*idx=*/0)); + EXPECT_THAT(bucket2->key_lower(), Eq(sorted_bucket_key_upper + 1)); + } + + // Ensure that search works normally. + std::vector<SectionId> expected_sections = {kDefaultSectionId}; + for (int64_t key = max_num_data_before_split; key >= 0; key--) { + ASSERT_THAT(data, Contains(Key(key))); + DocumentId expected_document_id = data[key]; + EXPECT_THAT(Query(storage.get(), /*key_lower=*/key, /*key_upper=*/key), + IsOkAndHolds(ElementsAre(EqualsDocHitInfo(expected_document_id, + expected_sections)))); + } +} + +TEST_F(IntegerIndexStorageTest, SplitBucketsTriggerSortBuckets) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(filesystem_, working_path_, Options(), + serializer_.get())); + + uint32_t block_size = FlashIndexStorage::SelectBlockSize(); + uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( + block_size, serializer_->GetDataTypeBytes()); + uint32_t max_num_data_before_split = + max_posting_list_bytes / serializer_->GetDataTypeBytes(); + + // Add IntegerIndexStorage::kUnsortedBucketsLengthThreshold keys. For each + // key, add max_num_data_before_split + 1 data. Then we will get: + // - Bucket splitting will create kUnsortedBucketsLengthThreshold + 1 unsorted + // buckets [[50, 50], [49, 49], ..., [1, 1], [51, INT64_MAX]]. + // - Since there are kUnsortedBucketsLengthThreshold + 1 unsorted buckets, we + // should sort and merge buckets. + std::unordered_map<int64_t, std::vector<DocumentId>> data; + int64_t key = IntegerIndexStorage::kUnsortedBucketsLengthThreshold; + DocumentId document_id = 0; + for (int i = 0; i < IntegerIndexStorage::kUnsortedBucketsLengthThreshold; + ++i) { + for (int j = 0; j < max_num_data_before_split + 1; ++j) { + data[key].push_back(document_id); + ICING_ASSERT_OK( + storage->AddKeys(document_id, kDefaultSectionId, /*new_keys=*/{key})); + ++document_id; + } + --key; + } + ICING_ASSERT_OK(storage->PersistToDisk()); + + // Manually check sorted and unsorted buckets. + { + // Check unsorted buckets. + const std::string unsorted_buckets_file_path = absl_ports::StrCat( + working_path_, "/", IntegerIndexStorage::kFilePrefix, ".u"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem_, unsorted_buckets_file_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + EXPECT_THAT(unsorted_buckets->num_elements(), Eq(0)); + + // Check sorted buckets. + const std::string sorted_buckets_file_path = absl_ports::StrCat( + working_path_, "/", IntegerIndexStorage::kFilePrefix, ".s"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem_, sorted_buckets_file_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + EXPECT_THAT(sorted_buckets->num_elements(), Gt(1)); + } + + // Ensure that search works normally. + for (key = 1; key <= IntegerIndexStorage::kUnsortedBucketsLengthThreshold; + ++key) { + ASSERT_THAT(data, Contains(Key(key))); + + std::vector<DocHitInfo> expected_doc_hit_infos; + for (DocumentId doc_id : data[key]) { + expected_doc_hit_infos.push_back(DocHitInfo( + doc_id, /*hit_section_ids_mask=*/UINT64_C(1) << kDefaultSectionId)); + } + EXPECT_THAT(Query(storage.get(), /*key_lower=*/key, /*key_upper=*/key), + IsOkAndHolds(ElementsAreArray(expected_doc_hit_infos.rbegin(), + expected_doc_hit_infos.rend()))); + } +} + TEST_F(IntegerIndexStorageTest, TransferIndex) { // We use predefined custom buckets to initialize new integer index storage // and create some test keys accordingly. diff --git a/icing/index/numeric/integer-index.cc b/icing/index/numeric/integer-index.cc index a2d40f1..2f876e4 100644 --- a/icing/index/numeric/integer-index.cc +++ b/icing/index/numeric/integer-index.cc @@ -14,10 +14,12 @@ #include "icing/index/numeric/integer-index.h" +#include <algorithm> #include <cstdint> #include <memory> #include <string> #include <string_view> +#include <utility> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" @@ -27,6 +29,7 @@ #include "icing/file/destructible-directory.h" #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" +#include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h" #include "icing/index/numeric/doc-hit-info-iterator-numeric.h" #include "icing/index/numeric/integer-index-storage.h" #include "icing/index/numeric/posting-list-integer-index-serializer.h" @@ -50,6 +53,17 @@ std::string GetMetadataFilePath(std::string_view working_path) { return absl_ports::StrCat(working_path, "/", GetMetadataFileName()); } +constexpr std::string_view kWildcardPropertyIndexFileName = + "wildcard_property_index"; + +constexpr std::string_view kWildcardPropertyStorageFileName = + "wildcard_property_storage"; + +std::string GetWildcardPropertyStorageFilePath(std::string_view working_path) { + return absl_ports::StrCat(working_path, "/", + kWildcardPropertyStorageFileName); +} + // Helper function to get the sub working (directory) path of // IntegerIndexStorage according to the given working directory and property // path. @@ -64,8 +78,9 @@ libtextclassifier3::StatusOr<std::vector<std::string>> GetAllExistingPropertyPaths(const Filesystem& filesystem, const std::string& working_path) { std::vector<std::string> property_paths; - if (!filesystem.ListDirectory(working_path.c_str(), - /*exclude=*/{GetMetadataFileName()}, + std::unordered_set<std::string> excludes = { + GetMetadataFileName(), std::string(kWildcardPropertyStorageFileName)}; + if (!filesystem.ListDirectory(working_path.c_str(), excludes, /*recursive=*/false, &property_paths)) { return absl_ports::InternalError("Failed to list directory"); } @@ -81,6 +96,9 @@ GetPropertyIntegerIndexStorageMap( IntegerIndex::PropertyToStorageMapType property_to_storage_map; for (const std::string& property_path : property_paths) { + if (property_path == kWildcardPropertyIndexFileName) { + continue; + } std::string storage_working_path = GetPropertyIndexStoragePath(working_path, property_path); ICING_ASSIGN_OR_RETURN( @@ -95,16 +113,61 @@ GetPropertyIntegerIndexStorageMap( return property_to_storage_map; } +// RETURNS: +// - On success, an unordered_set representing the list of property paths +// stored in the WildcardPropertyStorage managed by property_storage +// - INTERNAL_ERROR on any failure to successfully read the underlying proto. +libtextclassifier3::StatusOr<std::unordered_set<std::string>> CreatePropertySet( + const FileBackedProto<WildcardPropertyStorage>& property_storage) { + std::unordered_set<std::string> wildcard_properties_set; + auto wildcard_properties_or = property_storage.Read(); + if (!wildcard_properties_or.ok()) { + if (absl_ports::IsNotFound(wildcard_properties_or.status())) { + return wildcard_properties_set; + } + return wildcard_properties_or.status(); + } + + const WildcardPropertyStorage* wildcard_properties = + wildcard_properties_or.ValueOrDie(); + wildcard_properties_set.reserve(wildcard_properties->property_entries_size()); + for (const std::string& property : wildcard_properties->property_entries()) { + wildcard_properties_set.insert(property); + } + return wildcard_properties_set; +} + } // namespace libtextclassifier3::Status IntegerIndex::Editor::IndexAllBufferedKeys() && { auto iter = integer_index_.property_to_storage_map_.find(property_path_); IntegerIndexStorage* target_storage = nullptr; + // 1. Check if this property already has its own individual index. if (iter != integer_index_.property_to_storage_map_.end()) { target_storage = iter->second.get(); + // 2. Check if this property was added to wildcard storage. + } else if (integer_index_.wildcard_properties_set_.find(property_path_) != + integer_index_.wildcard_properties_set_.end()) { + target_storage = integer_index_.wildcard_index_storage_.get(); + // 3. Check if we've reach the limit of individual property storages. + } else if (integer_index_.property_to_storage_map_.size() >= + kMaxPropertyStorages) { + // 3a. Create the wildcard storage if it doesn't exist. + if (integer_index_.wildcard_index_storage_ == nullptr) { + ICING_ASSIGN_OR_RETURN( + integer_index_.wildcard_index_storage_, + IntegerIndexStorage::Create( + integer_index_.filesystem_, + GetPropertyIndexStoragePath(integer_index_.working_path_, + kWildcardPropertyIndexFileName), + IntegerIndexStorage::Options(), + integer_index_.posting_list_serializer_.get())); + } + ICING_RETURN_IF_ERROR( + integer_index_.AddPropertyToWildcardStorage(property_path_)); + target_storage = integer_index_.wildcard_index_storage_.get(); + // 4. Create a new individual storage for this new property. } else { - // A new property path. Create a new storage instance and insert into the - // map. ICING_ASSIGN_OR_RETURN( std::unique_ptr<IntegerIndexStorage> new_storage, IntegerIndexStorage::Create( @@ -144,15 +207,45 @@ IntegerIndex::~IntegerIndex() { libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> IntegerIndex::GetIterator(std::string_view property_path, int64_t key_lower, - int64_t key_upper) const { - auto iter = property_to_storage_map_.find(std::string(property_path)); - if (iter == property_to_storage_map_.end()) { - // Return an empty iterator. - return std::make_unique<DocHitInfoIteratorNumeric<int64_t>>( - /*numeric_index_iter=*/nullptr); + int64_t key_upper, + const DocumentStore& document_store, + const SchemaStore& schema_store) const { + std::string property_path_str(property_path); + auto iter = property_to_storage_map_.find(property_path_str); + if (iter != property_to_storage_map_.end()) { + return iter->second->GetIterator(key_lower, key_upper); + } + + if (wildcard_properties_set_.find(property_path_str) != + wildcard_properties_set_.end()) { + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<DocHitInfoIterator> delegate, + wildcard_index_storage_->GetIterator(key_lower, key_upper)); + std::set<std::string> property_paths = {std::move(property_path_str)}; + return std::make_unique<DocHitInfoIteratorSectionRestrict>( + std::move(delegate), &document_store, &schema_store, + std::move(property_paths)); + } + + // Return an empty iterator. + return std::make_unique<DocHitInfoIteratorNumeric<int64_t>>( + /*numeric_index_iter=*/nullptr); +} + +libtextclassifier3::Status IntegerIndex::AddPropertyToWildcardStorage( + const std::string& property_path) { + WildcardPropertyStorage wildcard_properties; + wildcard_properties.mutable_property_entries()->Reserve( + wildcard_properties_set_.size()); + for (const std::string& property_path : wildcard_properties_set_) { + wildcard_properties.add_property_entries(property_path); } + ICING_RETURN_IF_ERROR(wildcard_property_storage_->Write( + std::make_unique<WildcardPropertyStorage>( + std::move(wildcard_properties)))); - return iter->second->GetIterator(key_lower, key_upper); + wildcard_properties_set_.insert(property_path); + return libtextclassifier3::Status::OK; } libtextclassifier3::Status IntegerIndex::Optimize( @@ -183,6 +276,8 @@ libtextclassifier3::Status IntegerIndex::Optimize( // Destruct current storage instances to safely swap directories. metadata_mmapped_file_.reset(); property_to_storage_map_.clear(); + wildcard_index_storage_.reset(); + wildcard_property_storage_.reset(); if (!filesystem_.SwapFiles(temp_working_path_ddir.dir().c_str(), working_path_.c_str())) { return absl_ports::InternalError( @@ -190,9 +285,10 @@ libtextclassifier3::Status IntegerIndex::Optimize( } // Reinitialize the integer index. + std::string metadata_file_path = GetMetadataFilePath(working_path_); ICING_ASSIGN_OR_RETURN( MemoryMappedFile metadata_mmapped_file, - MemoryMappedFile::Create(filesystem_, GetMetadataFilePath(working_path_), + MemoryMappedFile::Create(filesystem_, metadata_file_path, MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, /*max_file_size=*/kMetadataFileSize, /*pre_mapping_file_offset=*/0, @@ -200,6 +296,25 @@ libtextclassifier3::Status IntegerIndex::Optimize( metadata_mmapped_file_ = std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)); + // Recreate all of the data structures tracking the wildcard storage. + std::string wildcard_property_path = + GetWildcardPropertyStorageFilePath(working_path_); + wildcard_property_storage_ = + std::make_unique<FileBackedProto<WildcardPropertyStorage>>( + filesystem_, wildcard_property_path); + + ICING_ASSIGN_OR_RETURN(wildcard_properties_set_, + CreatePropertySet(*wildcard_property_storage_)); + if (!wildcard_properties_set_.empty()) { + ICING_ASSIGN_OR_RETURN( + wildcard_index_storage_, + IntegerIndexStorage::Create( + filesystem_, + GetPropertyIndexStoragePath(working_path_, + kWildcardPropertyIndexFileName), + IntegerIndexStorage::Options(), posting_list_serializer_.get())); + } + // Initialize all existing integer index storages. ICING_ASSIGN_OR_RETURN( property_to_storage_map_, @@ -212,6 +327,7 @@ libtextclassifier3::Status IntegerIndex::Optimize( libtextclassifier3::Status IntegerIndex::Clear() { // Step 1: clear property_to_storage_map_. property_to_storage_map_.clear(); + wildcard_index_storage_.reset(); // Step 2: delete all IntegerIndexStorages. It is safe because there is no // active IntegerIndexStorage after clearing the map. @@ -224,6 +340,15 @@ libtextclassifier3::Status IntegerIndex::Clear() { GetPropertyIndexStoragePath(working_path_, property_path))); } + // Step 3: Delete the wildcard property storage + std::string wildcard_property_path = + GetWildcardPropertyStorageFilePath(working_path_); + if (filesystem_.FileExists(wildcard_property_path.c_str()) || + !filesystem_.DeleteFile(wildcard_property_path.c_str())) { + return absl_ports::InternalError(absl_ports::StrCat( + "Unable to delete file at path ", wildcard_property_path)); + } + info().last_added_document_id = kInvalidDocumentId; return libtextclassifier3::Status::OK; } @@ -249,12 +374,20 @@ IntegerIndex::InitializeNewFiles(const Filesystem& filesystem, ICING_RETURN_IF_ERROR(metadata_mmapped_file.GrowAndRemapIfNecessary( /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize)); + std::string wildcard_property_path = + GetWildcardPropertyStorageFilePath(working_path); + auto wildcard_property_storage = + std::make_unique<FileBackedProto<WildcardPropertyStorage>>( + filesystem, wildcard_property_path); + // Create instance. auto new_integer_index = std::unique_ptr<IntegerIndex>(new IntegerIndex( filesystem, std::move(working_path), std::make_unique<PostingListIntegerIndexSerializer>(), std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)), - /*property_to_storage_map=*/{})); + /*property_to_storage_map=*/{}, std::move(wildcard_property_storage), + /*wildcard_properties_set=*/{}, /*wildcard_index_storage=*/nullptr)); + // Initialize info content by writing mapped memory directly. Info& info_ref = new_integer_index->info(); info_ref.magic = Info::kMagic; @@ -287,11 +420,33 @@ IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem, GetPropertyIntegerIndexStorageMap(filesystem, working_path, posting_list_serializer.get())); + std::string wildcard_property_path = + GetWildcardPropertyStorageFilePath(working_path); + auto wildcard_property_storage = + std::make_unique<FileBackedProto<WildcardPropertyStorage>>( + filesystem, wildcard_property_path); + + ICING_ASSIGN_OR_RETURN( + std::unordered_set<std::string> wildcard_properties_set, + CreatePropertySet(*wildcard_property_storage)); + + std::unique_ptr<IntegerIndexStorage> wildcard_index_storage; + if (!wildcard_properties_set.empty()) { + ICING_ASSIGN_OR_RETURN( + wildcard_index_storage, + IntegerIndexStorage::Create( + filesystem, + GetPropertyIndexStoragePath(working_path, + kWildcardPropertyIndexFileName), + IntegerIndexStorage::Options(), posting_list_serializer.get())); + } + // Create instance. auto integer_index = std::unique_ptr<IntegerIndex>(new IntegerIndex( filesystem, std::move(working_path), std::move(posting_list_serializer), std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)), - std::move(property_to_storage_map))); + std::move(property_to_storage_map), std::move(wildcard_property_storage), + std::move(wildcard_properties_set), std::move(wildcard_index_storage))); // Initialize existing PersistentStorage. Checksums will be validated. ICING_RETURN_IF_ERROR(integer_index->InitializeExistingStorage()); @@ -303,31 +458,78 @@ IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem, return integer_index; } -libtextclassifier3::Status IntegerIndex::TransferIndex( +libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> +IntegerIndex::TransferIntegerIndexStorage( const std::vector<DocumentId>& document_id_old_to_new, + const IntegerIndexStorage* old_storage, const std::string& property_path, IntegerIndex* new_integer_index) const { - for (const auto& [property_path, old_storage] : property_to_storage_map_) { - std::string new_storage_working_path = GetPropertyIndexStoragePath( - new_integer_index->working_path_, property_path); - ICING_ASSIGN_OR_RETURN( - std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create( - new_integer_index->filesystem_, new_storage_working_path, - IntegerIndexStorage::Options(), - new_integer_index->posting_list_serializer_.get())); + std::string new_storage_working_path = GetPropertyIndexStoragePath( + new_integer_index->working_path_, property_path); + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<IntegerIndexStorage> new_storage, + IntegerIndexStorage::Create( + new_integer_index->filesystem_, new_storage_working_path, + IntegerIndexStorage::Options(), + new_integer_index->posting_list_serializer_.get())); + + ICING_RETURN_IF_ERROR( + old_storage->TransferIndex(document_id_old_to_new, new_storage.get())); + if (new_storage->num_data() == 0) { + new_storage.reset(); ICING_RETURN_IF_ERROR( - old_storage->TransferIndex(document_id_old_to_new, new_storage.get())); + IntegerIndexStorage::Discard(filesystem_, new_storage_working_path)); + } + return new_storage; +} + +libtextclassifier3::Status IntegerIndex::TransferWildcardStorage( + IntegerIndex* new_integer_index) const { + auto property_storage = std::make_unique<WildcardPropertyStorage>(); + property_storage->mutable_property_entries()->Reserve( + wildcard_properties_set_.size()); + for (const std::string& property : wildcard_properties_set_) { + property_storage->add_property_entries(property); + } + + ICING_RETURN_IF_ERROR(new_integer_index->wildcard_property_storage_->Write( + std::move(property_storage))); + new_integer_index->wildcard_properties_set_ = wildcard_properties_set_; + return libtextclassifier3::Status::OK; +} - if (new_storage->num_data() == 0) { - new_storage.reset(); - ICING_RETURN_IF_ERROR( - IntegerIndexStorage::Discard(filesystem_, new_storage_working_path)); - } else { +libtextclassifier3::Status IntegerIndex::TransferIndex( + const std::vector<DocumentId>& document_id_old_to_new, + IntegerIndex* new_integer_index) const { + // Transfer over the integer index storages + std::unique_ptr<IntegerIndexStorage> new_storage; + for (const auto& [property_path, old_storage] : property_to_storage_map_) { + ICING_ASSIGN_OR_RETURN( + new_storage, + TransferIntegerIndexStorage(document_id_old_to_new, old_storage.get(), + property_path, new_integer_index)); + if (new_storage != nullptr) { new_integer_index->property_to_storage_map_.insert( - std::make_pair(property_path, std::move(new_storage))); + {property_path, std::move(new_storage)}); } } + if (wildcard_index_storage_ != nullptr) { + ICING_ASSIGN_OR_RETURN( + new_storage, + TransferIntegerIndexStorage( + document_id_old_to_new, wildcard_index_storage_.get(), + std::string(kWildcardPropertyIndexFileName), new_integer_index)); + if (new_storage != nullptr) { + new_integer_index->wildcard_index_storage_ = std::move(new_storage); + + // The only time we need to copy over the list of properties using + // wildcard storage is if wildcard_index_storage and new_storage are both + // non-null. Otherwise, the new wildcard index storage won't have any + // data. + ICING_RETURN_IF_ERROR(TransferWildcardStorage(new_integer_index)); + } + } + return libtextclassifier3::Status::OK; } @@ -335,6 +537,11 @@ libtextclassifier3::Status IntegerIndex::PersistStoragesToDisk() { for (auto& [_, storage] : property_to_storage_map_) { ICING_RETURN_IF_ERROR(storage->PersistToDisk()); } + // No need to persist wildcard_properties_storage_. All calls to + // FileBackedProto::Write are fully written through at the time of the call. + if (wildcard_index_storage_) { + ICING_RETURN_IF_ERROR(wildcard_index_storage_->PersistToDisk()); + } return libtextclassifier3::Status::OK; } @@ -350,8 +557,8 @@ libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeInfoChecksum() { } libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeStoragesChecksum() { - // XOR all crcs of all storages. Since XOR is commutative and associative, the - // order doesn't matter. + // XOR all crcs of all storages. Since XOR is commutative and associative, + // the order doesn't matter. uint32_t storages_checksum = 0; for (auto& [property_path, storage] : property_to_storage_map_) { ICING_ASSIGN_OR_RETURN(Crc32 storage_crc, storage->UpdateChecksums()); @@ -359,6 +566,17 @@ libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeStoragesChecksum() { storages_checksum ^= storage_crc.Get(); } + + if (wildcard_index_storage_ != nullptr) { + ICING_ASSIGN_OR_RETURN(Crc32 storage_crc, + wildcard_index_storage_->UpdateChecksums()); + storages_checksum ^= storage_crc.Get(); + } + + ICING_ASSIGN_OR_RETURN(Crc32 wildcard_properties_crc, + wildcard_property_storage_->ComputeChecksum()); + storages_checksum ^= wildcard_properties_crc.Get(); + return Crc32(storages_checksum); } diff --git a/icing/index/numeric/integer-index.h b/icing/index/numeric/integer-index.h index 050a143..303bb41 100644 --- a/icing/index/numeric/integer-index.h +++ b/icing/index/numeric/integer-index.h @@ -23,12 +23,16 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/file/file-backed-proto.h" #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" #include "icing/index/numeric/integer-index-storage.h" #include "icing/index/numeric/numeric-index.h" #include "icing/index/numeric/posting-list-integer-index-serializer.h" +#include "icing/index/numeric/wildcard-property-storage.pb.h" +#include "icing/schema/schema-store.h" #include "icing/store/document-id.h" +#include "icing/store/document-store.h" #include "icing/util/crc32.h" namespace icing { @@ -46,6 +50,11 @@ class IntegerIndex : public NumericIndex<int64_t> { using PropertyToStorageMapType = std::unordered_map<std::string, std::unique_ptr<IntegerIndexStorage>>; + // Maximum number of individual property storages that this index will allow + // before falling back to placing hits for any new properties into the + // 'wildcard' storage. + static constexpr int kMaxPropertyStorages = 32; + struct Info { static constexpr int32_t kMagic = 0x238a3dcb; @@ -125,8 +134,9 @@ class IntegerIndex : public NumericIndex<int64_t> { // - NOT_FOUND_ERROR if the given property_path doesn't exist // - Any IntegerIndexStorage errors libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator( - std::string_view property_path, int64_t key_lower, - int64_t key_upper) const override; + std::string_view property_path, int64_t key_lower, int64_t key_upper, + const DocumentStore& document_store, + const SchemaStore& schema_store) const override; // Reduces internal file sizes by reclaiming space and ids of deleted // documents. Integer index will convert all data (hits) to the new document @@ -165,6 +175,11 @@ class IntegerIndex : public NumericIndex<int64_t> { } } + int num_property_indices() const override { + return property_to_storage_map_.size() + + ((wildcard_index_storage_ == nullptr) ? 0 : 1); + } + private: class Editor : public NumericIndex<int64_t>::Editor { public: @@ -191,17 +206,24 @@ class IntegerIndex : public NumericIndex<int64_t> { IntegerIndex& integer_index_; // Does not own. }; - explicit IntegerIndex(const Filesystem& filesystem, - std::string&& working_path, - std::unique_ptr<PostingListIntegerIndexSerializer> - posting_list_serializer, - std::unique_ptr<MemoryMappedFile> metadata_mmapped_file, - PropertyToStorageMapType&& property_to_storage_map) + explicit IntegerIndex( + const Filesystem& filesystem, std::string&& working_path, + std::unique_ptr<PostingListIntegerIndexSerializer> + posting_list_serializer, + std::unique_ptr<MemoryMappedFile> metadata_mmapped_file, + PropertyToStorageMapType&& property_to_storage_map, + std::unique_ptr<FileBackedProto<WildcardPropertyStorage>> + wildcard_property_storage, + std::unordered_set<std::string> wildcard_properties_set, + std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage) : NumericIndex<int64_t>(filesystem, std::move(working_path), kWorkingPathType), posting_list_serializer_(std::move(posting_list_serializer)), metadata_mmapped_file_(std::move(metadata_mmapped_file)), - property_to_storage_map_(std::move(property_to_storage_map)) {} + property_to_storage_map_(std::move(property_to_storage_map)), + wildcard_property_storage_(std::move(wildcard_property_storage)), + wildcard_properties_set_(std::move(wildcard_properties_set)), + wildcard_index_storage_(std::move(wildcard_index_storage)) {} static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path); @@ -210,6 +232,17 @@ class IntegerIndex : public NumericIndex<int64_t> { InitializeExistingFiles(const Filesystem& filesystem, std::string&& working_path); + // Adds the property path to the list of properties using wildcard storage. + // This will both update the in-memory list (wildcard_properties_set_) and + // the persistent list (wilcard_property_storage_). + // + // RETURNS: + // - OK on success + // - INTERNAL_ERROR if unable to successfully persist updated properties + // list in wildcard_property_storage_. + libtextclassifier3::Status AddPropertyToWildcardStorage( + const std::string& property_path); + // Transfers integer index data from the current integer index to // new_integer_index. // @@ -222,6 +255,29 @@ class IntegerIndex : public NumericIndex<int64_t> { const std::vector<DocumentId>& document_id_old_to_new, IntegerIndex* new_integer_index) const; + // Transfers integer index data from old_storage to new_integer_index. + // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error. This could potentially leave the storages + // in an invalid state and the caller should handle it properly (e.g. + // discard and rebuild) + libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> + TransferIntegerIndexStorage( + const std::vector<DocumentId>& document_id_old_to_new, + const IntegerIndexStorage* old_storage, const std::string& property_path, + IntegerIndex* new_integer_index) const; + + // Transfers the persistent and in-memory list of properties using the + // wildcard storage from old_storage to new_integer_index. + // + // RETURNS: + // - OK on success + // - INTERNAL_ERROR if unable to successfully persist updated properties + // list in new_integer_index. + libtextclassifier3::Status TransferWildcardStorage( + IntegerIndex* new_integer_index) const; + // Flushes contents of all storages to underlying files. // // Returns: @@ -277,6 +333,19 @@ class IntegerIndex : public NumericIndex<int64_t> { // Property path to integer index storage map. PropertyToStorageMapType property_to_storage_map_; + + // Persistent list of properties that have added content to + // wildcard_index_storage_. + std::unique_ptr<FileBackedProto<WildcardPropertyStorage>> + wildcard_property_storage_; + + // In-memory list of properties that have added content to + // wildcard_index_storage_. + std::unordered_set<std::string> wildcard_properties_set_; + + // The index storage that is used once we have already created + // kMaxPropertyStorages in property_to_storage_map. + std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage_; }; } // namespace lib diff --git a/icing/index/numeric/integer-index_test.cc b/icing/index/numeric/integer-index_test.cc index c6cf855..c4dacb8 100644 --- a/icing/index/numeric/integer-index_test.cc +++ b/icing/index/numeric/integer-index_test.cc @@ -25,6 +25,7 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/document-builder.h" #include "icing/file/filesystem.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -32,6 +33,9 @@ #include "icing/index/numeric/integer-index-storage.h" #include "icing/index/numeric/numeric-index.h" #include "icing/index/numeric/posting-list-integer-index-serializer.h" +#include "icing/proto/document.pb.h" +#include "icing/proto/schema.pb.h" +#include "icing/schema-builder.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" @@ -68,9 +72,25 @@ class NumericIndexIntegerTest : public ::testing::Test { IsTrue()); working_path_ = base_dir_ + "/numeric_index_integer_test"; + std::string schema_dir = base_dir_ + "/schema_test"; + + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(schema_dir.c_str())); + ICING_ASSERT_OK_AND_ASSIGN( + schema_store_, SchemaStore::Create(&filesystem_, schema_dir, &clock_)); + + std::string document_store_dir = base_dir_ + "/doc_store_test"; + ASSERT_TRUE( + filesystem_.CreateDirectoryRecursively(document_store_dir.c_str())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult doc_store_create_result, + DocumentStore::Create(&filesystem_, document_store_dir, &clock_, + schema_store_.get())); + doc_store_ = std::move(doc_store_create_result.document_store); } void TearDown() override { + doc_store_.reset(); + schema_store_.reset(); filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); } @@ -92,9 +112,67 @@ class NumericIndexIntegerTest : public ::testing::Test { return IntegerIndex::Create(filesystem_, working_path_); } + template <typename NotIntegerIndexType> + bool is_integer_index() const { + return false; + } + + template <> + bool is_integer_index<IntegerIndex>() const { + return true; + } + + libtextclassifier3::StatusOr<std::vector<DocumentId>> CompactDocStore() { + std::string document_store_dir = base_dir_ + "/doc_store_test"; + std::string document_store_compact_dir = + base_dir_ + "/doc_store_compact_test"; + if (!filesystem_.CreateDirectoryRecursively( + document_store_compact_dir.c_str())) { + return absl_ports::InternalError("Unable to create compact directory"); + } + ICING_ASSIGN_OR_RETURN( + std::vector<DocumentId> docid_map, + doc_store_->OptimizeInto(document_store_compact_dir, nullptr)); + + doc_store_.reset(); + if (!filesystem_.SwapFiles(document_store_dir.c_str(), + document_store_compact_dir.c_str())) { + return absl_ports::InternalError("Unable to swap directories."); + } + if (!filesystem_.DeleteDirectoryRecursively( + document_store_compact_dir.c_str())) { + return absl_ports::InternalError("Unable to delete compact directory"); + } + + ICING_ASSIGN_OR_RETURN( + DocumentStore::CreateResult doc_store_create_result, + DocumentStore::Create(&filesystem_, document_store_dir, &clock_, + schema_store_.get())); + doc_store_ = std::move(doc_store_create_result.document_store); + return docid_map; + } + + libtextclassifier3::StatusOr<std::vector<DocHitInfo>> Query( + const NumericIndex<int64_t>* integer_index, + std::string_view property_path, int64_t key_lower, int64_t key_upper) { + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<DocHitInfoIterator> iter, + integer_index->GetIterator(property_path, key_lower, key_upper, + *doc_store_, *schema_store_)); + + std::vector<DocHitInfo> result; + while (iter->Advance().ok()) { + result.push_back(iter->doc_hit_info()); + } + return result; + } + Filesystem filesystem_; std::string base_dir_; std::string working_path_; + std::unique_ptr<SchemaStore> schema_store_; + std::unique_ptr<DocumentStore> doc_store_; + Clock clock_; }; void Index(NumericIndex<int64_t>* integer_index, std::string_view property_path, @@ -109,20 +187,6 @@ void Index(NumericIndex<int64_t>* integer_index, std::string_view property_path, ICING_EXPECT_OK(std::move(*editor).IndexAllBufferedKeys()); } -libtextclassifier3::StatusOr<std::vector<DocHitInfo>> Query( - const NumericIndex<int64_t>* integer_index, std::string_view property_path, - int64_t key_lower, int64_t key_upper) { - ICING_ASSIGN_OR_RETURN( - std::unique_ptr<DocHitInfoIterator> iter, - integer_index->GetIterator(property_path, key_lower, key_upper)); - - std::vector<DocHitInfo> result; - while (iter->Advance().ok()) { - result.push_back(iter->doc_hit_info()); - } - return result; -} - using TestTypes = ::testing::Types<DummyNumericIndex<int64_t>, IntegerIndex>; TYPED_TEST_SUITE(NumericIndexIntegerTest, TestTypes); @@ -180,8 +244,8 @@ TYPED_TEST(NumericIndexIntegerTest, SingleKeyExactQuery) { int64_t query_key = 2; std::vector<SectionId> expected_sections = {kDefaultSectionId}; - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/query_key, /*key_upper=*/query_key), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/query_key, /*key_upper=*/query_key), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/5, expected_sections), EqualsDocHitInfo(/*document_id=*/2, expected_sections)))); @@ -206,8 +270,8 @@ TYPED_TEST(NumericIndexIntegerTest, SingleKeyRangeQuery) { kDefaultSectionId, /*keys=*/{2}); std::vector<SectionId> expected_sections = {kDefaultSectionId}; - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/1, /*key_upper=*/3), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/1, /*key_upper=*/3), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/5, expected_sections), EqualsDocHitInfo(/*document_id=*/2, expected_sections), @@ -215,6 +279,258 @@ TYPED_TEST(NumericIndexIntegerTest, SingleKeyRangeQuery) { EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); } +TYPED_TEST(NumericIndexIntegerTest, WildcardStorageQuery) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<NumericIndex<int64_t>> integer_index, + this->template CreateIntegerIndex<TypeParam>()); + + // This test sets its schema assuming that max property storages == 32. + ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32)); + + PropertyConfigProto int_property_config = + PropertyConfigBuilder() + .SetName("otherProperty1") + .SetCardinality(CARDINALITY_REPEATED) + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .Build(); + // Create a schema with two types: + // - TypeA has 34 properties: + // 'desiredProperty', 'otherProperty'*, 'undesiredProperty' + // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty' + // 1. The 32 'otherProperty's will consume all of the individual storages + // 2. TypeA.desiredProperty and TypeB.anotherProperty will both be assigned + // SectionId = 0 for their respective types. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("TypeA") + .AddProperty(int_property_config) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty2")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty3")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty4")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty5")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty6")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty7")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty8")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty9")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty10")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty11")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty12")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty13")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty14")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty15")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty16")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty17")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty18")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty19")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty20")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty21")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty22")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty23")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty24")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty25")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty26")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty27")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty28")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty29")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty30")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty31")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty32")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("desiredProperty")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("undesiredProperty"))) + .AddType(SchemaTypeConfigBuilder() + .SetType("TypeB") + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("anotherProperty")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("desiredProperty"))) + .Build(); + ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + + // Put 11 docs of "TypeA" into the document store. + DocumentProto doc = + DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build())); + + // Put 5 docs of "TypeB" into the document store. + doc = DocumentBuilder(doc).SetUri("uri11").SetSchema("TypeB").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri14").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri15").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri16").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri17").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri18").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri19").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri20").Build())); + + // Ids are assigned alphabetically, so the property ids are: + // TypeA.desiredProperty = 0 + // TypeA.otherPropertyN = N + // TypeA.undesiredProperty = 33 + // TypeB.anotherProperty = 0 + // TypeB.desiredProperty = 1 + SectionId typea_desired_prop_id = 0; + SectionId typea_undesired_prop_id = 33; + SectionId typeb_another_prop_id = 0; + SectionId typeb_desired_prop_id = 1; + + // Index numeric content for other properties to force our property into the + // wildcard storage. + std::string other_property_path = "otherProperty"; + for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) { + Index(integer_index.get(), + absl_ports::StrCat(other_property_path, std::to_string(i)), + /*document_id=*/0, /*section_id=*/i, /*keys=*/{i}); + } + + // Index numeric content for TypeA.desiredProperty + std::string desired_property = "desiredProperty"; + Index(integer_index.get(), desired_property, /*document_id=*/0, + typea_desired_prop_id, /*keys=*/{1}); + Index(integer_index.get(), desired_property, /*document_id=*/1, + typea_desired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), desired_property, /*document_id=*/2, + typea_desired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), desired_property, /*document_id=*/3, + typea_desired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), desired_property, /*document_id=*/4, + typea_desired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), desired_property, /*document_id=*/5, + typea_desired_prop_id, /*keys=*/{2}); + + // Index the same numeric content for TypeA.undesiredProperty + std::string undesired_property = "undesiredProperty"; + Index(integer_index.get(), undesired_property, /*document_id=*/6, + typea_undesired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), undesired_property, /*document_id=*/7, + typea_undesired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), undesired_property, /*document_id=*/8, + typea_undesired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), undesired_property, /*document_id=*/9, + typea_undesired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), undesired_property, /*document_id=*/10, + typea_undesired_prop_id, /*keys=*/{2}); + + // Index the same numeric content for TypeB.anotherProperty + std::string another_property = "anotherProperty"; + Index(integer_index.get(), another_property, /*document_id=*/11, + typeb_another_prop_id, /*keys=*/{3}); + Index(integer_index.get(), another_property, /*document_id=*/12, + typeb_another_prop_id, /*keys=*/{2}); + Index(integer_index.get(), another_property, /*document_id=*/13, + typeb_another_prop_id, /*keys=*/{0}); + Index(integer_index.get(), another_property, /*document_id=*/14, + typeb_another_prop_id, /*keys=*/{4}); + Index(integer_index.get(), another_property, /*document_id=*/15, + typeb_another_prop_id, /*keys=*/{2}); + + // Finally, index the same numeric content for TypeB.desiredProperty + Index(integer_index.get(), desired_property, /*document_id=*/16, + typeb_desired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), desired_property, /*document_id=*/17, + typeb_desired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), desired_property, /*document_id=*/18, + typeb_desired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), desired_property, /*document_id=*/19, + typeb_desired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), desired_property, /*document_id=*/20, + typeb_desired_prop_id, /*keys=*/{2}); + + if (this->template is_integer_index<TypeParam>()) { + EXPECT_THAT(integer_index->num_property_indices(), Eq(33)); + } else { + EXPECT_THAT(integer_index->num_property_indices(), Eq(35)); + } + + // Only the hits for 'desired_prop_id' should be returned. + std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id}; + std::vector<SectionId> expected_sections_typeb = {typeb_desired_prop_id}; + EXPECT_THAT( + this->Query(integer_index.get(), desired_property, + /*key_lower=*/2, /*key_upper=*/2), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea)))); + + EXPECT_THAT( + this->Query(integer_index.get(), desired_property, + /*key_lower=*/1, /*key_upper=*/3), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/16, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea)))); +} + TYPED_TEST(NumericIndexIntegerTest, EmptyResult) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumericIndex<int64_t>> integer_index, @@ -233,11 +549,11 @@ TYPED_TEST(NumericIndexIntegerTest, EmptyResult) { Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5, kDefaultSectionId, /*keys=*/{2}); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/10, /*key_upper=*/10), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/10, /*key_upper=*/10), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/100, /*key_upper=*/200), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/100, /*key_upper=*/200), IsOkAndHolds(IsEmpty())); } @@ -252,8 +568,8 @@ TYPED_TEST(NumericIndexIntegerTest, Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, kDefaultSectionId, /*keys=*/{1}); - EXPECT_THAT(Query(integer_index.get(), kAnotherPropertyPath, - /*key_lower=*/100, /*key_upper=*/200), + EXPECT_THAT(this->Query(integer_index.get(), kAnotherPropertyPath, + /*key_lower=*/100, /*key_upper=*/200), IsOkAndHolds(IsEmpty())); } @@ -286,8 +602,8 @@ TYPED_TEST(NumericIndexIntegerTest, kDefaultSectionId, /*keys=*/{4, -1000}); std::vector<SectionId> expected_sections = {kDefaultSectionId}; - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/1, /*key_upper=*/3), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/1, /*key_upper=*/3), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/6, expected_sections), EqualsDocHitInfo(/*document_id=*/5, expected_sections), @@ -326,39 +642,39 @@ TYPED_TEST(NumericIndexIntegerTest, EdgeNumericValues) { std::vector<SectionId> expected_sections = {kDefaultSectionId}; // Negative key - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/-100, /*key_upper=*/-70), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/-100, /*key_upper=*/-70), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/2, expected_sections), EqualsDocHitInfo(/*document_id=*/1, expected_sections)))); // INT64_MAX key - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/std::numeric_limits<int64_t>::max(), - /*key_upper=*/std::numeric_limits<int64_t>::max()), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/std::numeric_limits<int64_t>::max(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/7, expected_sections), EqualsDocHitInfo(/*document_id=*/3, expected_sections)))); // INT64_MIN key - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/std::numeric_limits<int64_t>::min(), - /*key_upper=*/std::numeric_limits<int64_t>::min()), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::min()), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/9, expected_sections), EqualsDocHitInfo(/*document_id=*/4, expected_sections)))); // Key = 0 - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/0, /*key_upper=*/0), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/0, /*key_upper=*/0), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/8, expected_sections), EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); // All keys from INT64_MIN to INT64_MAX - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/std::numeric_limits<int64_t>::min(), - /*key_upper=*/std::numeric_limits<int64_t>::max()), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/9, expected_sections), EqualsDocHitInfo(/*document_id=*/8, expected_sections), @@ -404,8 +720,9 @@ TYPED_TEST(NumericIndexIntegerTest, /*section_id=*/3, /*keys=*/{5}); EXPECT_THAT( - Query(integer_index.get(), kDefaultTestPropertyPath, /*key_lower=*/1, - /*key_upper=*/3), + this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/1, + /*key_upper=*/3), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/2, std::vector<SectionId>{4, 5}), EqualsDocHitInfo(/*document_id=*/1, std::vector<SectionId>{1, 2}), @@ -433,8 +750,8 @@ TYPED_TEST(NumericIndexIntegerTest, NonRelevantPropertyShouldNotBeIncluded) { kDefaultSectionId, /*keys=*/{2}); std::vector<SectionId> expected_sections = {kDefaultSectionId}; - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/1, /*key_upper=*/3), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/1, /*key_upper=*/3), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/5, expected_sections), EqualsDocHitInfo(/*document_id=*/1, expected_sections), @@ -460,8 +777,8 @@ TYPED_TEST(NumericIndexIntegerTest, Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5, kDefaultSectionId, /*keys=*/{2}); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/3, /*key_upper=*/1), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/3, /*key_upper=*/1), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } @@ -499,30 +816,30 @@ TYPED_TEST(NumericIndexIntegerTest, Optimize) { // Verify index and query API still work normally after Optimize(). std::vector<SectionId> expected_sections = {kDefaultSectionId}; - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/1, /*key_upper=*/1), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/1, /*key_upper=*/1), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/3, /*key_upper=*/3), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/3, /*key_upper=*/3), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/1, expected_sections)))); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/0, /*key_upper=*/0), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/0, /*key_upper=*/0), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/4, /*key_upper=*/4), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/4, /*key_upper=*/4), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/2, expected_sections)))); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/2, /*key_upper=*/2), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/2, /*key_upper=*/2), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/3, expected_sections)))); Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5, kDefaultSectionId, /*keys=*/{123}); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/123, /*key_upper=*/123), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/123, /*key_upper=*/123), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/5, expected_sections)))); } @@ -581,40 +898,40 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeMultiplePropertyPaths) { // Verify index and query API still work normally after Optimize(). // Key = 1 - EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/1, - /*key_upper=*/1), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/1, + /*key_upper=*/1), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/1, - /*key_upper=*/1), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/1, + /*key_upper=*/1), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/0, std::vector<SectionId>{kSectionId2})))); // key = 2 - EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/2, - /*key_upper=*/2), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/2, + /*key_upper=*/2), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/0, std::vector<SectionId>{kSectionId1})))); - EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/2, - /*key_upper=*/2), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/2, + /*key_upper=*/2), IsOkAndHolds(IsEmpty())); // key = 3 - EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/3, - /*key_upper=*/3), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/3, + /*key_upper=*/3), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/1, std::vector<SectionId>{kSectionId1})))); - EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/3, - /*key_upper=*/3), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/3, + /*key_upper=*/3), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/2, std::vector<SectionId>{kSectionId2})))); // key = 4 - EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/4, - /*key_upper=*/4), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/4, + /*key_upper=*/4), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/3, std::vector<SectionId>{kSectionId1})))); - EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/4, - /*key_upper=*/4), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/4, + /*key_upper=*/4), IsOkAndHolds(IsEmpty())); } @@ -655,9 +972,9 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeShouldDiscardEmptyPropertyStorage) { // All data in "prop2" as well as the underlying storage should be deleted, so // when querying "prop2", we should get empty result. - EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, - /*key_lower=*/std::numeric_limits<int64_t>::min(), - /*key_upper=*/std::numeric_limits<int64_t>::max()), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, + /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), IsOkAndHolds(IsEmpty())); if (std::is_same_v<IntegerIndex, TypeParam>) { std::string prop2_storage_working_path = @@ -670,8 +987,8 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeShouldDiscardEmptyPropertyStorage) { // Verify we can still index and query for "prop2". Index(integer_index.get(), kPropertyPath2, /*document_id=*/100, kSectionId2, /*keys=*/{123}); - EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, - /*key_lower=*/123, /*key_upper=*/123), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, + /*key_lower=*/123, /*key_upper=*/123), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/100, std::vector<SectionId>{kSectionId2})))); } @@ -697,9 +1014,9 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeOutOfRangeDocumentId) { EXPECT_THAT(integer_index->last_added_document_id(), Eq(kInvalidDocumentId)); // Verify all data are discarded after Optimize(). - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/std::numeric_limits<int64_t>::min(), - /*key_upper=*/std::numeric_limits<int64_t>::max()), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), IsOkAndHolds(IsEmpty())); } @@ -731,9 +1048,9 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeDeleteAll) { EXPECT_THAT(integer_index->last_added_document_id(), Eq(kInvalidDocumentId)); // Verify all data are discarded after Optimize(). - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/std::numeric_limits<int64_t>::min(), - /*key_upper=*/std::numeric_limits<int64_t>::max()), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), IsOkAndHolds(IsEmpty())); } @@ -750,13 +1067,13 @@ TYPED_TEST(NumericIndexIntegerTest, Clear) { ASSERT_THAT(integer_index->last_added_document_id(), Eq(1)); ASSERT_THAT( - Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/1, - /*key_upper=*/1), + this->Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/1, + /*key_upper=*/1), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/0, std::vector<SectionId>{kDefaultSectionId})))); ASSERT_THAT( - Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/3, - /*key_upper=*/3), + this->Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/3, + /*key_upper=*/3), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/1, std::vector<SectionId>{kDefaultSectionId})))); @@ -764,12 +1081,14 @@ TYPED_TEST(NumericIndexIntegerTest, Clear) { // kInvalidDocumentId, and the previous added keys should be deleted. ICING_ASSERT_OK(integer_index->Clear()); EXPECT_THAT(integer_index->last_added_document_id(), Eq(kInvalidDocumentId)); - EXPECT_THAT(Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/1, - /*key_upper=*/1), - IsOkAndHolds(IsEmpty())); - EXPECT_THAT(Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/3, - /*key_upper=*/3), - IsOkAndHolds(IsEmpty())); + EXPECT_THAT( + this->Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/1, + /*key_upper=*/1), + IsOkAndHolds(IsEmpty())); + EXPECT_THAT( + this->Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/3, + /*key_upper=*/3), + IsOkAndHolds(IsEmpty())); // Integer index should be able to work normally after Clear(). Index(integer_index.get(), /*property_path=*/"A", /*document_id=*/3, @@ -780,13 +1099,13 @@ TYPED_TEST(NumericIndexIntegerTest, Clear) { EXPECT_THAT(integer_index->last_added_document_id(), Eq(4)); EXPECT_THAT( - Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/123, - /*key_upper=*/123), + this->Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/123, + /*key_upper=*/123), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/3, std::vector<SectionId>{kDefaultSectionId})))); EXPECT_THAT( - Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/456, - /*key_upper=*/456), + this->Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/456, + /*key_upper=*/456), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/4, std::vector<SectionId>{kDefaultSectionId})))); } @@ -1066,6 +1385,260 @@ TEST_F(IntegerIndexTest, HasSubstr("Invalid storages crc")); } } + +TEST_F(IntegerIndexTest, WildcardStoragePersistenceQuery) { + // This test sets its schema assuming that max property storages == 32. + ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32)); + + PropertyConfigProto int_property_config = + PropertyConfigBuilder() + .SetName("otherProperty1") + .SetCardinality(CARDINALITY_REPEATED) + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .Build(); + // Create a schema with two types: + // - TypeA has 34 properties: + // 'desiredProperty', 'otherProperty'*, 'undesiredProperty' + // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty' + // 1. The 32 'otherProperty's will consume all of the individual storages + // 2. TypeA.desiredProperty and TypeB.anotherProperty will both be assigned + // SectionId = 0 for their respective types. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("TypeA") + .AddProperty(int_property_config) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty2")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty3")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty4")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty5")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty6")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty7")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty8")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty9")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty10")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty11")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty12")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty13")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty14")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty15")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty16")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty17")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty18")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty19")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty20")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty21")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty22")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty23")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty24")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty25")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty26")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty27")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty28")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty29")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty30")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty31")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty32")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("desiredProperty")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("undesiredProperty"))) + .AddType(SchemaTypeConfigBuilder() + .SetType("TypeB") + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("anotherProperty")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("desiredProperty"))) + .Build(); + ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + + // Ids are assigned alphabetically, so the property ids are: + // TypeA.desiredProperty = 0 + // TypeA.otherPropertyN = N + // TypeA.undesiredProperty = 33 + // TypeB.anotherProperty = 0 + // TypeB.desiredProperty = 1 + SectionId typea_desired_prop_id = 0; + SectionId typea_undesired_prop_id = 33; + SectionId typeb_another_prop_id = 0; + SectionId typeb_desired_prop_id = 1; + std::string desired_property = "desiredProperty"; + std::string undesired_property = "undesiredProperty"; + std::string another_property = "anotherProperty"; + + // Put 11 docs of "TypeA" into the document store. + DocumentProto doc = + DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build())); + + // Put 10 docs of "TypeB" into the document store. + doc = DocumentBuilder(doc).SetUri("uri11").SetSchema("TypeB").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri14").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri15").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri16").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri17").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri18").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri19").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri20").Build())); + + { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_)); + + // Index numeric content for other properties to force our property into the + // wildcard storage. + std::string other_property_path = "otherProperty"; + for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) { + Index(integer_index.get(), + absl_ports::StrCat(other_property_path, std::to_string(i)), + /*document_id=*/0, /*section_id=*/i, /*keys=*/{i}); + } + + // Index numeric content for TypeA.desiredProperty + Index(integer_index.get(), desired_property, /*document_id=*/0, + typea_desired_prop_id, /*keys=*/{1}); + Index(integer_index.get(), desired_property, /*document_id=*/1, + typea_desired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), desired_property, /*document_id=*/2, + typea_desired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), desired_property, /*document_id=*/3, + typea_desired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), desired_property, /*document_id=*/4, + typea_desired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), desired_property, /*document_id=*/5, + typea_desired_prop_id, /*keys=*/{2}); + + // Index the same numeric content for TypeA.undesiredProperty + Index(integer_index.get(), undesired_property, /*document_id=*/6, + typea_undesired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), undesired_property, /*document_id=*/7, + typea_undesired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), undesired_property, /*document_id=*/8, + typea_undesired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), undesired_property, /*document_id=*/9, + typea_undesired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), undesired_property, /*document_id=*/10, + typea_undesired_prop_id, /*keys=*/{2}); + + // Index the same numeric content for TypeB.undesiredProperty + Index(integer_index.get(), another_property, /*document_id=*/11, + typeb_another_prop_id, /*keys=*/{3}); + Index(integer_index.get(), another_property, /*document_id=*/12, + typeb_another_prop_id, /*keys=*/{2}); + Index(integer_index.get(), another_property, /*document_id=*/13, + typeb_another_prop_id, /*keys=*/{0}); + Index(integer_index.get(), another_property, /*document_id=*/14, + typeb_another_prop_id, /*keys=*/{4}); + Index(integer_index.get(), another_property, /*document_id=*/15, + typeb_another_prop_id, /*keys=*/{2}); + + // Finally, index the same numeric content for TypeB.desiredProperty + Index(integer_index.get(), desired_property, /*document_id=*/16, + typeb_desired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), desired_property, /*document_id=*/17, + typeb_desired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), desired_property, /*document_id=*/18, + typeb_desired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), desired_property, /*document_id=*/19, + typeb_desired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), desired_property, /*document_id=*/20, + typeb_desired_prop_id, /*keys=*/{2}); + } + + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_)); + + EXPECT_THAT(integer_index->num_property_indices(), Eq(33)); + + // Only the hits for 'desired_prop_id' should be returned. + std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id}; + std::vector<SectionId> expected_sections_typeb = {typeb_desired_prop_id}; + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/2, /*key_upper=*/2), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea)))); + + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/1, /*key_upper=*/3), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/16, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea)))); +} + TEST_F(IntegerIndexTest, IntegerIndexShouldWorkAfterOptimizeAndReinitialization) { constexpr std::string_view kPropertyPath1 = "prop1"; @@ -1183,6 +1756,550 @@ TEST_F(IntegerIndexTest, } } +TEST_F(IntegerIndexTest, WildcardStorageWorksAfterOptimize) { + // This test sets its schema assuming that max property storages == 32. + ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32)); + + PropertyConfigProto int_property_config = + PropertyConfigBuilder() + .SetName("otherProperty1") + .SetCardinality(CARDINALITY_REPEATED) + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .Build(); + // Create a schema with two types: + // - TypeA has 34 properties: + // 'desiredProperty', 'otherProperty'*, 'undesiredProperty' + // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty' + // 1. The 32 'otherProperty's will consume all of the individual storages + // 2. TypeA.desiredProperty and TypeB.anotherProperty will both be assigned + // SectionId = 0 for their respective types. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("TypeA") + .AddProperty(int_property_config) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty2")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty3")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty4")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty5")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty6")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty7")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty8")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty9")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty10")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty11")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty12")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty13")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty14")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty15")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty16")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty17")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty18")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty19")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty20")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty21")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty22")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty23")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty24")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty25")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty26")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty27")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty28")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty29")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty30")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty31")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty32")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("desiredProperty")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("undesiredProperty"))) + .AddType(SchemaTypeConfigBuilder() + .SetType("TypeB") + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("anotherProperty")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("desiredProperty"))) + .Build(); + ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + + // Ids are assigned alphabetically, so the property ids are: + // TypeA.desiredProperty = 0 + // TypeA.otherPropertyN = N + // TypeA.undesiredProperty = 33 + // TypeB.anotherProperty = 0 + // TypeB.desiredProperty = 1 + SectionId typea_desired_prop_id = 0; + SectionId typea_undesired_prop_id = 33; + SectionId typeb_another_prop_id = 0; + SectionId typeb_desired_prop_id = 1; + std::string desired_property = "desiredProperty"; + std::string undesired_property = "undesiredProperty"; + std::string another_property = "anotherProperty"; + + // Only the hits for 'desired_prop_id' should be returned. + std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id}; + std::vector<SectionId> expected_sections_typeb = {typeb_desired_prop_id}; + + // Put 11 docs of "TypeA" into the document store. + DocumentProto doc = + DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build())); + + // Put 10 docs of "TypeB" into the document store. + doc = DocumentBuilder(doc).SetUri("uri11").SetSchema("TypeB").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri14").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri15").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri16").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri17").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri18").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri19").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri20").Build())); + + { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_)); + + // Index numeric content for other properties to force our property into the + // wildcard storage. + std::string other_property_path = "otherProperty"; + for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) { + Index(integer_index.get(), + absl_ports::StrCat(other_property_path, std::to_string(i)), + /*document_id=*/0, /*section_id=*/i, /*keys=*/{i}); + } + + // Index numeric content for TypeA.desiredProperty + Index(integer_index.get(), desired_property, /*document_id=*/0, + typea_desired_prop_id, /*keys=*/{1}); + Index(integer_index.get(), desired_property, /*document_id=*/1, + typea_desired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), desired_property, /*document_id=*/2, + typea_desired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), desired_property, /*document_id=*/3, + typea_desired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), desired_property, /*document_id=*/4, + typea_desired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), desired_property, /*document_id=*/5, + typea_desired_prop_id, /*keys=*/{2}); + + // Index the same numeric content for TypeA.undesiredProperty + Index(integer_index.get(), undesired_property, /*document_id=*/6, + typea_undesired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), undesired_property, /*document_id=*/7, + typea_undesired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), undesired_property, /*document_id=*/8, + typea_undesired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), undesired_property, /*document_id=*/9, + typea_undesired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), undesired_property, /*document_id=*/10, + typea_undesired_prop_id, /*keys=*/{2}); + + // Index the same numeric content for TypeB.undesiredProperty + Index(integer_index.get(), another_property, /*document_id=*/11, + typeb_another_prop_id, /*keys=*/{3}); + Index(integer_index.get(), another_property, /*document_id=*/12, + typeb_another_prop_id, /*keys=*/{2}); + Index(integer_index.get(), another_property, /*document_id=*/13, + typeb_another_prop_id, /*keys=*/{0}); + Index(integer_index.get(), another_property, /*document_id=*/14, + typeb_another_prop_id, /*keys=*/{4}); + Index(integer_index.get(), another_property, /*document_id=*/15, + typeb_another_prop_id, /*keys=*/{2}); + + // Finally, index the same numeric content for TypeB.desiredProperty + Index(integer_index.get(), desired_property, /*document_id=*/16, + typeb_desired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), desired_property, /*document_id=*/17, + typeb_desired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), desired_property, /*document_id=*/18, + typeb_desired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), desired_property, /*document_id=*/19, + typeb_desired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), desired_property, /*document_id=*/20, + typeb_desired_prop_id, /*keys=*/{2}); + + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/3)); + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/5)); + // Delete doc id = 3, 5, compress and keep the rest. + ICING_ASSERT_OK_AND_ASSIGN(std::vector<DocumentId> document_id_old_to_new, + CompactDocStore()); + + DocumentId new_last_added_document_id = 18; + EXPECT_THAT(integer_index->Optimize(document_id_old_to_new, + new_last_added_document_id), + IsOk()); + EXPECT_THAT(integer_index->last_added_document_id(), + Eq(new_last_added_document_id)); + + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/2, /*key_upper=*/2), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea)))); + + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/1, /*key_upper=*/3), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/16 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea)))); + } + + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_)); + + EXPECT_THAT(integer_index->num_property_indices(), Eq(33)); + + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/2, /*key_upper=*/2), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea)))); + + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/1, /*key_upper=*/3), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/16 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea)))); +} + +// This test covers the situation where Optimize causes us to throw out some of +// the individual index storages (because they don't have any hits anymore). +// In this case, any properties that added content to the wildcard storage (even +// if all of their content was also deleted) should still be placed in the +// wilcard storage. +TEST_F(IntegerIndexTest, WildcardStorageAvailableIndicesAfterOptimize) { + // This test sets its schema assuming that max property storages == 32. + ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32)); + + PropertyConfigProto int_property_config = + PropertyConfigBuilder() + .SetName("otherProperty1") + .SetCardinality(CARDINALITY_REPEATED) + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .Build(); + // Create a schema with two types: + // - TypeA has 34 properties: + // 'desiredProperty', 'otherProperty'*, 'undesiredProperty' + // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty' + // 1. The 32 'otherProperty's will consume all of the individual storages + // 2. TypeA.desiredProperty and TypeB.anotherProperty will both be assigned + // SectionId = 0 for their respective types. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("TypeA") + .AddProperty(int_property_config) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty2")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty3")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty4")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty5")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty6")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty7")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty8")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty9")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty10")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty11")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty12")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty13")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty14")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty15")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty16")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty17")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty18")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty19")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty20")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty21")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty22")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty23")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty24")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty25")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty26")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty27")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty28")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty29")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty30")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty31")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty32")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("desiredProperty")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("undesiredProperty"))) + .Build(); + ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + + // Ids are assigned alphabetically, so the property ids are: + // TypeA.desiredProperty = 0 + // TypeA.otherPropertyN = N + // TypeA.undesiredProperty = 33 + // TypeB.anotherProperty = 0 + // TypeB.desiredProperty = 1 + SectionId typea_desired_prop_id = 0; + SectionId typea_undesired_prop_id = 33; + SectionId typea_other1_prop_id = 1; + std::string desired_property = "desiredProperty"; + std::string undesired_property = "undesiredProperty"; + std::string another_property = "anotherProperty"; + std::string other_property_1 = "otherProperty1"; + + // Only the hits for 'desired_prop_id' should be returned. + std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id}; + + // Put 11 docs of "TypeA" into the document store. + DocumentProto doc = + DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build())); + + { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_)); + + // Index numeric content for other properties to force our property into the + // wildcard storage. + std::string other_property_path = "otherProperty"; + for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) { + Index(integer_index.get(), + absl_ports::StrCat(other_property_path, std::to_string(i)), + /*document_id=*/0, /*section_id=*/i, /*keys=*/{i}); + } + + // Index numeric content for TypeA.desiredProperty + Index(integer_index.get(), desired_property, /*document_id=*/0, + typea_desired_prop_id, /*keys=*/{1}); + Index(integer_index.get(), desired_property, /*document_id=*/1, + typea_desired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), desired_property, /*document_id=*/2, + typea_desired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), desired_property, /*document_id=*/3, + typea_desired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), desired_property, /*document_id=*/4, + typea_desired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), desired_property, /*document_id=*/5, + typea_desired_prop_id, /*keys=*/{2}); + + // Index the same numeric content for TypeA.undesiredProperty + Index(integer_index.get(), undesired_property, /*document_id=*/6, + typea_undesired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), undesired_property, /*document_id=*/7, + typea_undesired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), undesired_property, /*document_id=*/8, + typea_undesired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), undesired_property, /*document_id=*/9, + typea_undesired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), undesired_property, /*document_id=*/10, + typea_undesired_prop_id, /*keys=*/{2}); + + // Delete all the docs that had hits in otherProperty* and + // undesiredProperty. + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/0)); + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/6)); + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/7)); + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/8)); + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/9)); + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/10)); + // Delete doc id = 0, 6, 7, 8, 9, 10. Compress and keep the rest. + ICING_ASSERT_OK_AND_ASSIGN(std::vector<DocumentId> document_id_old_to_new, + CompactDocStore()); + + DocumentId new_last_added_document_id = 5 - 1; + EXPECT_THAT(integer_index->Optimize(document_id_old_to_new, + new_last_added_document_id), + IsOk()); + EXPECT_THAT(integer_index->last_added_document_id(), + Eq(new_last_added_document_id)); + + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/2, /*key_upper=*/2), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/5 - 1, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/2 - 1, expected_sections_typea)))); + + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/1, /*key_upper=*/3), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/5 - 1, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/2 - 1, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/1 - 1, expected_sections_typea)))); + } + + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_)); + + EXPECT_THAT(integer_index->num_property_indices(), Eq(1)); + + // Add a new doc (docid==5) and a hit for desiredProperty. This should still + // be placed into the wildcard integer storage. + doc = DocumentBuilder().SetKey("ns1", "uri11").SetSchema("TypeA").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + Index(integer_index.get(), desired_property, /*document_id=*/5, + typea_desired_prop_id, /*keys=*/{12}); + EXPECT_THAT(integer_index->num_property_indices(), Eq(1)); + + EXPECT_THAT(Query(integer_index.get(), desired_property, + /*key_lower=*/12, /*key_upper=*/12), + IsOkAndHolds(ElementsAre(EqualsDocHitInfo( + /*document_id=*/5, expected_sections_typea)))); + + // Add a new doc (docid==6) and a hit for undesiredProperty. This should still + // be placed into the wildcard integer storage. + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build())); + Index(integer_index.get(), undesired_property, /*document_id=*/6, + typea_undesired_prop_id, /*keys=*/{3}); + EXPECT_THAT(integer_index->num_property_indices(), Eq(1)); + + expected_sections_typea = {typea_undesired_prop_id}; + EXPECT_THAT(Query(integer_index.get(), undesired_property, + /*key_lower=*/3, /*key_upper=*/3), + IsOkAndHolds(ElementsAre(EqualsDocHitInfo( + /*document_id=*/6, expected_sections_typea)))); + + // Add a new doc (docid==7) and a hit for otherProperty1. This should be given + // its own individual storage. + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build())); + Index(integer_index.get(), other_property_1, /*document_id=*/7, + typea_other1_prop_id, /*keys=*/{3}); + EXPECT_THAT(integer_index->num_property_indices(), Eq(2)); + + expected_sections_typea = {typea_other1_prop_id}; + EXPECT_THAT(Query(integer_index.get(), other_property_1, + /*key_lower=*/3, /*key_upper=*/3), + IsOkAndHolds(ElementsAre(EqualsDocHitInfo( + /*document_id=*/7, expected_sections_typea)))); +} + } // namespace } // namespace lib diff --git a/icing/index/numeric/numeric-index.h b/icing/index/numeric/numeric-index.h index 347260a..28640ca 100644 --- a/icing/index/numeric/numeric-index.h +++ b/icing/index/numeric/numeric-index.h @@ -23,8 +23,10 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/file/persistent-storage.h" #include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/schema/schema-store.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/store/document-store.h" namespace icing { namespace lib { @@ -126,8 +128,9 @@ class NumericIndex : public PersistentStorage { // - INVALID_ARGUMENT_ERROR if key_lower > key_upper // - Any other errors, depending on the actual implementation virtual libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> - GetIterator(std::string_view property_path, T key_lower, - T key_upper) const = 0; + GetIterator(std::string_view property_path, T key_lower, T key_upper, + const DocumentStore& document_store, + const SchemaStore& schema_store) const = 0; // Reduces internal file sizes by reclaiming space and ids of deleted // documents. Numeric index will convert all data (hits) to the new document @@ -162,6 +165,10 @@ class NumericIndex : public PersistentStorage { // last_added_document_id() or last_added_document_id() is invalid. virtual void set_last_added_document_id(DocumentId document_id) = 0; + // The number of individual indices that the NumericIndex has created to + // search over all indexed properties thus far. + virtual int num_property_indices() const = 0; + protected: explicit NumericIndex(const Filesystem& filesystem, std::string&& working_path, diff --git a/icing/index/numeric/posting-list-integer-index-accessor.cc b/icing/index/numeric/posting-list-integer-index-accessor.cc index 220b240..af2aea4 100644 --- a/icing/index/numeric/posting-list-integer-index-accessor.cc +++ b/icing/index/numeric/posting-list-integer-index-accessor.cc @@ -64,6 +64,58 @@ PostingListIntegerIndexAccessor::CreateFromExisting( // Returns the next batch of integer index data for the provided posting list. libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> PostingListIntegerIndexAccessor::GetNextDataBatch() { + return GetNextDataBatchImpl(/*free_posting_list=*/false); +} + +libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> +PostingListIntegerIndexAccessor::GetAllDataAndFree() { + if (preexisting_posting_list_ == nullptr) { + return absl_ports::FailedPreconditionError( + "Cannot retrieve data from a PostingListIntegerIndexAccessor that " + "was not created from a preexisting posting list."); + } + + std::vector<IntegerIndexData> all_data; + while (true) { + ICING_ASSIGN_OR_RETURN(std::vector<IntegerIndexData> batch, + GetNextDataBatchImpl(/*free_posting_list=*/true)); + if (batch.empty()) { + break; + } + std::move(batch.begin(), batch.end(), std::back_inserter(all_data)); + } + + return all_data; +} + +libtextclassifier3::Status PostingListIntegerIndexAccessor::PrependData( + const IntegerIndexData& data) { + PostingListUsed& active_pl = (preexisting_posting_list_ != nullptr) + ? preexisting_posting_list_->posting_list + : in_memory_posting_list_; + libtextclassifier3::Status status = + serializer_->PrependData(&active_pl, data); + if (!absl_ports::IsResourceExhausted(status)) { + return status; + } + // There is no more room to add data to this current posting list! Therefore, + // we need to either move those data to a larger posting list or flush this + // posting list and create another max-sized posting list in the chain. + if (preexisting_posting_list_ != nullptr) { + ICING_RETURN_IF_ERROR(FlushPreexistingPostingList()); + } else { + ICING_RETURN_IF_ERROR(FlushInMemoryPostingList()); + } + + // Re-add data. Should always fit since we just cleared + // in_memory_posting_list_. It's fine to explicitly reference + // in_memory_posting_list_ here because there's no way of reaching this line + // while preexisting_posting_list_ is still in use. + return serializer_->PrependData(&in_memory_posting_list_, data); +} + +libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> +PostingListIntegerIndexAccessor::GetNextDataBatchImpl(bool free_posting_list) { if (preexisting_posting_list_ == nullptr) { if (has_reached_posting_list_chain_end_) { return std::vector<IntegerIndexData>(); @@ -85,6 +137,11 @@ PostingListIntegerIndexAccessor::GetNextDataBatch() { next_block_index = preexisting_posting_list_->next_block_index; } + if (free_posting_list) { + ICING_RETURN_IF_ERROR( + storage_->FreePostingList(std::move(*preexisting_posting_list_))); + } + if (next_block_index != kInvalidBlockIndex) { // Since we only have to deal with next block for max-sized posting list // block, max_num_posting_lists is 1 and posting_list_index_bits is @@ -103,31 +160,5 @@ PostingListIntegerIndexAccessor::GetNextDataBatch() { return batch; } -libtextclassifier3::Status PostingListIntegerIndexAccessor::PrependData( - const IntegerIndexData& data) { - PostingListUsed& active_pl = (preexisting_posting_list_ != nullptr) - ? preexisting_posting_list_->posting_list - : in_memory_posting_list_; - libtextclassifier3::Status status = - serializer_->PrependData(&active_pl, data); - if (!absl_ports::IsResourceExhausted(status)) { - return status; - } - // There is no more room to add data to this current posting list! Therefore, - // we need to either move those data to a larger posting list or flush this - // posting list and create another max-sized posting list in the chain. - if (preexisting_posting_list_ != nullptr) { - ICING_RETURN_IF_ERROR(FlushPreexistingPostingList()); - } else { - ICING_RETURN_IF_ERROR(FlushInMemoryPostingList()); - } - - // Re-add data. Should always fit since we just cleared - // in_memory_posting_list_. It's fine to explicitly reference - // in_memory_posting_list_ here because there's no way of reaching this line - // while preexisting_posting_list_ is still in use. - return serializer_->PrependData(&in_memory_posting_list_, data); -} - } // namespace lib } // namespace icing diff --git a/icing/index/numeric/posting-list-integer-index-accessor.h b/icing/index/numeric/posting-list-integer-index-accessor.h index 4c1eced..f0d3d25 100644 --- a/icing/index/numeric/posting-list-integer-index-accessor.h +++ b/icing/index/numeric/posting-list-integer-index-accessor.h @@ -50,7 +50,7 @@ class PostingListIntegerIndexAccessor : public PostingListAccessor { Create(FlashIndexStorage* storage, PostingListIntegerIndexSerializer* serializer); - // Create a PostingListIntegerIndexAccessor with an existing posting list + // Creates a PostingListIntegerIndexAccessor with an existing posting list // identified by existing_posting_list_id. // // RETURNS: @@ -64,17 +64,30 @@ class PostingListIntegerIndexAccessor : public PostingListAccessor { PostingListSerializer* GetSerializer() override { return serializer_; } - // Retrieve the next batch of data in the posting list chain + // Retrieves the next batch of data in the posting list chain. // // RETURNS: // - On success, a vector of integer index data in the posting list chain - // - INTERNAL if called on an instance that was created via Create, if - // unable to read the next posting list in the chain or if the posting - // list has been corrupted somehow. + // - FAILED_PRECONDITION_ERROR if called on an instance that was created via + // Create. + // - INTERNAL_ERROR if unable to read the next posting list in the chain or + // if the posting list has been corrupted somehow. libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> GetNextDataBatch(); - // Prepend one data. This may result in flushing the posting list to disk (if + // Retrieves all data from the posting list chain and frees all posting + // list(s). + // + // RETURNS: + // - On success, a vector of integer index data in the posting list chain + // - FAILED_PRECONDITION_ERROR if called on an instance that was created via + // Create. + // - INTERNAL_ERROR if unable to read the next posting list in the chain or + // if the posting list has been corrupted somehow. + libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> + GetAllDataAndFree(); + + // Prepends one data. This may result in flushing the posting list to disk (if // the PostingListIntegerIndexAccessor holds a max-sized posting list that // is full) or freeing a pre-existing posting list if it is too small to fit // all data necessary. @@ -87,7 +100,15 @@ class PostingListIntegerIndexAccessor : public PostingListAccessor { // posting list. libtextclassifier3::Status PrependData(const IntegerIndexData& data); - // TODO(b/259743562): [Optimization 1] add GetAndClear, IsFull for split + bool WantsSplit() const { + const PostingListUsed* current_pl = + preexisting_posting_list_ != nullptr + ? &preexisting_posting_list_->posting_list + : &in_memory_posting_list_; + // Only max-sized PLs get split. Smaller PLs just get copied to larger PLs. + return current_pl->size_in_bytes() == storage_->max_posting_list_bytes() && + serializer_->IsFull(current_pl); + } private: explicit PostingListIntegerIndexAccessor( @@ -96,6 +117,20 @@ class PostingListIntegerIndexAccessor : public PostingListAccessor { : PostingListAccessor(storage, std::move(in_memory_posting_list)), serializer_(serializer) {} + // Retrieves the next batch of data in the posting list chain. + // + // - free_posting_list: a boolean flag indicating whether freeing all posting + // lists after retrieving batch data. + // + // RETURNS: + // - On success, a vector of integer index data in the posting list chain + // - FAILED_PRECONDITION_ERROR if called on an instance that was created via + // Create. + // - INTERNAL_ERROR if unable to read the next posting list in the chain or + // if the posting list has been corrupted somehow. + libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> + GetNextDataBatchImpl(bool free_posting_list); + PostingListIntegerIndexSerializer* serializer_; // Does not own. }; diff --git a/icing/index/numeric/posting-list-integer-index-accessor_test.cc b/icing/index/numeric/posting-list-integer-index-accessor_test.cc index 48221b9..f655fea 100644 --- a/icing/index/numeric/posting-list-integer-index-accessor_test.cc +++ b/icing/index/numeric/posting-list-integer-index-accessor_test.cc @@ -25,6 +25,7 @@ #include "gtest/gtest.h" #include "icing/file/filesystem.h" #include "icing/file/posting_list/flash-index-storage.h" +#include "icing/file/posting_list/posting-list-common.h" #include "icing/file/posting_list/posting-list-identifier.h" #include "icing/index/numeric/integer-index-data.h" #include "icing/index/numeric/posting-list-integer-index-serializer.h" @@ -42,6 +43,7 @@ using ::testing::ElementsAre; using ::testing::ElementsAreArray; using ::testing::Eq; using ::testing::Lt; +using ::testing::Ne; using ::testing::SizeIs; class PostingListIntegerIndexAccessorTest : public ::testing::Test { @@ -402,6 +404,131 @@ TEST_F(PostingListIntegerIndexAccessorTest, EXPECT_THAT(result2.status, IsOk()); } +TEST_F(PostingListIntegerIndexAccessorTest, GetAllDataAndFree) { + IntegerIndexData data1(/*section_id=*/3, /*document_id=*/1, /*key=*/123); + IntegerIndexData data2(/*section_id=*/3, /*document_id=*/2, /*key=*/456); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor1, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); + // Add 2 data. + ICING_ASSERT_OK(pl_accessor1->PrependData(data1)); + ICING_ASSERT_OK(pl_accessor1->PrependData(data2)); + PostingListAccessor::FinalizeResult result1 = + std::move(*pl_accessor1).Finalize(); + ICING_ASSERT_OK(result1.status); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor2, + PostingListIntegerIndexAccessor::CreateFromExisting( + flash_index_storage_.get(), serializer_.get(), result1.id)); + EXPECT_THAT(pl_accessor2->GetAllDataAndFree(), + IsOkAndHolds(ElementsAre(data2, data1))); + + // Allocate a new posting list with same size again. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor3, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); + // Add 2 data. + ICING_ASSERT_OK(pl_accessor3->PrependData(data1)); + ICING_ASSERT_OK(pl_accessor3->PrependData(data2)); + PostingListAccessor::FinalizeResult result3 = + std::move(*pl_accessor3).Finalize(); + ICING_ASSERT_OK(result3.status); + // We should get the same id if the previous one has been freed correctly by + // GetAllDataAndFree. + EXPECT_THAT(result3.id, Eq(result1.id)); +} + +TEST_F(PostingListIntegerIndexAccessorTest, GetAllDataAndFreePostingListChain) { + uint32_t block_size = FlashIndexStorage::SelectBlockSize(); + uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( + block_size, serializer_->GetDataTypeBytes()); + uint32_t max_num_data_single_posting_list = + max_posting_list_bytes / serializer_->GetDataTypeBytes(); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor1, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); + + // Prepend max_num_data_single_posting_list + 1 data. + std::vector<IntegerIndexData> data_vec; + for (uint32_t i = 0; i < max_num_data_single_posting_list + 1; ++i) { + IntegerIndexData data(/*section_id=*/3, static_cast<DocumentId>(i), + /*key=*/i); + ICING_ASSERT_OK(pl_accessor1->PrependData(data)); + data_vec.push_back(data); + } + + // This will cause: + // - Allocate the first max-sized posting list at block index = 1, storing + // max_num_data_single_posting_list data. + // - Allocate the second max-sized posting list at block index = 2, storing 1 + // data. Also its next_block_index is 1. + // - IOW, we will get 2 -> 1 and result1.id points to 2. + PostingListAccessor::FinalizeResult result1 = + std::move(*pl_accessor1).Finalize(); + ICING_ASSERT_OK(result1.status); + + uint32_t first_pl_block_index = kInvalidBlockIndex; + { + // result1.id points at the second (max-sized) PL, and next_block_index of + // the second PL points to the first PL's block. Fetch the first PL's block + // index manually. + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder pl_holder, + flash_index_storage_->GetPostingList(result1.id)); + first_pl_block_index = pl_holder.next_block_index; + } + ASSERT_THAT(first_pl_block_index, Ne(kInvalidBlockIndex)); + + // Call GetAllDataAndFree. This will free block 2 and block 1. + // Free block list: 1 -> 2 (since free block list is LIFO). + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor2, + PostingListIntegerIndexAccessor::CreateFromExisting( + flash_index_storage_.get(), serializer_.get(), result1.id)); + EXPECT_THAT( + pl_accessor2->GetAllDataAndFree(), + IsOkAndHolds(ElementsAreArray(data_vec.rbegin(), data_vec.rend()))); + pl_accessor2.reset(); + + // Allocate a new posting list with same size again. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor3, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); + // Add same set of data. + for (uint32_t i = 0; i < max_num_data_single_posting_list + 1; ++i) { + ICING_ASSERT_OK(pl_accessor3->PrependData(data_vec[i])); + } + + // This will cause: + // - Allocate the first max-sized posting list from the free block list, which + // is block index = 1, storing max_num_data_single_posting_list data. + // - Allocate the second max-sized posting list from the next block in free + // block list, which is block index = 2, storing 1 data. Also its + // next_block_index should be 1. + PostingListAccessor::FinalizeResult result3 = + std::move(*pl_accessor3).Finalize(); + ICING_ASSERT_OK(result3.status); + // We should get the same id if the previous one has been freed correctly by + // GetAllDataAndFree. + EXPECT_THAT(result3.id, Eq(result1.id)); + // Also the first PL should be the same if it has been freed correctly by + // GetAllDataAndFree. Since it is a max-sized posting list, we just need to + // verify the block index. + { + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder pl_holder, + flash_index_storage_->GetPostingList(result3.id)); + EXPECT_THAT(pl_holder.next_block_index, Eq(first_pl_block_index)); + } +} + } // namespace } // namespace lib diff --git a/icing/index/numeric/posting-list-integer-index-serializer.h b/icing/index/numeric/posting-list-integer-index-serializer.h index 9cfdb7a..ea2f2da 100644 --- a/icing/index/numeric/posting-list-integer-index-serializer.h +++ b/icing/index/numeric/posting-list-integer-index-serializer.h @@ -111,6 +111,12 @@ class PostingListIntegerIndexSerializer : public PostingListSerializer { libtextclassifier3::Status PopFrontData(PostingListUsed* posting_list_used, uint32_t num_data) const; + // Helper function to determine if posting list is full. + bool IsFull(const PostingListUsed* posting_list_used) const { + return GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() && + GetSpecialData(posting_list_used, /*index=*/1).data().is_valid(); + } + private: // Posting list layout formats: // @@ -228,11 +234,6 @@ class PostingListIntegerIndexSerializer : public PostingListSerializer { // +-----------------+-----------------+---+--------+-----+--------+--------+ // Helpers to determine what state the posting list is in. - bool IsFull(const PostingListUsed* posting_list_used) const { - return GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() && - GetSpecialData(posting_list_used, /*index=*/1).data().is_valid(); - } - bool IsAlmostFull(const PostingListUsed* posting_list_used) const { return !GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() && GetSpecialData(posting_list_used, /*index=*/1).data().is_valid(); |