aboutsummaryrefslogtreecommitdiff
path: root/icing/index
diff options
context:
space:
mode:
Diffstat (limited to 'icing/index')
-rw-r--r--icing/index/index-processor_test.cc52
-rw-r--r--icing/index/iterator/doc-hit-info-iterator-not.cc4
-rw-r--r--icing/index/iterator/doc-hit-info-iterator-not_test.cc2
-rw-r--r--icing/index/numeric/doc-hit-info-iterator-numeric.h4
-rw-r--r--icing/index/numeric/dummy-numeric-index.h9
-rw-r--r--icing/index/numeric/integer-index-bucket-util.cc205
-rw-r--r--icing/index/numeric/integer-index-bucket-util.h81
-rw-r--r--icing/index/numeric/integer-index-bucket-util_test.cc1112
-rw-r--r--icing/index/numeric/integer-index-storage.cc174
-rw-r--r--icing/index/numeric/integer-index-storage.h27
-rw-r--r--icing/index/numeric/integer-index-storage_benchmark.cc17
-rw-r--r--icing/index/numeric/integer-index-storage_test.cc152
-rw-r--r--icing/index/numeric/integer-index.cc284
-rw-r--r--icing/index/numeric/integer-index.h87
-rw-r--r--icing/index/numeric/integer-index_test.cc1313
-rw-r--r--icing/index/numeric/numeric-index.h11
-rw-r--r--icing/index/numeric/posting-list-integer-index-accessor.cc83
-rw-r--r--icing/index/numeric/posting-list-integer-index-accessor.h49
-rw-r--r--icing/index/numeric/posting-list-integer-index-accessor_test.cc127
-rw-r--r--icing/index/numeric/posting-list-integer-index-serializer.h11
20 files changed, 3548 insertions, 256 deletions
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index 3a9b4ee..47baabe 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -40,6 +40,8 @@
#include "icing/index/numeric/numeric-index.h"
#include "icing/index/string-section-indexing-handler.h"
#include "icing/index/term-property-id.h"
+#include "icing/join/qualified-id-joinable-property-indexing-handler.h"
+#include "icing/join/qualified-id-type-joinable-index.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mock-filesystem.h"
#include "icing/portable/platform.h"
@@ -51,6 +53,7 @@
#include "icing/schema/schema-util.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
#include "icing/testing/icu-data-file-helper.h"
@@ -160,7 +163,9 @@ class IndexProcessorTest : public Test {
index_dir_ = base_dir_ + "/index";
integer_index_dir_ = base_dir_ + "/integer_index";
+ qualified_id_join_index_dir_ = base_dir_ + "/qualified_id_join_index";
schema_store_dir_ = base_dir_ + "/schema_store";
+ doc_store_dir_ = base_dir_ + "/doc_store";
Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
ICING_ASSERT_OK_AND_ASSIGN(
@@ -169,6 +174,10 @@ class IndexProcessorTest : public Test {
ICING_ASSERT_OK_AND_ASSIGN(
integer_index_, IntegerIndex::Create(filesystem_, integer_index_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_,
+ QualifiedIdTypeJoinableIndex::Create(
+ filesystem_, qualified_id_join_index_dir_));
+
language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
ICING_ASSERT_OK_AND_ASSIGN(
lang_segmenter_,
@@ -260,6 +269,13 @@ class IndexProcessorTest : public Test {
.Build();
ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ doc_store_ = std::move(create_result.document_store);
+
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<StringSectionIndexingHandler>
string_section_indexing_handler,
@@ -269,9 +285,16 @@ class IndexProcessorTest : public Test {
integer_section_indexing_handler,
IntegerSectionIndexingHandler::Create(
&fake_clock_, integer_index_.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler>
+ qualified_id_joinable_property_indexing_handler,
+ QualifiedIdJoinablePropertyIndexingHandler::Create(
+ &fake_clock_, qualified_id_join_index_.get()));
std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
handlers.push_back(std::move(string_section_indexing_handler));
handlers.push_back(std::move(integer_section_indexing_handler));
+ handlers.push_back(
+ std::move(qualified_id_joinable_property_indexing_handler));
index_processor_ =
std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);
@@ -281,9 +304,11 @@ class IndexProcessorTest : public Test {
void TearDown() override {
index_processor_.reset();
+ doc_store_.reset();
schema_store_.reset();
normalizer_.reset();
lang_segmenter_.reset();
+ qualified_id_join_index_.reset();
integer_index_.reset();
index_.reset();
@@ -298,13 +323,17 @@ class IndexProcessorTest : public Test {
std::string base_dir_;
std::string index_dir_;
std::string integer_index_dir_;
+ std::string qualified_id_join_index_dir_;
std::string schema_store_dir_;
+ std::string doc_store_dir_;
std::unique_ptr<Index> index_;
std::unique_ptr<NumericIndex<int64_t>> integer_index_;
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index_;
std::unique_ptr<LanguageSegmenter> lang_segmenter_;
std::unique_ptr<Normalizer> normalizer_;
std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> doc_store_;
std::unique_ptr<IndexProcessor> index_processor_;
};
@@ -788,9 +817,16 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) {
integer_section_indexing_handler,
IntegerSectionIndexingHandler::Create(
&fake_clock_, integer_index_.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler>
+ qualified_id_joinable_property_indexing_handler,
+ QualifiedIdJoinablePropertyIndexingHandler::Create(
+ &fake_clock_, qualified_id_join_index_.get()));
std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
handlers.push_back(std::move(string_section_indexing_handler));
handlers.push_back(std::move(integer_section_indexing_handler));
+ handlers.push_back(
+ std::move(qualified_id_joinable_property_indexing_handler));
IndexProcessor index_processor(std::move(handlers), &fake_clock_,
/*recovery_mode=*/true);
@@ -1506,10 +1542,10 @@ TEST_F(IndexProcessorTest, IndexableIntegerProperty) {
EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
IsOk());
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocHitInfoIterator> itr,
- integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/1,
- /*key_upper=*/5));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ integer_index_->GetIterator(
+ kIndexableIntegerProperty, /*key_lower=*/1,
+ /*key_upper=*/5, *doc_store_, *schema_store_));
EXPECT_THAT(
GetHits(std::move(itr)),
@@ -1535,10 +1571,10 @@ TEST_F(IndexProcessorTest, IndexableIntegerPropertyNoMatch) {
EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
IsOk());
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocHitInfoIterator> itr,
- integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/-1,
- /*key_upper=*/0));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ integer_index_->GetIterator(
+ kIndexableIntegerProperty, /*key_lower=*/-1,
+ /*key_upper=*/0, *doc_store_, *schema_store_));
EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
}
diff --git a/icing/index/iterator/doc-hit-info-iterator-not.cc b/icing/index/iterator/doc-hit-info-iterator-not.cc
index 1818f08..38b1ded 100644
--- a/icing/index/iterator/doc-hit-info-iterator-not.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-not.cc
@@ -63,8 +63,8 @@ libtextclassifier3::Status DocHitInfoIteratorNot::Advance() {
libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
DocHitInfoIteratorNot::TrimRightMostNode() && {
// Don't generate suggestion if the last operator is NOT.
- return absl_ports::UnimplementedError(
- "Cannot trim right most node in NOT operator.");
+ return absl_ports::InvalidArgumentError(
+ "Cannot generate suggestion if the last term is NOT operator.");
}
int32_t DocHitInfoIteratorNot::GetNumBlocksInspected() const {
diff --git a/icing/index/iterator/doc-hit-info-iterator-not_test.cc b/icing/index/iterator/doc-hit-info-iterator-not_test.cc
index 54d6c36..5a8ce2c 100644
--- a/icing/index/iterator/doc-hit-info-iterator-not_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-not_test.cc
@@ -163,7 +163,7 @@ TEST(DocHitInfoIteratorNotTest, TrimNotIterator) {
DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator),
/*document_id_limit=*/5);
EXPECT_THAT(std::move(not_iterator).TrimRightMostNode(),
- StatusIs(libtextclassifier3::StatusCode::UNIMPLEMENTED));
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
} // namespace
diff --git a/icing/index/numeric/doc-hit-info-iterator-numeric.h b/icing/index/numeric/doc-hit-info-iterator-numeric.h
index bf990d1..fc66a1d 100644
--- a/icing/index/numeric/doc-hit-info-iterator-numeric.h
+++ b/icing/index/numeric/doc-hit-info-iterator-numeric.h
@@ -49,8 +49,8 @@ class DocHitInfoIteratorNumeric : public DocHitInfoIterator {
}
libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override {
- return absl_ports::UnimplementedError(
- "Cannot trim right most node in numeric operator.");
+ return absl_ports::InvalidArgumentError(
+ "Cannot generate suggestion if the last term is numeric operator.");
}
int32_t GetNumBlocksInspected() const override { return 0; }
diff --git a/icing/index/numeric/dummy-numeric-index.h b/icing/index/numeric/dummy-numeric-index.h
index 164866c..7cfb102 100644
--- a/icing/index/numeric/dummy-numeric-index.h
+++ b/icing/index/numeric/dummy-numeric-index.h
@@ -70,7 +70,8 @@ class DummyNumericIndex : public NumericIndex<T> {
}
libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
- std::string_view property_path, T key_lower, T key_upper) const override;
+ std::string_view property_path, T key_lower, T key_upper,
+ const DocumentStore&, const SchemaStore&) const override;
libtextclassifier3::Status Optimize(
const std::vector<DocumentId>& document_id_old_to_new,
@@ -93,6 +94,8 @@ class DummyNumericIndex : public NumericIndex<T> {
}
}
+ int num_property_indices() const override { return storage_.size(); }
+
private:
class Editor : public NumericIndex<T>::Editor {
public:
@@ -176,7 +179,6 @@ class DummyNumericIndex : public NumericIndex<T> {
DocHitInfo doc_hit_info_;
};
- private:
explicit DummyNumericIndex(const Filesystem& filesystem,
std::string&& working_path)
: NumericIndex<T>(filesystem, std::move(working_path),
@@ -265,7 +267,8 @@ libtextclassifier3::Status DummyNumericIndex<T>::Iterator::Advance() {
template <typename T>
libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
DummyNumericIndex<T>::GetIterator(std::string_view property_path, T key_lower,
- T key_upper) const {
+ T key_upper, const DocumentStore&,
+ const SchemaStore&) const {
if (key_lower > key_upper) {
return absl_ports::InvalidArgumentError(
"key_lower should not be greater than key_upper");
diff --git a/icing/index/numeric/integer-index-bucket-util.cc b/icing/index/numeric/integer-index-bucket-util.cc
new file mode 100644
index 0000000..a05baab
--- /dev/null
+++ b/icing/index/numeric/integer-index-bucket-util.cc
@@ -0,0 +1,205 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/numeric/integer-index-bucket-util.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
+#include "icing/index/numeric/integer-index-data.h"
+
+namespace icing {
+namespace lib {
+
+namespace integer_index_bucket_util {
+
+namespace {
+
+// Helper function to determine if data slice [start, end) forms a "full
+// single-range bucket".
+//
+// Full single-range bucket: keys of all data are identical and # of them exceed
+// num_data_threshold.
+//
+// REQUIRES: data slice [start, end) are sorted by key.
+inline bool WouldBeFullSingleRangeBucket(
+ const std::vector<IntegerIndexData>::iterator& start,
+ const std::vector<IntegerIndexData>::iterator& end,
+ int32_t num_data_threshold) {
+ return std::distance(start, end) > num_data_threshold &&
+ start->key() == (end - 1)->key();
+}
+
+// Helper function to determine if a bucket is full single-range.
+//
+// REQUIRES:
+// bucket.key_lower <= [bucket.start, bucket.end)->key() <= bucket.key_upper
+inline bool IsFullSingleRangeBucket(const DataRangeAndBucketInfo& bucket,
+ int32_t num_data_threshold) {
+ return bucket.key_lower == bucket.key_upper &&
+ WouldBeFullSingleRangeBucket(bucket.start, bucket.end,
+ num_data_threshold);
+}
+
+// Helper function to append new bucket(s) with corresponding data slice for
+// range [curr_key_lower, last_key] where last_key = (it_end - 1)->key().
+//
+// Also it handles an edge case:
+// If data slice [it_start, it_end) forms a "full single-range bucket" (see
+// WouldBeFullSingleRangeBucket for definition), then we have to put them into a
+// single range bucket [last_key, last_key] instead of [curr_key_lower,
+// last_key]. Also we have to deal with range [curr_key_lower, last_key - 1]:
+// - If the previous bucket exists and it is not a "full single-range bucket",
+// then merge [curr_key_lower, last_key - 1] into the previous bucket, i.e.
+// change the previous bucket's key_upper to (last_key - 1). Then we will end
+// up having:
+// - [prev_bucket.key_lower, last_key - 1]
+// - [last_key, last_key]
+// - Otherwise, we have to create [curr_key_lower, last_key - 1] with
+// empty data. Then we will end up having (Note: prev_bucket.key_upper ==
+// curr_key_lower - 1):
+// - [prev_bucket.key_lower, curr_key_lower - 1]
+// - [curr_key_lower, last_key - 1]
+// - [last_key, last_key]
+// This will avoid split bucket being called too frequently.
+// For example, original_key_lower = 0, original_key_upper = 50. If we have
+// (num_data_threshold + 1) data with key = 20 and another data with key = 40:
+// - Without this part, we will split them into [[0, 20], [21, 50]]. Then when
+// adding data with key = 10 next round, we will invoke split again and split
+// [0, 20] to [[0, 10], [11, 20]].
+// - With this part, we will split them into [[0, 19], [20, 20], [21, 50]],
+// which will avoid splitting in the next round for key = 20.
+//
+// REQUIRES: it_start < it_end
+void AppendNewBuckets(const std::vector<IntegerIndexData>::iterator& it_start,
+ const std::vector<IntegerIndexData>::iterator& it_end,
+ int64_t curr_key_lower, int32_t num_data_threshold,
+ std::vector<DataRangeAndBucketInfo>& results) {
+ int64_t last_key = (it_end - 1)->key();
+ if (curr_key_lower < last_key &&
+ WouldBeFullSingleRangeBucket(it_start, it_end, num_data_threshold)) {
+ if (!results.empty() &&
+ !IsFullSingleRangeBucket(results.back(), num_data_threshold)) {
+ // Previous bucket is not full single-range, so merge it to now hold the
+ // range [prev_bucket.key_lower, last_key - 1].
+ results.back().key_upper = last_key - 1;
+ } else {
+ // There is either no previous bucket or the previous bucket is full
+ // single-range. So add an empty bucket for the range [curr_key_lower,
+ // last_key - 1].
+ results.push_back(DataRangeAndBucketInfo(it_start, it_start,
+ curr_key_lower, last_key - 1));
+ }
+ curr_key_lower = last_key;
+ }
+ results.push_back(
+ DataRangeAndBucketInfo(it_start, it_end, curr_key_lower, last_key));
+}
+
+} // namespace
+
+std::vector<DataRangeAndBucketInfo> Split(std::vector<IntegerIndexData>& data,
+ int64_t original_key_lower,
+ int64_t original_key_upper,
+ int32_t num_data_threshold) {
+ // Early return if there is no need to split.
+ if (data.size() <= num_data_threshold) {
+ return {DataRangeAndBucketInfo(data.begin(), data.end(), original_key_lower,
+ original_key_upper)};
+ }
+
+ // Sort data by key.
+ std::sort(
+ data.begin(), data.end(),
+ [](const IntegerIndexData& lhs, const IntegerIndexData& rhs) -> bool {
+ return lhs.key() < rhs.key();
+ });
+
+ std::vector<DataRangeAndBucketInfo> results;
+ int64_t curr_key_lower = original_key_lower;
+ // Sliding window [it_start, it_end) to separate data into different buckets.
+ auto it_start = data.begin();
+ auto it_end = data.begin();
+ while (it_end != data.end()) {
+ // Attempt to extend it_end by 1, but we have to include all data with the
+ // same key since they cannot be separated into different buckets. Also use
+ // extend_it_end to avoid modifying it_end directly. For some edge cases,
+ // the extension in a single round is extremely large (i.e. a lot of data
+ // have the same key), and we want to separate them. For example:
+ // - key = 0: 5 data
+ // - key = 1: num_data_threshold - 1 data
+ // In the second round, # of data in the sliding window will exceed the
+ // threshold. We want to separate all data with key = 0 into a single bucket
+ // instead of putting key = 0 and key = 1 together. Therefore, using
+ // extend_it_end allow us to preserve it_end of the previous round and be
+ // able to deal with this case.
+ auto extend_it_end = it_end + 1;
+ while (extend_it_end != data.end() &&
+ it_end->key() == extend_it_end->key()) {
+ ++extend_it_end;
+ }
+
+ if (std::distance(it_start, extend_it_end) > num_data_threshold &&
+ it_start != it_end) {
+ // Split data between [it_start, it_end) into range [curr_key_lower,
+ // (it_end - 1)->key()].
+ AppendNewBuckets(it_start, it_end, curr_key_lower, num_data_threshold,
+ results);
+
+ // it_end at this moment won't be data.end(), so the last element of the
+ // new bucket can't have key == INT64_MAX. Therefore, it is safe to set
+ // curr_key_lower as ((it_end - 1)->key() + 1).
+ curr_key_lower = (it_end - 1)->key() + 1;
+ it_start = it_end;
+ }
+ it_end = extend_it_end;
+ }
+
+ // Handle the final range [curr_key_lower, original_key_upper].
+ if (curr_key_lower <= original_key_upper) {
+ if (it_start != it_end) {
+ AppendNewBuckets(it_start, it_end, curr_key_lower, num_data_threshold,
+ results);
+
+ // AppendNewBuckets only handles range [curr_key_lower, (it_end -
+ // 1)->key()], so we have to handle range [(it_end - 1)->key() + 1,
+ // original_key_upper] if needed.
+ int64_t last_key = (it_end - 1)->key();
+ if (last_key != std::numeric_limits<int64_t>::max() &&
+ last_key + 1 <= original_key_upper) {
+ if (!results.empty() &&
+ !IsFullSingleRangeBucket(results.back(), num_data_threshold)) {
+ results.back().key_upper = original_key_upper;
+ } else {
+ results.push_back(DataRangeAndBucketInfo(
+ it_start, it_start, last_key + 1, original_key_upper));
+ }
+ }
+ } else {
+ results.push_back(DataRangeAndBucketInfo(it_start, it_end, curr_key_lower,
+ original_key_upper));
+ }
+ }
+
+ return results;
+}
+
+} // namespace integer_index_bucket_util
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/numeric/integer-index-bucket-util.h b/icing/index/numeric/integer-index-bucket-util.h
new file mode 100644
index 0000000..863bd01
--- /dev/null
+++ b/icing/index/numeric/integer-index-bucket-util.h
@@ -0,0 +1,81 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_
+#define ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_
+
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "icing/index/numeric/integer-index-data.h"
+
+namespace icing {
+namespace lib {
+
+namespace integer_index_bucket_util {
+
+// A wrapper struct that contains information of a bucket.
+// - The bucket contains data within the iterator [start, end).
+// - Bucket range is [key_lower, key_upper], and all data within [start, end)
+// should have keys in the bucket range.
+//
+// Note: the caller should make sure the lifecycle of data vector is longer than
+// instances of this wrapper struct.
+struct DataRangeAndBucketInfo {
+ std::vector<IntegerIndexData>::iterator start;
+ std::vector<IntegerIndexData>::iterator end;
+ int64_t key_lower;
+ int64_t key_upper;
+
+ explicit DataRangeAndBucketInfo(
+ std::vector<IntegerIndexData>::iterator start_in,
+ std::vector<IntegerIndexData>::iterator end_in, int64_t key_lower_in,
+ int64_t key_upper_in)
+ : start(std::move(start_in)),
+ end(std::move(end_in)),
+ key_lower(key_lower_in),
+ key_upper(key_upper_in) {}
+};
+
+// Helper function to split data (that are originally in a bucket with range
+// [original_key_lower, original_key_upper]) into different buckets according to
+// num_data_threshold.
+// - The input vector `data` will be sorted by key in ascending order (unless
+// there's no need to split in which case data is returned unmodified)
+// - Data with the same key will be in the same bucket even if # of them exceed
+// num_data_threshold.
+// - Range of all buckets will be disjoint, and the range union will be
+// [original_key_lower, original_key_upper].
+// - Data slice (i.e. [start, end)) can be empty.
+//
+// REQUIRES:
+// - original_key_lower <= original_key_upper
+// - num_data_threshold > 0
+// - Keys of all data are in range [original_key_lower, original_key_upper]
+//
+// Returns: a vector of DataRangeAndBucketInfo that contain all bucket info
+// after splitting. Also the returned vector should contain at least one
+// bucket, otherwise it is considered an error.
+std::vector<DataRangeAndBucketInfo> Split(std::vector<IntegerIndexData>& data,
+ int64_t original_key_lower,
+ int64_t original_key_upper,
+ int32_t num_data_threshold);
+
+} // namespace integer_index_bucket_util
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_
diff --git a/icing/index/numeric/integer-index-bucket-util_test.cc b/icing/index/numeric/integer-index-bucket-util_test.cc
new file mode 100644
index 0000000..82c593e
--- /dev/null
+++ b/icing/index/numeric/integer-index-bucket-util_test.cc
@@ -0,0 +1,1112 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/numeric/integer-index-bucket-util.h"
+
+#include <limits>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/index/numeric/integer-index-data.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+namespace integer_index_bucket_util {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::Ne;
+using ::testing::SizeIs;
+
+static constexpr DocumentId kDefaultDocumentId = 123;
+static constexpr SectionId kDefaultSectionId = 31;
+
+TEST(IntegerIndexBucketUtilTest, Split_numDataNotDivisibleByThreshold) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)};
+ int64_t key_lower = -10;
+ int64_t key_upper = 10;
+ int32_t num_data_threshold = 3;
+ ASSERT_THAT(data.size() % num_data_threshold, Ne(0));
+
+ // Keys = [-10, -3, -2, 0, 1, 2, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, key_lower, key_upper, num_data_threshold);
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = -10, key upper = -2, keys = [-10, -3, -2].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)));
+ // Bucket 1: key lower = -1, key upper = 2, keys = [0, 1, 2].
+ EXPECT_THAT(results[1].key_lower, Eq(-1));
+ EXPECT_THAT(results[1].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 2: key lower = 3, key upper = 10, keys = [10].
+ EXPECT_THAT(results[2].key_lower, Eq(3));
+ EXPECT_THAT(results[2].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest, Split_numDataDivisibleByThreshold) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)};
+ int64_t key_lower = -10;
+ int64_t key_upper = 10;
+ int32_t num_data_threshold = 3;
+ ASSERT_THAT(data.size() % num_data_threshold, Eq(0));
+
+ // Keys = [-10, -3, -2, 0, 2, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, key_lower, key_upper, num_data_threshold);
+ ASSERT_THAT(results, SizeIs(2));
+ // Bucket 0: key lower = -10, key upper = -2, keys = [-10, -3, -2].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)));
+ // Bucket 1: key lower = -1, key upper = 2, keys = [0, 2, 10].
+ EXPECT_THAT(results[1].key_lower, Eq(-1));
+ EXPECT_THAT(results[1].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest, Split_shouldIncludeOriginalKeyRange) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)};
+ int64_t key_lower = -1000;
+ int64_t key_upper = 1000;
+ int32_t num_data_threshold = 3;
+
+ // Keys = [-10, -3, -2, 0, 1, 2, 10].
+ // Split should include the original key_lower and key_upper even if there is
+ // no key at boundary.
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, key_lower, key_upper, num_data_threshold);
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = -1000, key upper = -2, keys = [-10, -3, -2].
+ EXPECT_THAT(results[0].key_lower, Eq(-1000));
+ EXPECT_THAT(results[0].key_upper, Eq(-2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)));
+ // Bucket 1: key lower = -1, key upper = 2, keys = [0, 1, 2].
+ EXPECT_THAT(results[1].key_lower, Eq(-1));
+ EXPECT_THAT(results[1].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 2: key lower = 3, key upper = 1000, keys = [10].
+ EXPECT_THAT(results[2].key_lower, Eq(3));
+ EXPECT_THAT(results[2].key_upper, Eq(1000));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest, Split_singleBucketWithoutSplitting) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)};
+ int64_t key_lower = -1000;
+ int64_t key_upper = 1000;
+ int32_t num_data_threshold = 100;
+
+ // Keys = [-10, -3, -2, 0, 1, 2, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, key_lower, key_upper, num_data_threshold);
+ ASSERT_THAT(results, SizeIs(1));
+ // Bucket 0: key lower = -1000, key upper = 1000, keys = [-10, -3, -2, 0, 1,
+ // 2, 10]. Since # of data <= threshold, data vector won't be sorted and thus
+ // [start, end) will have data with the original order.
+ EXPECT_THAT(results[0].key_lower, Eq(-1000));
+ EXPECT_THAT(results[0].key_upper, Eq(1000));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)));
+}
+
+TEST(IntegerIndexBucketUtilTest, Split_emptyData) {
+ std::vector<IntegerIndexData> empty_data;
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(empty_data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ ASSERT_THAT(results, SizeIs(1));
+ // Bucket 0: key lower = -10, key upper = 10, keys = [].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(10));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ IsEmpty());
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_firstBucket_keyEqualsKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-10, -10, -10, -10, -10, 0, 3, 5, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = -10 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be separated from key = 0, 3, ....
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = -10, key upper = -10, keys = [-10, -10, -10, -10,
+ // -10].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10)));
+ // Bucket 1: key lower = -9, key upper = 5, keys = [0, 3, 5].
+ EXPECT_THAT(results[1].key_lower, Eq(-9));
+ EXPECT_THAT(results[1].key_upper, Eq(5));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5)));
+ // Bucket 2: key lower = 6, key upper = 10, keys = [10].
+ EXPECT_THAT(results[2].key_lower, Eq(6));
+ EXPECT_THAT(results[2].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_firstBucket_keyGreaterThanKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-7, -7, -7, -7, -7, 0, 3, 5, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = -7 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be separated from key = 0, 3, ....
+ // - They should be in a single range bucket [-7, -7], and another bucket
+ // [-10, -8] with empty data should be created before it.
+ ASSERT_THAT(results, SizeIs(4));
+ // Bucket 0: key lower = -10, key upper = -8, keys = [].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-8));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ IsEmpty());
+ // Bucket 1: key lower = -7, key upper = -7, keys = [-7, -7, -7, -7, -7].
+ EXPECT_THAT(results[1].key_lower, Eq(-7));
+ EXPECT_THAT(results[1].key_upper, Eq(-7));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7)));
+ // Bucket 2: key lower = -6, key upper = 5, keys = [0, 3, 5].
+ EXPECT_THAT(results[2].key_lower, Eq(-6));
+ EXPECT_THAT(results[2].key_upper, Eq(5));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5)));
+ // Bucket 3: key lower = 6, key upper = 10, keys = [10].
+ EXPECT_THAT(results[3].key_lower, Eq(6));
+ EXPECT_THAT(results[3].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[3].start, results[3].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_midBucket_keyEqualsKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-10, -5, -4, -4, -4, -4, -4, 5, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = -4 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be separated from key = -10, -5, 5, 10.
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = -10, key upper = -5, keys = [-10, -5].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-5));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5)));
+ // Bucket 1: key lower = -4, key upper = -4, keys = [-4, -4, -4, -4, -4].
+ EXPECT_THAT(results[1].key_lower, Eq(-4));
+ EXPECT_THAT(results[1].key_upper, Eq(-4));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4)));
+ // Bucket 2: key lower = -3, key upper = 10, keys = [5, 10].
+ EXPECT_THAT(results[2].key_lower, Eq(-3));
+ EXPECT_THAT(results[2].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_midBucket_keyGreaterThanKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-10, -5, -1, -1, -1, -1, -1, 5, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = -1 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be separated from key = -10, -5, 5, 10.
+ // - They should be in a single range bucket [-1, -1], and range [-4, -2]
+ // should be merged into the previous bucket.
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = -10, key upper = -2, keys = [-10, -5].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5)));
+ // Bucket 1: key lower = -1, key upper = -1, keys = [-1, -1, -1, -1, -1].
+ EXPECT_THAT(results[1].key_lower, Eq(-1));
+ EXPECT_THAT(results[1].key_upper, Eq(-1));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1)));
+ // Bucket 2: key lower = 0, key upper = 10, keys = [5, 10].
+ EXPECT_THAT(results[2].key_lower, Eq(0));
+ EXPECT_THAT(results[2].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_lastBucket_keyEqualsKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3)};
+
+ // Keys = [-10, -3, 0, 2, 3, 3, 3, 3, 3].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = 3 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be separated from key = -10, -3, 0, 2.
+ // - They should be in a single range bucket [3, 3], and another bucket
+ // [4, 10] with empty data should be created after it.
+ ASSERT_THAT(results, SizeIs(4));
+ // Bucket 0: key lower = -10, key upper = 0, keys = [-10, -3, 0].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(0));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0)));
+ // Bucket 1: key lower = 1, key upper = 2, keys = [2].
+ EXPECT_THAT(results[1].key_lower, Eq(1));
+ EXPECT_THAT(results[1].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 2: key lower = 3, key upper = 10, keys = [3, 3, 3, 3, 3].
+ EXPECT_THAT(results[2].key_lower, Eq(3));
+ EXPECT_THAT(results[2].key_upper, Eq(3));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3)));
+ // Bucket 3: key lower = 4, key upper = 10, keys = [].
+ EXPECT_THAT(results[3].key_lower, Eq(4));
+ EXPECT_THAT(results[3].key_upper, Eq(10));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[3].start, results[3].end),
+ IsEmpty());
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_lastBucket_keyWithinKeyLowerAndUpper) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6)};
+
+ // Keys = [-10, -3, 0, 2, 6, 6, 6, 6, 6].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = 6 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be separated from key = -10, -3, 0, 2.
+ // - They should be in a single range bucket [6, 6]. Range [3, 5] should be
+ // merged into the previous bucket. and another bucket [7, 10] with empty
+ // data should be created after it.
+ ASSERT_THAT(results, SizeIs(4));
+ // Bucket 0: key lower = -10, key upper = 0, keys = [-10, -3, 0].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(0));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0)));
+ // Bucket 1: key lower = 1, key upper = 5, keys = [2].
+ EXPECT_THAT(results[1].key_lower, Eq(1));
+ EXPECT_THAT(results[1].key_upper, Eq(5));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 2: key lower = 6, key upper = 6, keys = [6, 6, 6, 6, 6].
+ EXPECT_THAT(results[2].key_lower, Eq(6));
+ EXPECT_THAT(results[2].key_upper, Eq(6));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6)));
+ // Bucket 3: key lower = 7, key upper = 10, keys = [].
+ EXPECT_THAT(results[3].key_lower, Eq(7));
+ EXPECT_THAT(results[3].key_upper, Eq(10));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[3].start, results[3].end),
+ IsEmpty());
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_lastBucket_keyEqualsKeyUpper) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-10, -3, 0, 2, 10, 10, 10, 10, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = 10 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be separated from key = -10, -3, 0, 2.
+ // - They should be in a single range bucket [10, 10], and range [3, 9] should
+ // be merged into the previous bucket.
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = -10, key upper = 0, keys = [-10, -3, 0].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(0));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0)));
+ // Bucket 1: key lower = 1, key upper = 9, keys = [2].
+ EXPECT_THAT(results[1].key_lower, Eq(1));
+ EXPECT_THAT(results[1].key_upper, Eq(9));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 2: key lower = 10, key upper = 10, keys = [10, 10, 10, 10, 10].
+ EXPECT_THAT(results[2].key_lower, Eq(10));
+ EXPECT_THAT(results[2].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_shouldNotMergeIntoPreviousBucket) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-10, -2, -2, -2, -2, -2, 5, 5, 5, 5, 5, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Data with key = -2 and 5 should be put into a single bucket respectively.
+ // - When dealing with key = 5, range [-1, 4] should not be merged into the
+ // previous bucket [-2, -2] because [-2, -2] also contains single key data
+ // exceeding the threshold. Instead, we should create bucket [-1, 4] with
+ // empty data.
+ ASSERT_THAT(results, SizeIs(5));
+ // Bucket 0: key lower = -10, key upper = -3, keys = [-10].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-3));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId,
+ kDefaultDocumentId, -10)));
+ // Bucket 1: key lower = -2, key upper = -2, keys = [-2, -2, -2, -2, -2].
+ EXPECT_THAT(results[1].key_lower, Eq(-2));
+ EXPECT_THAT(results[1].key_upper, Eq(-2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)));
+ // Bucket 2: key lower = -1, key upper = 4, keys = [].
+ EXPECT_THAT(results[2].key_lower, Eq(-1));
+ EXPECT_THAT(results[2].key_upper, Eq(4));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ IsEmpty());
+ // Bucket 3: key lower = 5, key upper = 5, keys = [5, 5, 5, 5, 5].
+ EXPECT_THAT(results[3].key_lower, Eq(5));
+ EXPECT_THAT(results[3].key_upper, Eq(5));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[3].start, results[3].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5)));
+ // Bucket 4: key lower = 6, key upper = 10, keys = [10].
+ EXPECT_THAT(results[4].key_lower, Eq(6));
+ EXPECT_THAT(results[4].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[4].start, results[4].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_shouldMergeIntoPreviousBucket) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -8),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-10, -8, -3, -2, -2, -2, 5, 5, 5, 5, 5, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Data with key = 5 should be put into a single bucket.
+ // - When dealing with key = 5, range [-1, 4] should be merged into the
+ // previous bucket [-2, -2] because # of data in [-2, -2] doesn't exceed the
+ // threshold.
+ ASSERT_THAT(results, SizeIs(4));
+ // Bucket 0: key lower = -10, key upper = -3, keys = [-10, -8, -3].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-3));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -8),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3)));
+ // Bucket 1: key lower = -2, key upper = 4, keys = [-2, -2, -2].
+ EXPECT_THAT(results[1].key_lower, Eq(-2));
+ EXPECT_THAT(results[1].key_upper, Eq(4));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)));
+ // Bucket 2: key lower = 5, key upper = 5, keys = [5, 5, 5, 5, 5].
+ EXPECT_THAT(results[2].key_lower, Eq(5));
+ EXPECT_THAT(results[2].key_upper, Eq(5));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5)));
+ // Bucket 3: key lower = 6, key upper = 10, keys = [10].
+ EXPECT_THAT(results[3].key_lower, Eq(6));
+ EXPECT_THAT(results[3].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[3].start, results[3].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_singleBucket_keyEqualsKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10)};
+
+ // Keys = [-10, -10, -10, -10, -10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = -10 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be in a single range bucket [-10, -10], and another bucket
+ // [-9, 10] with empty data should be created after it.
+ ASSERT_THAT(results, SizeIs(2));
+ // Bucket 0: key lower = -10, key upper = -10, keys = [-10, -10, -10, -10,
+ // -10].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10)));
+ // Bucket 1: key lower = -9, key upper = 10, keys = [].
+ EXPECT_THAT(results[1].key_lower, Eq(-9));
+ EXPECT_THAT(results[1].key_upper, Eq(10));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ IsEmpty());
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_singleBucket_keyWithinKeyLowerAndUpper) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0)};
+
+ // Keys = [0, 0, 0, 0, 0].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = 0 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be in a single range bucket [0, 0]. Another bucket [-10, -1]
+ // with empty data should be created before it, and another bucket [1, 10]
+ // with empty data should be created after it.
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = -10, key upper = -1, keys = [].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-1));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ IsEmpty());
+ // Bucket 1: key lower = 0, key upper = 0, keys = [0, 0, 0, 0, 0].
+ EXPECT_THAT(results[1].key_lower, Eq(0));
+ EXPECT_THAT(results[1].key_upper, Eq(0));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0)));
+ // Bucket 2: key lower = 1, key upper = 10, keys = [].
+ EXPECT_THAT(results[2].key_lower, Eq(1));
+ EXPECT_THAT(results[2].key_upper, Eq(10));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ IsEmpty());
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_singleBucket_keyEqualsKeyUpper) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [10, 10, 10, 10, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = 10 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be in a single range bucket [10, 10], and another bucket
+ // [-10, 9] with empty data should be created before it.
+ ASSERT_THAT(results, SizeIs(2));
+ // Bucket 0: key lower = -10, key upper = 9, keys = [].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(9));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ IsEmpty());
+ // Bucket 1: key lower = -10, key upper = 10, keys = [10, 10, 10, 10, 10].
+ EXPECT_THAT(results[1].key_lower, Eq(10));
+ EXPECT_THAT(results[1].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_adjacentKeysTotalNumDataExceedThreshold) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-10, -10, -1, -1, 2, 2, 10, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // Even though # of data with the same key is within the threshold, since
+ // total # of data of adjacent keys exceed the threshold, they should be
+ // separated into different buckets.
+ ASSERT_THAT(results, SizeIs(4));
+ // Bucket 0: key lower = -10, key upper = -10, keys = [-10, -10].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10)));
+ // Bucket 1: key lower = -9, key upper = -1, keys = [-1, -1].
+ EXPECT_THAT(results[1].key_lower, Eq(-9));
+ EXPECT_THAT(results[1].key_upper, Eq(-1));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1)));
+ // Bucket 2: key lower = 0, key upper = 2, keys = [2, 2].
+ EXPECT_THAT(results[2].key_lower, Eq(0));
+ EXPECT_THAT(results[2].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 3: key lower = 3, key upper = 10, keys = [10, 10].
+ EXPECT_THAT(results[3].key_lower, Eq(3));
+ EXPECT_THAT(results[3].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[3].start, results[3].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_keyLowerEqualsIntMin_smallestKeyGreaterThanKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min() + 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [INT64_MIN + 1, -10, -1, 2, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*original_key_upper=*/std::numeric_limits<int64_t>::max(),
+ /*num_data_threshold=*/3);
+ ASSERT_THAT(results, SizeIs(2));
+ // Bucket 0: key lower = INT64_MIN, key upper = -1, keys = [INT64_MIN + 1,
+ // -10, -1].
+ EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(results[0].key_upper, Eq(-1));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min() + 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1)));
+ // Bucket 1: key lower = 0, key upper = INT64_MAX, keys = [2, 10].
+ EXPECT_THAT(results[1].key_lower, Eq(0));
+ EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_keyLowerEqualsIntMin_smallestKeyEqualsKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [INT64_MIN, -10, -1, 2, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*original_key_upper=*/std::numeric_limits<int64_t>::max(),
+ /*num_data_threshold=*/3);
+ ASSERT_THAT(results, SizeIs(2));
+ // Bucket 0: key lower = INT64_MIN, key upper = -1, keys = [INT64_MIN, -10,
+ // -1].
+ EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(results[0].key_upper, Eq(-1));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1)));
+ // Bucket 1: key lower = 0, key upper = INT64_MAX, keys = [2, 10].
+ EXPECT_THAT(results[1].key_lower, Eq(0));
+ EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_keyLowerEqualsIntMin_keyIntMinExceedingThreshold) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN, -10, -1, 2,
+ // 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*original_key_upper=*/std::numeric_limits<int64_t>::max(),
+ /*num_data_threshold=*/3);
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = INT64_MIN, key upper = INT64_MIN, keys = [INT64_MIN,
+ // INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN].
+ EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(results[0].key_upper, Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min())));
+ // Bucket 1: key lower = INT64_MIN + 1, key upper = 2, keys = [-10, -1, 2].
+ EXPECT_THAT(results[1].key_lower,
+ Eq(std::numeric_limits<int64_t>::min() + 1));
+ EXPECT_THAT(results[1].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 2: key lower = 3, key upper = INT64_MAX, keys = [10].
+ EXPECT_THAT(results[2].key_lower, Eq(3));
+ EXPECT_THAT(results[2].key_upper, Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_keyUpperEqualsIntMax_largestKeySmallerThanKeyUpper) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max() - 1),
+ };
+
+ // Keys = [-10, -1, 2, 10, INT64_MAX - 1].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*original_key_upper=*/std::numeric_limits<int64_t>::max(),
+ /*num_data_threshold=*/3);
+ ASSERT_THAT(results, SizeIs(2));
+ // Bucket 0: key lower = INT64_MIN, key upper = 2, keys = [-10, -1, 2].
+ EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(results[0].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 1: key lower = 3, key upper = INT64_MAX, keys = [10, INT64_MAX - 1].
+ EXPECT_THAT(results[1].key_lower, Eq(3));
+ EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max() - 1)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_keyUpperEqualsIntMax_largestKeyEqualsKeyUpper) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ };
+
+ // Keys = [-10, -1, 2, 10, INT64_MAX].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*original_key_upper=*/std::numeric_limits<int64_t>::max(),
+ /*num_data_threshold=*/3);
+ ASSERT_THAT(results, SizeIs(2));
+ // Bucket 0: key lower = INT64_MIN, key upper = 2, keys = [-10, -1, 2].
+ EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(results[0].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 1: key lower = 3, key upper = INT64_MAX, keys = [10, INT64_MAX].
+ EXPECT_THAT(results[1].key_lower, Eq(3));
+ EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max())));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_keyUpperEqualsIntMax_keyIntMaxExceedingThreshold) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max())};
+
+ // Keys = [-10, -1, 2, 10, INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX,
+ // INT64_MAX].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*original_key_upper=*/std::numeric_limits<int64_t>::max(),
+ /*num_data_threshold=*/3);
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = INT64_MIN, key upper = 2, keys = [-10, -1, 2].
+ EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(results[0].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 1: key lower = 3, key upper = INT_MAX - 1, keys = [10].
+ EXPECT_THAT(results[1].key_lower, Eq(3));
+ EXPECT_THAT(results[1].key_upper,
+ Eq(std::numeric_limits<int64_t>::max() - 1));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+ // Bucket 2: key lower = INT64_MAX, key upper = INT64_MAX, keys = [INT64_MAX,
+ // INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX].
+ EXPECT_THAT(results[2].key_lower, Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(results[2].key_upper, Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max())));
+}
+
+} // namespace
+
+} // namespace integer_index_bucket_util
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/numeric/integer-index-storage.cc b/icing/index/numeric/integer-index-storage.cc
index 22ef8bd..db1983c 100644
--- a/icing/index/numeric/integer-index-storage.cc
+++ b/icing/index/numeric/integer-index-storage.cc
@@ -17,6 +17,7 @@
#include <algorithm>
#include <cstdint>
#include <functional>
+#include <iterator>
#include <limits>
#include <memory>
#include <queue>
@@ -37,6 +38,7 @@
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/index/numeric/doc-hit-info-iterator-numeric.h"
+#include "icing/index/numeric/integer-index-bucket-util.h"
#include "icing/index/numeric/integer-index-data.h"
#include "icing/index/numeric/numeric-index.h"
#include "icing/index/numeric/posting-list-integer-index-accessor.h"
@@ -50,6 +52,41 @@ namespace lib {
namespace {
+// Helper function to flush data between [it_start, it_end) into posting list(s)
+// and return posting list id.
+// Note: it will sort data between [it_start, it_end) by basic hit value, so the
+// caller should be aware that the data order will be changed after calling this
+// function.
+libtextclassifier3::StatusOr<PostingListIdentifier> FlushDataIntoPostingLists(
+ FlashIndexStorage* flash_index_storage,
+ PostingListIntegerIndexSerializer* posting_list_serializer,
+ const std::vector<IntegerIndexData>::iterator& it_start,
+ const std::vector<IntegerIndexData>::iterator& it_end) {
+ if (it_start == it_end) {
+ return PostingListIdentifier::kInvalid;
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> new_pl_accessor,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage,
+ posting_list_serializer));
+
+ std::sort(it_start, it_end);
+ for (auto it = it_end - 1; it >= it_start; --it) {
+ ICING_RETURN_IF_ERROR(new_pl_accessor->PrependData(*it));
+ }
+
+ PostingListAccessor::FinalizeResult result =
+ std::move(*new_pl_accessor).Finalize();
+ if (!result.status.ok()) {
+ return result.status;
+ }
+ if (!result.id.is_valid()) {
+ return absl_ports::InternalError("Fail to flush data into posting list(s)");
+ }
+ return result.id;
+}
+
// The following 4 methods are helper functions to get the correct file path of
// metadata/sorted_buckets/unsorted_buckets/flash_index_storage, according to
// the given working directory.
@@ -510,9 +547,12 @@ libtextclassifier3::Status IntegerIndexStorage::AddKeys(
mutable_new_arr.SetArray(/*idx=*/0, new_buckets.data(), new_buckets.size());
}
- // Step 4: merge the unsorted bucket array into the sorted bucket array if the
- // length of the unsorted bucket array exceeds the threshold.
- // TODO(b/259743562): [Optimization 1] implement merge
+ // Step 4: sort and merge the unsorted bucket array into the sorted bucket
+ // array if the length of the unsorted bucket array exceeds the
+ // threshold.
+ if (unsorted_buckets_->num_elements() > kUnsortedBucketsLengthThreshold) {
+ ICING_RETURN_IF_ERROR(SortBuckets());
+ }
info().num_data += new_keys.size();
@@ -679,29 +719,23 @@ IntegerIndexStorage::InitializeNewFiles(
absl_ports::StrCat("Failed to create directory: ", working_path));
}
- // TODO(b/259743562): [Optimization 1] decide max # buckets, unsorted buckets
- // threshold
// Initialize sorted_buckets
int32_t pre_mapping_mmap_size = sizeof(Bucket) * (1 << 10);
- int32_t max_file_size =
- pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize;
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,
FileBackedVector<Bucket>::Create(
filesystem, GetSortedBucketsFilePath(working_path),
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
- pre_mapping_mmap_size));
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ FileBackedVector<Bucket>::kMaxFileSize, pre_mapping_mmap_size));
// Initialize unsorted_buckets
- pre_mapping_mmap_size = sizeof(Bucket) * 100;
- max_file_size =
- pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize;
+ pre_mapping_mmap_size = sizeof(Bucket) * kUnsortedBucketsLengthThreshold;
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets,
FileBackedVector<Bucket>::Create(
filesystem, GetUnsortedBucketsFilePath(working_path),
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
- pre_mapping_mmap_size));
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ FileBackedVector<Bucket>::kMaxFileSize, pre_mapping_mmap_size));
// Initialize flash_index_storage
ICING_ASSIGN_OR_RETURN(
@@ -785,29 +819,23 @@ IntegerIndexStorage::InitializeExistingFiles(
/*pre_mapping_file_offset=*/0,
/*pre_mapping_mmap_size=*/kMetadataFileSize));
- // TODO(b/259743562): [Optimization 1] decide max # buckets, unsorted buckets
- // threshold
// Initialize sorted_buckets
int32_t pre_mapping_mmap_size = sizeof(Bucket) * (1 << 10);
- int32_t max_file_size =
- pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize;
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,
FileBackedVector<Bucket>::Create(
filesystem, GetSortedBucketsFilePath(working_path),
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
- pre_mapping_mmap_size));
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ FileBackedVector<Bucket>::kMaxFileSize, pre_mapping_mmap_size));
// Initialize unsorted_buckets
- pre_mapping_mmap_size = sizeof(Bucket) * 100;
- max_file_size =
- pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize;
+ pre_mapping_mmap_size = sizeof(Bucket) * kUnsortedBucketsLengthThreshold;
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets,
FileBackedVector<Bucket>::Create(
filesystem, GetUnsortedBucketsFilePath(working_path),
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
- pre_mapping_mmap_size));
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ FileBackedVector<Bucket>::kMaxFileSize, pre_mapping_mmap_size));
// Initialize flash_index_storage
ICING_ASSIGN_OR_RETURN(
@@ -845,28 +873,13 @@ IntegerIndexStorage::FlushDataIntoNewSortedBucket(
}
ICING_ASSIGN_OR_RETURN(
- std::unique_ptr<PostingListIntegerIndexAccessor> new_pl_accessor,
- PostingListIntegerIndexAccessor::Create(
- storage->flash_index_storage_.get(),
- storage->posting_list_serializer_));
-
- std::sort(data.begin(), data.end());
- for (auto itr = data.rbegin(); itr != data.rend(); ++itr) {
- ICING_RETURN_IF_ERROR(new_pl_accessor->PrependData(*itr));
- }
-
- PostingListAccessor::FinalizeResult result =
- std::move(*new_pl_accessor).Finalize();
- if (!result.status.ok()) {
- return result.status;
- }
- if (!result.id.is_valid()) {
- return absl_ports::InternalError("Fail to flush data into posting list");
- }
+ PostingListIdentifier pl_id,
+ FlushDataIntoPostingLists(storage->flash_index_storage_.get(),
+ storage->posting_list_serializer_, data.begin(),
+ data.end()));
storage->info().num_data += data.size();
- return storage->sorted_buckets_->Append(
- Bucket(key_lower, key_upper, result.id));
+ return storage->sorted_buckets_->Append(Bucket(key_lower, key_upper, pl_id));
}
libtextclassifier3::Status IntegerIndexStorage::PersistStoragesToDisk() {
@@ -921,21 +934,80 @@ IntegerIndexStorage::AddKeysIntoBucketAndSplitIfNecessary(
}
for (auto it = it_start; it != it_end; ++it) {
- // TODO(b/259743562): [Optimization 1] implement split bucket if pl is full
- // and the bucket is splittable
+ if (mutable_bucket.Get().key_lower() < mutable_bucket.Get().key_upper() &&
+ pl_accessor->WantsSplit()) {
+ // If the bucket needs split (max size and full) and is splittable, then
+ // we perform bucket splitting.
+
+ // 1. Finalize the current posting list accessor.
+ PostingListAccessor::FinalizeResult result =
+ std::move(*pl_accessor).Finalize();
+ if (!result.status.ok()) {
+ return result.status;
+ }
+
+ // 2. Create another posting list accessor instance. Read all data and
+ // free all posting lists.
+ ICING_ASSIGN_OR_RETURN(
+ pl_accessor,
+ PostingListIntegerIndexAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_serializer_, result.id));
+ ICING_ASSIGN_OR_RETURN(std::vector<IntegerIndexData> all_data,
+ pl_accessor->GetAllDataAndFree());
+
+ // 3. Append all remaining new data.
+ all_data.reserve(all_data.size() + std::distance(it, it_end));
+ for (; it != it_end; ++it) {
+ all_data.push_back(IntegerIndexData(section_id, document_id, *it));
+ }
+
+ // 4. Run bucket splitting algorithm to decide new buckets and dispatch
+ // data.
+ std::vector<integer_index_bucket_util::DataRangeAndBucketInfo>
+ new_bucket_infos = integer_index_bucket_util::Split(
+ all_data, mutable_bucket.Get().key_lower(),
+ mutable_bucket.Get().key_upper(),
+ kNumDataThresholdForBucketSplit);
+ if (new_bucket_infos.empty()) {
+ ICING_LOG(WARNING)
+ << "No buckets after splitting. This should not happen.";
+ return absl_ports::InternalError("Split error");
+ }
+
+ // 5. Flush data.
+ std::vector<Bucket> new_buckets;
+ for (int i = 0; i < new_bucket_infos.size(); ++i) {
+ ICING_ASSIGN_OR_RETURN(
+ PostingListIdentifier pl_id,
+ FlushDataIntoPostingLists(
+ flash_index_storage_.get(), posting_list_serializer_,
+ new_bucket_infos[i].start, new_bucket_infos[i].end));
+ if (i == 0) {
+ // Reuse mutable_bucket
+ mutable_bucket.Get().set_key_lower(new_bucket_infos[i].key_lower);
+ mutable_bucket.Get().set_key_upper(new_bucket_infos[i].key_upper);
+ mutable_bucket.Get().set_posting_list_identifier(pl_id);
+ } else {
+ new_buckets.push_back(Bucket(new_bucket_infos[i].key_lower,
+ new_bucket_infos[i].key_upper, pl_id));
+ }
+ }
+
+ return new_buckets;
+ }
+
ICING_RETURN_IF_ERROR(pl_accessor->PrependData(
IntegerIndexData(section_id, document_id, *it)));
}
- // TODO(b/259743562): [Optimization 1] implement split and return new buckets.
- // We will change the original bucket (mutable_bucket)
- // in-place to one of the new buckets, and the rest will
- // be returned and added into unsorted buckets in AddKeys.
PostingListAccessor::FinalizeResult result =
std::move(*pl_accessor).Finalize();
if (!result.status.ok()) {
return result.status;
}
+ if (!result.id.is_valid()) {
+ return absl_ports::InternalError("Fail to flush data into posting list(s)");
+ }
mutable_bucket.Get().set_posting_list_identifier(result.id);
diff --git a/icing/index/numeric/integer-index-storage.h b/icing/index/numeric/integer-index-storage.h
index be0add9..ddd9231 100644
--- a/icing/index/numeric/integer-index-storage.h
+++ b/icing/index/numeric/integer-index-storage.h
@@ -30,6 +30,7 @@
#include "icing/file/posting_list/flash-index-storage.h"
#include "icing/file/posting_list/posting-list-identifier.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/integer-index-data.h"
#include "icing/index/numeric/posting-list-integer-index-serializer.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
@@ -117,6 +118,10 @@ class IntegerIndexStorage : public PersistentStorage {
int64_t key_upper() const { return key_upper_; }
+ void set_key_lower(int64_t key_lower) { key_lower_ = key_lower; }
+
+ void set_key_upper(int64_t key_upper) { key_upper_ = key_upper; }
+
PostingListIdentifier posting_list_identifier() const {
return posting_list_identifier_;
}
@@ -176,14 +181,29 @@ class IntegerIndexStorage : public PersistentStorage {
WorkingPathType::kDirectory;
static constexpr std::string_view kFilePrefix = "integer_index_storage";
- // # of data threshold for bucket merging. If total # data of adjacent buckets
- // exceed this value, then flush the accumulated data. Otherwise merge
- // buckets and their data.
+ // # of data threshold for bucket merging during optimization (TransferIndex).
+ // If total # data of adjacent buckets exceed this value, then flush the
+ // accumulated data. Otherwise merge buckets and their data.
//
// Calculated by: 0.7 * (kMaxPostingListSize / sizeof(IntegerIndexData)),
// where kMaxPostingListSize = (kPageSize - sizeof(IndexBlock::BlockHeader)).
static constexpr int32_t kNumDataThresholdForBucketMerge = 240;
+ // # of data threshold for bucket splitting during indexing (AddKeys).
+ // When the posting list of a bucket is full, we will try to split data into
+ // multiple buckets according to their keys. In order to achieve good
+ // (amortized) time complexity, we want # of data in new buckets to be at most
+ // half # of elements in a full posting list.
+ //
+ // Calculated by: 0.5 * (kMaxPostingListSize / sizeof(IntegerIndexData)),
+ // where kMaxPostingListSize = (kPageSize - sizeof(IndexBlock::BlockHeader)).
+ static constexpr int32_t kNumDataThresholdForBucketSplit = 170;
+
+ // Length threshold to sort and merge unsorted buckets into sorted buckets. If
+ // the length of unsorted_buckets exceed the threshold, then call
+ // SortBuckets().
+ static constexpr int32_t kUnsortedBucketsLengthThreshold = 50;
+
// Creates a new IntegerIndexStorage instance to index integers (for a single
// property). If any of the underlying file is missing, then delete the whole
// working_path and (re)initialize with new ones. Otherwise initialize and
@@ -370,7 +390,6 @@ class IntegerIndexStorage : public PersistentStorage {
// into several new buckets with new ranges, and split the data (according
// to their keys and the range of new buckets) of the original posting
// list into several new posting lists.
- // TODO(b/259743562): [Optimization 1] implement split
// - Otherwise, just simply add a new key into it, and PostingListAccessor
// mechanism will automatically create a new max size posting list and
// chain them.
diff --git a/icing/index/numeric/integer-index-storage_benchmark.cc b/icing/index/numeric/integer-index-storage_benchmark.cc
index d150f2d..54b19c3 100644
--- a/icing/index/numeric/integer-index-storage_benchmark.cc
+++ b/icing/index/numeric/integer-index-storage_benchmark.cc
@@ -57,6 +57,7 @@ namespace lib {
namespace {
using ::testing::Eq;
+using ::testing::IsEmpty;
using ::testing::SizeIs;
static constexpr SectionId kDefaultSectionId = 12;
@@ -237,18 +238,24 @@ void BM_ExactQuery(benchmark::State& state) {
std::unique_ptr<DocHitInfoIterator> iterator,
storage->GetIterator(/*query_key_lower=*/exact_query_key,
/*query_key_upper=*/exact_query_key));
- int cnt = 0;
+ std::vector<DocHitInfo> data;
while (iterator->Advance().ok()) {
- benchmark::DoNotOptimize(iterator->doc_hit_info());
- ++cnt;
+ data.push_back(iterator->doc_hit_info());
}
+ state.PauseTiming();
const auto it = keys.find(exact_query_key);
if (it == keys.end()) {
- ASSERT_THAT(cnt, Eq(0));
+ ASSERT_THAT(data, IsEmpty());
} else {
- ASSERT_THAT(it->second, SizeIs(cnt));
+ ASSERT_THAT(data, SizeIs(it->second.size()));
+ std::reverse(data.begin(), data.end());
+ for (int i = 0; i < data.size(); ++i) {
+ ASSERT_THAT(data[i].document_id(), Eq(it->second[i]));
+ ASSERT_THAT(data[i].hit_section_ids_mask(), Eq(1 << kDefaultSectionId));
+ }
}
+ state.ResumeTiming();
}
}
BENCHMARK(BM_ExactQuery)
diff --git a/icing/index/numeric/integer-index-storage_test.cc b/icing/index/numeric/integer-index-storage_test.cc
index 9d6864c..ed7d5db 100644
--- a/icing/index/numeric/integer-index-storage_test.cc
+++ b/icing/index/numeric/integer-index-storage_test.cc
@@ -14,6 +14,8 @@
#include "icing/index/numeric/integer-index-storage.h"
+#include <unistd.h>
+
#include <cstdint>
#include <limits>
#include <memory>
@@ -26,7 +28,10 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
#include "icing/file/persistent-storage.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/index-block.h"
#include "icing/file/posting_list/posting-list-identifier.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
@@ -42,14 +47,17 @@ namespace lib {
namespace {
+using ::testing::Contains;
using ::testing::ElementsAre;
using ::testing::ElementsAreArray;
using ::testing::Eq;
+using ::testing::Ge;
using ::testing::Gt;
using ::testing::HasSubstr;
using ::testing::IsEmpty;
using ::testing::IsFalse;
using ::testing::IsTrue;
+using ::testing::Key;
using ::testing::Le;
using ::testing::Ne;
using ::testing::Not;
@@ -1186,6 +1194,150 @@ TEST_F(IntegerIndexStorageTest,
EqualsDocHitInfo(kDefaultDocumentId, expected_sections))));
}
+TEST_F(IntegerIndexStorageTest, SplitBuckets) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(filesystem_, working_path_, Options(),
+ serializer_.get()));
+
+ uint32_t block_size = FlashIndexStorage::SelectBlockSize();
+ uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
+ block_size, serializer_->GetDataTypeBytes());
+ uint32_t max_num_data_before_split =
+ max_posting_list_bytes / serializer_->GetDataTypeBytes();
+
+ // Add max_num_data_before_split + 1 keys to invoke bucket splitting.
+ // Keys: max_num_data_before_split to 0
+ // Document ids: 0 to max_num_data_before_split
+ std::unordered_map<int64_t, DocumentId> data;
+ int64_t key = max_num_data_before_split;
+ DocumentId document_id = 0;
+ for (int i = 0; i < max_num_data_before_split + 1; ++i) {
+ data[key] = document_id;
+ ICING_ASSERT_OK(
+ storage->AddKeys(document_id, kDefaultSectionId, /*new_keys=*/{key}));
+ ++document_id;
+ --key;
+ }
+ ICING_ASSERT_OK(storage->PersistToDisk());
+
+ // Manually check sorted and unsorted buckets.
+ {
+ // Check sorted buckets.
+ const std::string sorted_buckets_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".s");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, sorted_buckets_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ EXPECT_THAT(sorted_buckets->num_elements(), Eq(1));
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bucket1,
+ sorted_buckets->Get(/*idx=*/0));
+ EXPECT_THAT(bucket1->key_lower(), Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(bucket1->key_upper(), Ne(std::numeric_limits<int64_t>::max()));
+
+ int64_t sorted_bucket_key_upper = bucket1->key_upper();
+
+ // Check unsorted buckets.
+ const std::string unsorted_buckets_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".u");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, unsorted_buckets_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ EXPECT_THAT(unsorted_buckets->num_elements(), Ge(1));
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bucket2,
+ unsorted_buckets->Get(/*idx=*/0));
+ EXPECT_THAT(bucket2->key_lower(), Eq(sorted_bucket_key_upper + 1));
+ }
+
+ // Ensure that search works normally.
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ for (int64_t key = max_num_data_before_split; key >= 0; key--) {
+ ASSERT_THAT(data, Contains(Key(key)));
+ DocumentId expected_document_id = data[key];
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/key, /*key_upper=*/key),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(expected_document_id,
+ expected_sections))));
+ }
+}
+
+TEST_F(IntegerIndexStorageTest, SplitBucketsTriggerSortBuckets) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(filesystem_, working_path_, Options(),
+ serializer_.get()));
+
+ uint32_t block_size = FlashIndexStorage::SelectBlockSize();
+ uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
+ block_size, serializer_->GetDataTypeBytes());
+ uint32_t max_num_data_before_split =
+ max_posting_list_bytes / serializer_->GetDataTypeBytes();
+
+ // Add IntegerIndexStorage::kUnsortedBucketsLengthThreshold keys. For each
+ // key, add max_num_data_before_split + 1 data. Then we will get:
+ // - Bucket splitting will create kUnsortedBucketsLengthThreshold + 1 unsorted
+ // buckets [[50, 50], [49, 49], ..., [1, 1], [51, INT64_MAX]].
+ // - Since there are kUnsortedBucketsLengthThreshold + 1 unsorted buckets, we
+ // should sort and merge buckets.
+ std::unordered_map<int64_t, std::vector<DocumentId>> data;
+ int64_t key = IntegerIndexStorage::kUnsortedBucketsLengthThreshold;
+ DocumentId document_id = 0;
+ for (int i = 0; i < IntegerIndexStorage::kUnsortedBucketsLengthThreshold;
+ ++i) {
+ for (int j = 0; j < max_num_data_before_split + 1; ++j) {
+ data[key].push_back(document_id);
+ ICING_ASSERT_OK(
+ storage->AddKeys(document_id, kDefaultSectionId, /*new_keys=*/{key}));
+ ++document_id;
+ }
+ --key;
+ }
+ ICING_ASSERT_OK(storage->PersistToDisk());
+
+ // Manually check sorted and unsorted buckets.
+ {
+ // Check unsorted buckets.
+ const std::string unsorted_buckets_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".u");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, unsorted_buckets_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ EXPECT_THAT(unsorted_buckets->num_elements(), Eq(0));
+
+ // Check sorted buckets.
+ const std::string sorted_buckets_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".s");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, sorted_buckets_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ EXPECT_THAT(sorted_buckets->num_elements(), Gt(1));
+ }
+
+ // Ensure that search works normally.
+ for (key = 1; key <= IntegerIndexStorage::kUnsortedBucketsLengthThreshold;
+ ++key) {
+ ASSERT_THAT(data, Contains(Key(key)));
+
+ std::vector<DocHitInfo> expected_doc_hit_infos;
+ for (DocumentId doc_id : data[key]) {
+ expected_doc_hit_infos.push_back(DocHitInfo(
+ doc_id, /*hit_section_ids_mask=*/UINT64_C(1) << kDefaultSectionId));
+ }
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/key, /*key_upper=*/key),
+ IsOkAndHolds(ElementsAreArray(expected_doc_hit_infos.rbegin(),
+ expected_doc_hit_infos.rend())));
+ }
+}
+
TEST_F(IntegerIndexStorageTest, TransferIndex) {
// We use predefined custom buckets to initialize new integer index storage
// and create some test keys accordingly.
diff --git a/icing/index/numeric/integer-index.cc b/icing/index/numeric/integer-index.cc
index a2d40f1..2f876e4 100644
--- a/icing/index/numeric/integer-index.cc
+++ b/icing/index/numeric/integer-index.cc
@@ -14,10 +14,12 @@
#include "icing/index/numeric/integer-index.h"
+#include <algorithm>
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
+#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
@@ -27,6 +29,7 @@
#include "icing/file/destructible-directory.h"
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
+#include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h"
#include "icing/index/numeric/doc-hit-info-iterator-numeric.h"
#include "icing/index/numeric/integer-index-storage.h"
#include "icing/index/numeric/posting-list-integer-index-serializer.h"
@@ -50,6 +53,17 @@ std::string GetMetadataFilePath(std::string_view working_path) {
return absl_ports::StrCat(working_path, "/", GetMetadataFileName());
}
+constexpr std::string_view kWildcardPropertyIndexFileName =
+ "wildcard_property_index";
+
+constexpr std::string_view kWildcardPropertyStorageFileName =
+ "wildcard_property_storage";
+
+std::string GetWildcardPropertyStorageFilePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/",
+ kWildcardPropertyStorageFileName);
+}
+
// Helper function to get the sub working (directory) path of
// IntegerIndexStorage according to the given working directory and property
// path.
@@ -64,8 +78,9 @@ libtextclassifier3::StatusOr<std::vector<std::string>>
GetAllExistingPropertyPaths(const Filesystem& filesystem,
const std::string& working_path) {
std::vector<std::string> property_paths;
- if (!filesystem.ListDirectory(working_path.c_str(),
- /*exclude=*/{GetMetadataFileName()},
+ std::unordered_set<std::string> excludes = {
+ GetMetadataFileName(), std::string(kWildcardPropertyStorageFileName)};
+ if (!filesystem.ListDirectory(working_path.c_str(), excludes,
/*recursive=*/false, &property_paths)) {
return absl_ports::InternalError("Failed to list directory");
}
@@ -81,6 +96,9 @@ GetPropertyIntegerIndexStorageMap(
IntegerIndex::PropertyToStorageMapType property_to_storage_map;
for (const std::string& property_path : property_paths) {
+ if (property_path == kWildcardPropertyIndexFileName) {
+ continue;
+ }
std::string storage_working_path =
GetPropertyIndexStoragePath(working_path, property_path);
ICING_ASSIGN_OR_RETURN(
@@ -95,16 +113,61 @@ GetPropertyIntegerIndexStorageMap(
return property_to_storage_map;
}
+// RETURNS:
+// - On success, an unordered_set representing the list of property paths
+// stored in the WildcardPropertyStorage managed by property_storage
+// - INTERNAL_ERROR on any failure to successfully read the underlying proto.
+libtextclassifier3::StatusOr<std::unordered_set<std::string>> CreatePropertySet(
+ const FileBackedProto<WildcardPropertyStorage>& property_storage) {
+ std::unordered_set<std::string> wildcard_properties_set;
+ auto wildcard_properties_or = property_storage.Read();
+ if (!wildcard_properties_or.ok()) {
+ if (absl_ports::IsNotFound(wildcard_properties_or.status())) {
+ return wildcard_properties_set;
+ }
+ return wildcard_properties_or.status();
+ }
+
+ const WildcardPropertyStorage* wildcard_properties =
+ wildcard_properties_or.ValueOrDie();
+ wildcard_properties_set.reserve(wildcard_properties->property_entries_size());
+ for (const std::string& property : wildcard_properties->property_entries()) {
+ wildcard_properties_set.insert(property);
+ }
+ return wildcard_properties_set;
+}
+
} // namespace
libtextclassifier3::Status IntegerIndex::Editor::IndexAllBufferedKeys() && {
auto iter = integer_index_.property_to_storage_map_.find(property_path_);
IntegerIndexStorage* target_storage = nullptr;
+ // 1. Check if this property already has its own individual index.
if (iter != integer_index_.property_to_storage_map_.end()) {
target_storage = iter->second.get();
+ // 2. Check if this property was added to wildcard storage.
+ } else if (integer_index_.wildcard_properties_set_.find(property_path_) !=
+ integer_index_.wildcard_properties_set_.end()) {
+ target_storage = integer_index_.wildcard_index_storage_.get();
+ // 3. Check if we've reach the limit of individual property storages.
+ } else if (integer_index_.property_to_storage_map_.size() >=
+ kMaxPropertyStorages) {
+ // 3a. Create the wildcard storage if it doesn't exist.
+ if (integer_index_.wildcard_index_storage_ == nullptr) {
+ ICING_ASSIGN_OR_RETURN(
+ integer_index_.wildcard_index_storage_,
+ IntegerIndexStorage::Create(
+ integer_index_.filesystem_,
+ GetPropertyIndexStoragePath(integer_index_.working_path_,
+ kWildcardPropertyIndexFileName),
+ IntegerIndexStorage::Options(),
+ integer_index_.posting_list_serializer_.get()));
+ }
+ ICING_RETURN_IF_ERROR(
+ integer_index_.AddPropertyToWildcardStorage(property_path_));
+ target_storage = integer_index_.wildcard_index_storage_.get();
+ // 4. Create a new individual storage for this new property.
} else {
- // A new property path. Create a new storage instance and insert into the
- // map.
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<IntegerIndexStorage> new_storage,
IntegerIndexStorage::Create(
@@ -144,15 +207,45 @@ IntegerIndex::~IntegerIndex() {
libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
IntegerIndex::GetIterator(std::string_view property_path, int64_t key_lower,
- int64_t key_upper) const {
- auto iter = property_to_storage_map_.find(std::string(property_path));
- if (iter == property_to_storage_map_.end()) {
- // Return an empty iterator.
- return std::make_unique<DocHitInfoIteratorNumeric<int64_t>>(
- /*numeric_index_iter=*/nullptr);
+ int64_t key_upper,
+ const DocumentStore& document_store,
+ const SchemaStore& schema_store) const {
+ std::string property_path_str(property_path);
+ auto iter = property_to_storage_map_.find(property_path_str);
+ if (iter != property_to_storage_map_.end()) {
+ return iter->second->GetIterator(key_lower, key_upper);
+ }
+
+ if (wildcard_properties_set_.find(property_path_str) !=
+ wildcard_properties_set_.end()) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<DocHitInfoIterator> delegate,
+ wildcard_index_storage_->GetIterator(key_lower, key_upper));
+ std::set<std::string> property_paths = {std::move(property_path_str)};
+ return std::make_unique<DocHitInfoIteratorSectionRestrict>(
+ std::move(delegate), &document_store, &schema_store,
+ std::move(property_paths));
+ }
+
+ // Return an empty iterator.
+ return std::make_unique<DocHitInfoIteratorNumeric<int64_t>>(
+ /*numeric_index_iter=*/nullptr);
+}
+
+libtextclassifier3::Status IntegerIndex::AddPropertyToWildcardStorage(
+ const std::string& property_path) {
+ WildcardPropertyStorage wildcard_properties;
+ wildcard_properties.mutable_property_entries()->Reserve(
+ wildcard_properties_set_.size());
+ for (const std::string& property_path : wildcard_properties_set_) {
+ wildcard_properties.add_property_entries(property_path);
}
+ ICING_RETURN_IF_ERROR(wildcard_property_storage_->Write(
+ std::make_unique<WildcardPropertyStorage>(
+ std::move(wildcard_properties))));
- return iter->second->GetIterator(key_lower, key_upper);
+ wildcard_properties_set_.insert(property_path);
+ return libtextclassifier3::Status::OK;
}
libtextclassifier3::Status IntegerIndex::Optimize(
@@ -183,6 +276,8 @@ libtextclassifier3::Status IntegerIndex::Optimize(
// Destruct current storage instances to safely swap directories.
metadata_mmapped_file_.reset();
property_to_storage_map_.clear();
+ wildcard_index_storage_.reset();
+ wildcard_property_storage_.reset();
if (!filesystem_.SwapFiles(temp_working_path_ddir.dir().c_str(),
working_path_.c_str())) {
return absl_ports::InternalError(
@@ -190,9 +285,10 @@ libtextclassifier3::Status IntegerIndex::Optimize(
}
// Reinitialize the integer index.
+ std::string metadata_file_path = GetMetadataFilePath(working_path_);
ICING_ASSIGN_OR_RETURN(
MemoryMappedFile metadata_mmapped_file,
- MemoryMappedFile::Create(filesystem_, GetMetadataFilePath(working_path_),
+ MemoryMappedFile::Create(filesystem_, metadata_file_path,
MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
/*max_file_size=*/kMetadataFileSize,
/*pre_mapping_file_offset=*/0,
@@ -200,6 +296,25 @@ libtextclassifier3::Status IntegerIndex::Optimize(
metadata_mmapped_file_ =
std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file));
+ // Recreate all of the data structures tracking the wildcard storage.
+ std::string wildcard_property_path =
+ GetWildcardPropertyStorageFilePath(working_path_);
+ wildcard_property_storage_ =
+ std::make_unique<FileBackedProto<WildcardPropertyStorage>>(
+ filesystem_, wildcard_property_path);
+
+ ICING_ASSIGN_OR_RETURN(wildcard_properties_set_,
+ CreatePropertySet(*wildcard_property_storage_));
+ if (!wildcard_properties_set_.empty()) {
+ ICING_ASSIGN_OR_RETURN(
+ wildcard_index_storage_,
+ IntegerIndexStorage::Create(
+ filesystem_,
+ GetPropertyIndexStoragePath(working_path_,
+ kWildcardPropertyIndexFileName),
+ IntegerIndexStorage::Options(), posting_list_serializer_.get()));
+ }
+
// Initialize all existing integer index storages.
ICING_ASSIGN_OR_RETURN(
property_to_storage_map_,
@@ -212,6 +327,7 @@ libtextclassifier3::Status IntegerIndex::Optimize(
libtextclassifier3::Status IntegerIndex::Clear() {
// Step 1: clear property_to_storage_map_.
property_to_storage_map_.clear();
+ wildcard_index_storage_.reset();
// Step 2: delete all IntegerIndexStorages. It is safe because there is no
// active IntegerIndexStorage after clearing the map.
@@ -224,6 +340,15 @@ libtextclassifier3::Status IntegerIndex::Clear() {
GetPropertyIndexStoragePath(working_path_, property_path)));
}
+ // Step 3: Delete the wildcard property storage
+ std::string wildcard_property_path =
+ GetWildcardPropertyStorageFilePath(working_path_);
+ if (filesystem_.FileExists(wildcard_property_path.c_str()) ||
+ !filesystem_.DeleteFile(wildcard_property_path.c_str())) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Unable to delete file at path ", wildcard_property_path));
+ }
+
info().last_added_document_id = kInvalidDocumentId;
return libtextclassifier3::Status::OK;
}
@@ -249,12 +374,20 @@ IntegerIndex::InitializeNewFiles(const Filesystem& filesystem,
ICING_RETURN_IF_ERROR(metadata_mmapped_file.GrowAndRemapIfNecessary(
/*file_offset=*/0, /*mmap_size=*/kMetadataFileSize));
+ std::string wildcard_property_path =
+ GetWildcardPropertyStorageFilePath(working_path);
+ auto wildcard_property_storage =
+ std::make_unique<FileBackedProto<WildcardPropertyStorage>>(
+ filesystem, wildcard_property_path);
+
// Create instance.
auto new_integer_index = std::unique_ptr<IntegerIndex>(new IntegerIndex(
filesystem, std::move(working_path),
std::make_unique<PostingListIntegerIndexSerializer>(),
std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)),
- /*property_to_storage_map=*/{}));
+ /*property_to_storage_map=*/{}, std::move(wildcard_property_storage),
+ /*wildcard_properties_set=*/{}, /*wildcard_index_storage=*/nullptr));
+
// Initialize info content by writing mapped memory directly.
Info& info_ref = new_integer_index->info();
info_ref.magic = Info::kMagic;
@@ -287,11 +420,33 @@ IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem,
GetPropertyIntegerIndexStorageMap(filesystem, working_path,
posting_list_serializer.get()));
+ std::string wildcard_property_path =
+ GetWildcardPropertyStorageFilePath(working_path);
+ auto wildcard_property_storage =
+ std::make_unique<FileBackedProto<WildcardPropertyStorage>>(
+ filesystem, wildcard_property_path);
+
+ ICING_ASSIGN_OR_RETURN(
+ std::unordered_set<std::string> wildcard_properties_set,
+ CreatePropertySet(*wildcard_property_storage));
+
+ std::unique_ptr<IntegerIndexStorage> wildcard_index_storage;
+ if (!wildcard_properties_set.empty()) {
+ ICING_ASSIGN_OR_RETURN(
+ wildcard_index_storage,
+ IntegerIndexStorage::Create(
+ filesystem,
+ GetPropertyIndexStoragePath(working_path,
+ kWildcardPropertyIndexFileName),
+ IntegerIndexStorage::Options(), posting_list_serializer.get()));
+ }
+
// Create instance.
auto integer_index = std::unique_ptr<IntegerIndex>(new IntegerIndex(
filesystem, std::move(working_path), std::move(posting_list_serializer),
std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)),
- std::move(property_to_storage_map)));
+ std::move(property_to_storage_map), std::move(wildcard_property_storage),
+ std::move(wildcard_properties_set), std::move(wildcard_index_storage)));
// Initialize existing PersistentStorage. Checksums will be validated.
ICING_RETURN_IF_ERROR(integer_index->InitializeExistingStorage());
@@ -303,31 +458,78 @@ IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem,
return integer_index;
}
-libtextclassifier3::Status IntegerIndex::TransferIndex(
+libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
+IntegerIndex::TransferIntegerIndexStorage(
const std::vector<DocumentId>& document_id_old_to_new,
+ const IntegerIndexStorage* old_storage, const std::string& property_path,
IntegerIndex* new_integer_index) const {
- for (const auto& [property_path, old_storage] : property_to_storage_map_) {
- std::string new_storage_working_path = GetPropertyIndexStoragePath(
- new_integer_index->working_path_, property_path);
- ICING_ASSIGN_OR_RETURN(
- std::unique_ptr<IntegerIndexStorage> new_storage,
- IntegerIndexStorage::Create(
- new_integer_index->filesystem_, new_storage_working_path,
- IntegerIndexStorage::Options(),
- new_integer_index->posting_list_serializer_.get()));
+ std::string new_storage_working_path = GetPropertyIndexStoragePath(
+ new_integer_index->working_path_, property_path);
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<IntegerIndexStorage> new_storage,
+ IntegerIndexStorage::Create(
+ new_integer_index->filesystem_, new_storage_working_path,
+ IntegerIndexStorage::Options(),
+ new_integer_index->posting_list_serializer_.get()));
+
+ ICING_RETURN_IF_ERROR(
+ old_storage->TransferIndex(document_id_old_to_new, new_storage.get()));
+ if (new_storage->num_data() == 0) {
+ new_storage.reset();
ICING_RETURN_IF_ERROR(
- old_storage->TransferIndex(document_id_old_to_new, new_storage.get()));
+ IntegerIndexStorage::Discard(filesystem_, new_storage_working_path));
+ }
+ return new_storage;
+}
+
+libtextclassifier3::Status IntegerIndex::TransferWildcardStorage(
+ IntegerIndex* new_integer_index) const {
+ auto property_storage = std::make_unique<WildcardPropertyStorage>();
+ property_storage->mutable_property_entries()->Reserve(
+ wildcard_properties_set_.size());
+ for (const std::string& property : wildcard_properties_set_) {
+ property_storage->add_property_entries(property);
+ }
+
+ ICING_RETURN_IF_ERROR(new_integer_index->wildcard_property_storage_->Write(
+ std::move(property_storage)));
+ new_integer_index->wildcard_properties_set_ = wildcard_properties_set_;
+ return libtextclassifier3::Status::OK;
+}
- if (new_storage->num_data() == 0) {
- new_storage.reset();
- ICING_RETURN_IF_ERROR(
- IntegerIndexStorage::Discard(filesystem_, new_storage_working_path));
- } else {
+libtextclassifier3::Status IntegerIndex::TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ IntegerIndex* new_integer_index) const {
+ // Transfer over the integer index storages
+ std::unique_ptr<IntegerIndexStorage> new_storage;
+ for (const auto& [property_path, old_storage] : property_to_storage_map_) {
+ ICING_ASSIGN_OR_RETURN(
+ new_storage,
+ TransferIntegerIndexStorage(document_id_old_to_new, old_storage.get(),
+ property_path, new_integer_index));
+ if (new_storage != nullptr) {
new_integer_index->property_to_storage_map_.insert(
- std::make_pair(property_path, std::move(new_storage)));
+ {property_path, std::move(new_storage)});
}
}
+ if (wildcard_index_storage_ != nullptr) {
+ ICING_ASSIGN_OR_RETURN(
+ new_storage,
+ TransferIntegerIndexStorage(
+ document_id_old_to_new, wildcard_index_storage_.get(),
+ std::string(kWildcardPropertyIndexFileName), new_integer_index));
+ if (new_storage != nullptr) {
+ new_integer_index->wildcard_index_storage_ = std::move(new_storage);
+
+ // The only time we need to copy over the list of properties using
+ // wildcard storage is if wildcard_index_storage and new_storage are both
+ // non-null. Otherwise, the new wildcard index storage won't have any
+ // data.
+ ICING_RETURN_IF_ERROR(TransferWildcardStorage(new_integer_index));
+ }
+ }
+
return libtextclassifier3::Status::OK;
}
@@ -335,6 +537,11 @@ libtextclassifier3::Status IntegerIndex::PersistStoragesToDisk() {
for (auto& [_, storage] : property_to_storage_map_) {
ICING_RETURN_IF_ERROR(storage->PersistToDisk());
}
+ // No need to persist wildcard_properties_storage_. All calls to
+ // FileBackedProto::Write are fully written through at the time of the call.
+ if (wildcard_index_storage_) {
+ ICING_RETURN_IF_ERROR(wildcard_index_storage_->PersistToDisk());
+ }
return libtextclassifier3::Status::OK;
}
@@ -350,8 +557,8 @@ libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeInfoChecksum() {
}
libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeStoragesChecksum() {
- // XOR all crcs of all storages. Since XOR is commutative and associative, the
- // order doesn't matter.
+ // XOR all crcs of all storages. Since XOR is commutative and associative,
+ // the order doesn't matter.
uint32_t storages_checksum = 0;
for (auto& [property_path, storage] : property_to_storage_map_) {
ICING_ASSIGN_OR_RETURN(Crc32 storage_crc, storage->UpdateChecksums());
@@ -359,6 +566,17 @@ libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeStoragesChecksum() {
storages_checksum ^= storage_crc.Get();
}
+
+ if (wildcard_index_storage_ != nullptr) {
+ ICING_ASSIGN_OR_RETURN(Crc32 storage_crc,
+ wildcard_index_storage_->UpdateChecksums());
+ storages_checksum ^= storage_crc.Get();
+ }
+
+ ICING_ASSIGN_OR_RETURN(Crc32 wildcard_properties_crc,
+ wildcard_property_storage_->ComputeChecksum());
+ storages_checksum ^= wildcard_properties_crc.Get();
+
return Crc32(storages_checksum);
}
diff --git a/icing/index/numeric/integer-index.h b/icing/index/numeric/integer-index.h
index 050a143..303bb41 100644
--- a/icing/index/numeric/integer-index.h
+++ b/icing/index/numeric/integer-index.h
@@ -23,12 +23,16 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/file-backed-proto.h"
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
#include "icing/index/numeric/integer-index-storage.h"
#include "icing/index/numeric/numeric-index.h"
#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+#include "icing/index/numeric/wildcard-property-storage.pb.h"
+#include "icing/schema/schema-store.h"
#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
#include "icing/util/crc32.h"
namespace icing {
@@ -46,6 +50,11 @@ class IntegerIndex : public NumericIndex<int64_t> {
using PropertyToStorageMapType =
std::unordered_map<std::string, std::unique_ptr<IntegerIndexStorage>>;
+ // Maximum number of individual property storages that this index will allow
+ // before falling back to placing hits for any new properties into the
+ // 'wildcard' storage.
+ static constexpr int kMaxPropertyStorages = 32;
+
struct Info {
static constexpr int32_t kMagic = 0x238a3dcb;
@@ -125,8 +134,9 @@ class IntegerIndex : public NumericIndex<int64_t> {
// - NOT_FOUND_ERROR if the given property_path doesn't exist
// - Any IntegerIndexStorage errors
libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
- std::string_view property_path, int64_t key_lower,
- int64_t key_upper) const override;
+ std::string_view property_path, int64_t key_lower, int64_t key_upper,
+ const DocumentStore& document_store,
+ const SchemaStore& schema_store) const override;
// Reduces internal file sizes by reclaiming space and ids of deleted
// documents. Integer index will convert all data (hits) to the new document
@@ -165,6 +175,11 @@ class IntegerIndex : public NumericIndex<int64_t> {
}
}
+ int num_property_indices() const override {
+ return property_to_storage_map_.size() +
+ ((wildcard_index_storage_ == nullptr) ? 0 : 1);
+ }
+
private:
class Editor : public NumericIndex<int64_t>::Editor {
public:
@@ -191,17 +206,24 @@ class IntegerIndex : public NumericIndex<int64_t> {
IntegerIndex& integer_index_; // Does not own.
};
- explicit IntegerIndex(const Filesystem& filesystem,
- std::string&& working_path,
- std::unique_ptr<PostingListIntegerIndexSerializer>
- posting_list_serializer,
- std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,
- PropertyToStorageMapType&& property_to_storage_map)
+ explicit IntegerIndex(
+ const Filesystem& filesystem, std::string&& working_path,
+ std::unique_ptr<PostingListIntegerIndexSerializer>
+ posting_list_serializer,
+ std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,
+ PropertyToStorageMapType&& property_to_storage_map,
+ std::unique_ptr<FileBackedProto<WildcardPropertyStorage>>
+ wildcard_property_storage,
+ std::unordered_set<std::string> wildcard_properties_set,
+ std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage)
: NumericIndex<int64_t>(filesystem, std::move(working_path),
kWorkingPathType),
posting_list_serializer_(std::move(posting_list_serializer)),
metadata_mmapped_file_(std::move(metadata_mmapped_file)),
- property_to_storage_map_(std::move(property_to_storage_map)) {}
+ property_to_storage_map_(std::move(property_to_storage_map)),
+ wildcard_property_storage_(std::move(wildcard_property_storage)),
+ wildcard_properties_set_(std::move(wildcard_properties_set)),
+ wildcard_index_storage_(std::move(wildcard_index_storage)) {}
static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>>
InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path);
@@ -210,6 +232,17 @@ class IntegerIndex : public NumericIndex<int64_t> {
InitializeExistingFiles(const Filesystem& filesystem,
std::string&& working_path);
+ // Adds the property path to the list of properties using wildcard storage.
+ // This will both update the in-memory list (wildcard_properties_set_) and
+ // the persistent list (wilcard_property_storage_).
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR if unable to successfully persist updated properties
+ // list in wildcard_property_storage_.
+ libtextclassifier3::Status AddPropertyToWildcardStorage(
+ const std::string& property_path);
+
// Transfers integer index data from the current integer index to
// new_integer_index.
//
@@ -222,6 +255,29 @@ class IntegerIndex : public NumericIndex<int64_t> {
const std::vector<DocumentId>& document_id_old_to_new,
IntegerIndex* new_integer_index) const;
+ // Transfers integer index data from old_storage to new_integer_index.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error. This could potentially leave the storages
+ // in an invalid state and the caller should handle it properly (e.g.
+ // discard and rebuild)
+ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
+ TransferIntegerIndexStorage(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ const IntegerIndexStorage* old_storage, const std::string& property_path,
+ IntegerIndex* new_integer_index) const;
+
+ // Transfers the persistent and in-memory list of properties using the
+ // wildcard storage from old_storage to new_integer_index.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR if unable to successfully persist updated properties
+ // list in new_integer_index.
+ libtextclassifier3::Status TransferWildcardStorage(
+ IntegerIndex* new_integer_index) const;
+
// Flushes contents of all storages to underlying files.
//
// Returns:
@@ -277,6 +333,19 @@ class IntegerIndex : public NumericIndex<int64_t> {
// Property path to integer index storage map.
PropertyToStorageMapType property_to_storage_map_;
+
+ // Persistent list of properties that have added content to
+ // wildcard_index_storage_.
+ std::unique_ptr<FileBackedProto<WildcardPropertyStorage>>
+ wildcard_property_storage_;
+
+ // In-memory list of properties that have added content to
+ // wildcard_index_storage_.
+ std::unordered_set<std::string> wildcard_properties_set_;
+
+ // The index storage that is used once we have already created
+ // kMaxPropertyStorages in property_to_storage_map.
+ std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage_;
};
} // namespace lib
diff --git a/icing/index/numeric/integer-index_test.cc b/icing/index/numeric/integer-index_test.cc
index c6cf855..c4dacb8 100644
--- a/icing/index/numeric/integer-index_test.cc
+++ b/icing/index/numeric/integer-index_test.cc
@@ -25,6 +25,7 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
+#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
@@ -32,6 +33,9 @@
#include "icing/index/numeric/integer-index-storage.h"
#include "icing/index/numeric/numeric-index.h"
#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
@@ -68,9 +72,25 @@ class NumericIndexIntegerTest : public ::testing::Test {
IsTrue());
working_path_ = base_dir_ + "/numeric_index_integer_test";
+ std::string schema_dir = base_dir_ + "/schema_test";
+
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(schema_dir.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_, SchemaStore::Create(&filesystem_, schema_dir, &clock_));
+
+ std::string document_store_dir = base_dir_ + "/doc_store_test";
+ ASSERT_TRUE(
+ filesystem_.CreateDirectoryRecursively(document_store_dir.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult doc_store_create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir, &clock_,
+ schema_store_.get()));
+ doc_store_ = std::move(doc_store_create_result.document_store);
}
void TearDown() override {
+ doc_store_.reset();
+ schema_store_.reset();
filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
}
@@ -92,9 +112,67 @@ class NumericIndexIntegerTest : public ::testing::Test {
return IntegerIndex::Create(filesystem_, working_path_);
}
+ template <typename NotIntegerIndexType>
+ bool is_integer_index() const {
+ return false;
+ }
+
+ template <>
+ bool is_integer_index<IntegerIndex>() const {
+ return true;
+ }
+
+ libtextclassifier3::StatusOr<std::vector<DocumentId>> CompactDocStore() {
+ std::string document_store_dir = base_dir_ + "/doc_store_test";
+ std::string document_store_compact_dir =
+ base_dir_ + "/doc_store_compact_test";
+ if (!filesystem_.CreateDirectoryRecursively(
+ document_store_compact_dir.c_str())) {
+ return absl_ports::InternalError("Unable to create compact directory");
+ }
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<DocumentId> docid_map,
+ doc_store_->OptimizeInto(document_store_compact_dir, nullptr));
+
+ doc_store_.reset();
+ if (!filesystem_.SwapFiles(document_store_dir.c_str(),
+ document_store_compact_dir.c_str())) {
+ return absl_ports::InternalError("Unable to swap directories.");
+ }
+ if (!filesystem_.DeleteDirectoryRecursively(
+ document_store_compact_dir.c_str())) {
+ return absl_ports::InternalError("Unable to delete compact directory");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ DocumentStore::CreateResult doc_store_create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir, &clock_,
+ schema_store_.get()));
+ doc_store_ = std::move(doc_store_create_result.document_store);
+ return docid_map;
+ }
+
+ libtextclassifier3::StatusOr<std::vector<DocHitInfo>> Query(
+ const NumericIndex<int64_t>* integer_index,
+ std::string_view property_path, int64_t key_lower, int64_t key_upper) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<DocHitInfoIterator> iter,
+ integer_index->GetIterator(property_path, key_lower, key_upper,
+ *doc_store_, *schema_store_));
+
+ std::vector<DocHitInfo> result;
+ while (iter->Advance().ok()) {
+ result.push_back(iter->doc_hit_info());
+ }
+ return result;
+ }
+
Filesystem filesystem_;
std::string base_dir_;
std::string working_path_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> doc_store_;
+ Clock clock_;
};
void Index(NumericIndex<int64_t>* integer_index, std::string_view property_path,
@@ -109,20 +187,6 @@ void Index(NumericIndex<int64_t>* integer_index, std::string_view property_path,
ICING_EXPECT_OK(std::move(*editor).IndexAllBufferedKeys());
}
-libtextclassifier3::StatusOr<std::vector<DocHitInfo>> Query(
- const NumericIndex<int64_t>* integer_index, std::string_view property_path,
- int64_t key_lower, int64_t key_upper) {
- ICING_ASSIGN_OR_RETURN(
- std::unique_ptr<DocHitInfoIterator> iter,
- integer_index->GetIterator(property_path, key_lower, key_upper));
-
- std::vector<DocHitInfo> result;
- while (iter->Advance().ok()) {
- result.push_back(iter->doc_hit_info());
- }
- return result;
-}
-
using TestTypes = ::testing::Types<DummyNumericIndex<int64_t>, IntegerIndex>;
TYPED_TEST_SUITE(NumericIndexIntegerTest, TestTypes);
@@ -180,8 +244,8 @@ TYPED_TEST(NumericIndexIntegerTest, SingleKeyExactQuery) {
int64_t query_key = 2;
std::vector<SectionId> expected_sections = {kDefaultSectionId};
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/query_key, /*key_upper=*/query_key),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/query_key, /*key_upper=*/query_key),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/5, expected_sections),
EqualsDocHitInfo(/*document_id=*/2, expected_sections))));
@@ -206,8 +270,8 @@ TYPED_TEST(NumericIndexIntegerTest, SingleKeyRangeQuery) {
kDefaultSectionId, /*keys=*/{2});
std::vector<SectionId> expected_sections = {kDefaultSectionId};
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/1, /*key_upper=*/3),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/1, /*key_upper=*/3),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/5, expected_sections),
EqualsDocHitInfo(/*document_id=*/2, expected_sections),
@@ -215,6 +279,258 @@ TYPED_TEST(NumericIndexIntegerTest, SingleKeyRangeQuery) {
EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
}
+TYPED_TEST(NumericIndexIntegerTest, WildcardStorageQuery) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ // This test sets its schema assuming that max property storages == 32.
+ ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32));
+
+ PropertyConfigProto int_property_config =
+ PropertyConfigBuilder()
+ .SetName("otherProperty1")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .Build();
+ // Create a schema with two types:
+ // - TypeA has 34 properties:
+ // 'desiredProperty', 'otherProperty'*, 'undesiredProperty'
+ // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty'
+ // 1. The 32 'otherProperty's will consume all of the individual storages
+ // 2. TypeA.desiredProperty and TypeB.anotherProperty will both be assigned
+ // SectionId = 0 for their respective types.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(int_property_config)
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty2"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty3"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty4"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty5"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty6"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty7"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty8"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty9"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty10"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty11"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty12"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty13"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty14"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty15"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty16"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty17"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty18"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty19"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty20"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty21"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty22"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty23"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty24"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty25"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty26"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty27"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty28"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty29"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty30"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty31"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty32"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("desiredProperty"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("undesiredProperty")))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeB")
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("anotherProperty"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("desiredProperty")))
+ .Build();
+ ICING_ASSERT_OK(this->schema_store_->SetSchema(schema));
+
+ // Put 11 docs of "TypeA" into the document store.
+ DocumentProto doc =
+ DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build()));
+
+ // Put 5 docs of "TypeB" into the document store.
+ doc = DocumentBuilder(doc).SetUri("uri11").SetSchema("TypeB").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri14").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri15").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri16").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri17").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri18").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri19").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri20").Build()));
+
+ // Ids are assigned alphabetically, so the property ids are:
+ // TypeA.desiredProperty = 0
+ // TypeA.otherPropertyN = N
+ // TypeA.undesiredProperty = 33
+ // TypeB.anotherProperty = 0
+ // TypeB.desiredProperty = 1
+ SectionId typea_desired_prop_id = 0;
+ SectionId typea_undesired_prop_id = 33;
+ SectionId typeb_another_prop_id = 0;
+ SectionId typeb_desired_prop_id = 1;
+
+ // Index numeric content for other properties to force our property into the
+ // wildcard storage.
+ std::string other_property_path = "otherProperty";
+ for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) {
+ Index(integer_index.get(),
+ absl_ports::StrCat(other_property_path, std::to_string(i)),
+ /*document_id=*/0, /*section_id=*/i, /*keys=*/{i});
+ }
+
+ // Index numeric content for TypeA.desiredProperty
+ std::string desired_property = "desiredProperty";
+ Index(integer_index.get(), desired_property, /*document_id=*/0,
+ typea_desired_prop_id, /*keys=*/{1});
+ Index(integer_index.get(), desired_property, /*document_id=*/1,
+ typea_desired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), desired_property, /*document_id=*/2,
+ typea_desired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), desired_property, /*document_id=*/3,
+ typea_desired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), desired_property, /*document_id=*/4,
+ typea_desired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), desired_property, /*document_id=*/5,
+ typea_desired_prop_id, /*keys=*/{2});
+
+ // Index the same numeric content for TypeA.undesiredProperty
+ std::string undesired_property = "undesiredProperty";
+ Index(integer_index.get(), undesired_property, /*document_id=*/6,
+ typea_undesired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), undesired_property, /*document_id=*/7,
+ typea_undesired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), undesired_property, /*document_id=*/8,
+ typea_undesired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), undesired_property, /*document_id=*/9,
+ typea_undesired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), undesired_property, /*document_id=*/10,
+ typea_undesired_prop_id, /*keys=*/{2});
+
+ // Index the same numeric content for TypeB.anotherProperty
+ std::string another_property = "anotherProperty";
+ Index(integer_index.get(), another_property, /*document_id=*/11,
+ typeb_another_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), another_property, /*document_id=*/12,
+ typeb_another_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), another_property, /*document_id=*/13,
+ typeb_another_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), another_property, /*document_id=*/14,
+ typeb_another_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), another_property, /*document_id=*/15,
+ typeb_another_prop_id, /*keys=*/{2});
+
+ // Finally, index the same numeric content for TypeB.desiredProperty
+ Index(integer_index.get(), desired_property, /*document_id=*/16,
+ typeb_desired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), desired_property, /*document_id=*/17,
+ typeb_desired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), desired_property, /*document_id=*/18,
+ typeb_desired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), desired_property, /*document_id=*/19,
+ typeb_desired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), desired_property, /*document_id=*/20,
+ typeb_desired_prop_id, /*keys=*/{2});
+
+ if (this->template is_integer_index<TypeParam>()) {
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(33));
+ } else {
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(35));
+ }
+
+ // Only the hits for 'desired_prop_id' should be returned.
+ std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id};
+ std::vector<SectionId> expected_sections_typeb = {typeb_desired_prop_id};
+ EXPECT_THAT(
+ this->Query(integer_index.get(), desired_property,
+ /*key_lower=*/2, /*key_upper=*/2),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea))));
+
+ EXPECT_THAT(
+ this->Query(integer_index.get(), desired_property,
+ /*key_lower=*/1, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/16, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea))));
+}
+
TYPED_TEST(NumericIndexIntegerTest, EmptyResult) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<NumericIndex<int64_t>> integer_index,
@@ -233,11 +549,11 @@ TYPED_TEST(NumericIndexIntegerTest, EmptyResult) {
Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5,
kDefaultSectionId, /*keys=*/{2});
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/10, /*key_upper=*/10),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/10, /*key_upper=*/10),
IsOkAndHolds(IsEmpty()));
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/100, /*key_upper=*/200),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/100, /*key_upper=*/200),
IsOkAndHolds(IsEmpty()));
}
@@ -252,8 +568,8 @@ TYPED_TEST(NumericIndexIntegerTest,
Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
kDefaultSectionId, /*keys=*/{1});
- EXPECT_THAT(Query(integer_index.get(), kAnotherPropertyPath,
- /*key_lower=*/100, /*key_upper=*/200),
+ EXPECT_THAT(this->Query(integer_index.get(), kAnotherPropertyPath,
+ /*key_lower=*/100, /*key_upper=*/200),
IsOkAndHolds(IsEmpty()));
}
@@ -286,8 +602,8 @@ TYPED_TEST(NumericIndexIntegerTest,
kDefaultSectionId, /*keys=*/{4, -1000});
std::vector<SectionId> expected_sections = {kDefaultSectionId};
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/1, /*key_upper=*/3),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/1, /*key_upper=*/3),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/6, expected_sections),
EqualsDocHitInfo(/*document_id=*/5, expected_sections),
@@ -326,39 +642,39 @@ TYPED_TEST(NumericIndexIntegerTest, EdgeNumericValues) {
std::vector<SectionId> expected_sections = {kDefaultSectionId};
// Negative key
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/-100, /*key_upper=*/-70),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/-100, /*key_upper=*/-70),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/2, expected_sections),
EqualsDocHitInfo(/*document_id=*/1, expected_sections))));
// INT64_MAX key
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/std::numeric_limits<int64_t>::max(),
- /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::max(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/7, expected_sections),
EqualsDocHitInfo(/*document_id=*/3, expected_sections))));
// INT64_MIN key
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/std::numeric_limits<int64_t>::min(),
- /*key_upper=*/std::numeric_limits<int64_t>::min()),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::min()),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/9, expected_sections),
EqualsDocHitInfo(/*document_id=*/4, expected_sections))));
// Key = 0
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/0, /*key_upper=*/0),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/0, /*key_upper=*/0),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/8, expected_sections),
EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
// All keys from INT64_MIN to INT64_MAX
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/std::numeric_limits<int64_t>::min(),
- /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/9, expected_sections),
EqualsDocHitInfo(/*document_id=*/8, expected_sections),
@@ -404,8 +720,9 @@ TYPED_TEST(NumericIndexIntegerTest,
/*section_id=*/3, /*keys=*/{5});
EXPECT_THAT(
- Query(integer_index.get(), kDefaultTestPropertyPath, /*key_lower=*/1,
- /*key_upper=*/3),
+ this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/1,
+ /*key_upper=*/3),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/2, std::vector<SectionId>{4, 5}),
EqualsDocHitInfo(/*document_id=*/1, std::vector<SectionId>{1, 2}),
@@ -433,8 +750,8 @@ TYPED_TEST(NumericIndexIntegerTest, NonRelevantPropertyShouldNotBeIncluded) {
kDefaultSectionId, /*keys=*/{2});
std::vector<SectionId> expected_sections = {kDefaultSectionId};
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/1, /*key_upper=*/3),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/1, /*key_upper=*/3),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/5, expected_sections),
EqualsDocHitInfo(/*document_id=*/1, expected_sections),
@@ -460,8 +777,8 @@ TYPED_TEST(NumericIndexIntegerTest,
Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5,
kDefaultSectionId, /*keys=*/{2});
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/3, /*key_upper=*/1),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/3, /*key_upper=*/1),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
@@ -499,30 +816,30 @@ TYPED_TEST(NumericIndexIntegerTest, Optimize) {
// Verify index and query API still work normally after Optimize().
std::vector<SectionId> expected_sections = {kDefaultSectionId};
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/1, /*key_upper=*/1),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/1, /*key_upper=*/1),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/3, /*key_upper=*/3),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/3, /*key_upper=*/3),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/1, expected_sections))));
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/0, /*key_upper=*/0),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/0, /*key_upper=*/0),
IsOkAndHolds(IsEmpty()));
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/4, /*key_upper=*/4),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/4, /*key_upper=*/4),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/2, expected_sections))));
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/2, /*key_upper=*/2),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/2, /*key_upper=*/2),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/3, expected_sections))));
Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5,
kDefaultSectionId, /*keys=*/{123});
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/123, /*key_upper=*/123),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/123, /*key_upper=*/123),
IsOkAndHolds(ElementsAre(
EqualsDocHitInfo(/*document_id=*/5, expected_sections))));
}
@@ -581,40 +898,40 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeMultiplePropertyPaths) {
// Verify index and query API still work normally after Optimize().
// Key = 1
- EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/1,
- /*key_upper=*/1),
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/1,
+ /*key_upper=*/1),
IsOkAndHolds(IsEmpty()));
- EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/1,
- /*key_upper=*/1),
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/1,
+ /*key_upper=*/1),
IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
/*document_id=*/0, std::vector<SectionId>{kSectionId2}))));
// key = 2
- EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/2,
- /*key_upper=*/2),
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/2,
+ /*key_upper=*/2),
IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
/*document_id=*/0, std::vector<SectionId>{kSectionId1}))));
- EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/2,
- /*key_upper=*/2),
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/2,
+ /*key_upper=*/2),
IsOkAndHolds(IsEmpty()));
// key = 3
- EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/3,
- /*key_upper=*/3),
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/3,
+ /*key_upper=*/3),
IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
/*document_id=*/1, std::vector<SectionId>{kSectionId1}))));
- EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/3,
- /*key_upper=*/3),
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/3,
+ /*key_upper=*/3),
IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
/*document_id=*/2, std::vector<SectionId>{kSectionId2}))));
// key = 4
- EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/4,
- /*key_upper=*/4),
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/4,
+ /*key_upper=*/4),
IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
/*document_id=*/3, std::vector<SectionId>{kSectionId1}))));
- EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/4,
- /*key_upper=*/4),
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/4,
+ /*key_upper=*/4),
IsOkAndHolds(IsEmpty()));
}
@@ -655,9 +972,9 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeShouldDiscardEmptyPropertyStorage) {
// All data in "prop2" as well as the underlying storage should be deleted, so
// when querying "prop2", we should get empty result.
- EXPECT_THAT(Query(integer_index.get(), kPropertyPath2,
- /*key_lower=*/std::numeric_limits<int64_t>::min(),
- /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
IsOkAndHolds(IsEmpty()));
if (std::is_same_v<IntegerIndex, TypeParam>) {
std::string prop2_storage_working_path =
@@ -670,8 +987,8 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeShouldDiscardEmptyPropertyStorage) {
// Verify we can still index and query for "prop2".
Index(integer_index.get(), kPropertyPath2, /*document_id=*/100, kSectionId2,
/*keys=*/{123});
- EXPECT_THAT(Query(integer_index.get(), kPropertyPath2,
- /*key_lower=*/123, /*key_upper=*/123),
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2,
+ /*key_lower=*/123, /*key_upper=*/123),
IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
/*document_id=*/100, std::vector<SectionId>{kSectionId2}))));
}
@@ -697,9 +1014,9 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeOutOfRangeDocumentId) {
EXPECT_THAT(integer_index->last_added_document_id(), Eq(kInvalidDocumentId));
// Verify all data are discarded after Optimize().
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/std::numeric_limits<int64_t>::min(),
- /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
IsOkAndHolds(IsEmpty()));
}
@@ -731,9 +1048,9 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeDeleteAll) {
EXPECT_THAT(integer_index->last_added_document_id(), Eq(kInvalidDocumentId));
// Verify all data are discarded after Optimize().
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
- /*key_lower=*/std::numeric_limits<int64_t>::min(),
- /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
IsOkAndHolds(IsEmpty()));
}
@@ -750,13 +1067,13 @@ TYPED_TEST(NumericIndexIntegerTest, Clear) {
ASSERT_THAT(integer_index->last_added_document_id(), Eq(1));
ASSERT_THAT(
- Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/1,
- /*key_upper=*/1),
+ this->Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/1,
+ /*key_upper=*/1),
IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
/*document_id=*/0, std::vector<SectionId>{kDefaultSectionId}))));
ASSERT_THAT(
- Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/3,
- /*key_upper=*/3),
+ this->Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/3,
+ /*key_upper=*/3),
IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
/*document_id=*/1, std::vector<SectionId>{kDefaultSectionId}))));
@@ -764,12 +1081,14 @@ TYPED_TEST(NumericIndexIntegerTest, Clear) {
// kInvalidDocumentId, and the previous added keys should be deleted.
ICING_ASSERT_OK(integer_index->Clear());
EXPECT_THAT(integer_index->last_added_document_id(), Eq(kInvalidDocumentId));
- EXPECT_THAT(Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/1,
- /*key_upper=*/1),
- IsOkAndHolds(IsEmpty()));
- EXPECT_THAT(Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/3,
- /*key_upper=*/3),
- IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(
+ this->Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/1,
+ /*key_upper=*/1),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(
+ this->Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/3,
+ /*key_upper=*/3),
+ IsOkAndHolds(IsEmpty()));
// Integer index should be able to work normally after Clear().
Index(integer_index.get(), /*property_path=*/"A", /*document_id=*/3,
@@ -780,13 +1099,13 @@ TYPED_TEST(NumericIndexIntegerTest, Clear) {
EXPECT_THAT(integer_index->last_added_document_id(), Eq(4));
EXPECT_THAT(
- Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/123,
- /*key_upper=*/123),
+ this->Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/123,
+ /*key_upper=*/123),
IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
/*document_id=*/3, std::vector<SectionId>{kDefaultSectionId}))));
EXPECT_THAT(
- Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/456,
- /*key_upper=*/456),
+ this->Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/456,
+ /*key_upper=*/456),
IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
/*document_id=*/4, std::vector<SectionId>{kDefaultSectionId}))));
}
@@ -1066,6 +1385,260 @@ TEST_F(IntegerIndexTest,
HasSubstr("Invalid storages crc"));
}
}
+
+TEST_F(IntegerIndexTest, WildcardStoragePersistenceQuery) {
+ // This test sets its schema assuming that max property storages == 32.
+ ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32));
+
+ PropertyConfigProto int_property_config =
+ PropertyConfigBuilder()
+ .SetName("otherProperty1")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .Build();
+ // Create a schema with two types:
+ // - TypeA has 34 properties:
+ // 'desiredProperty', 'otherProperty'*, 'undesiredProperty'
+ // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty'
+ // 1. The 32 'otherProperty's will consume all of the individual storages
+ // 2. TypeA.desiredProperty and TypeB.anotherProperty will both be assigned
+ // SectionId = 0 for their respective types.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(int_property_config)
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty2"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty3"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty4"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty5"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty6"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty7"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty8"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty9"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty10"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty11"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty12"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty13"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty14"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty15"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty16"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty17"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty18"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty19"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty20"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty21"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty22"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty23"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty24"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty25"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty26"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty27"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty28"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty29"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty30"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty31"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty32"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("desiredProperty"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("undesiredProperty")))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeB")
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("anotherProperty"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("desiredProperty")))
+ .Build();
+ ICING_ASSERT_OK(this->schema_store_->SetSchema(schema));
+
+ // Ids are assigned alphabetically, so the property ids are:
+ // TypeA.desiredProperty = 0
+ // TypeA.otherPropertyN = N
+ // TypeA.undesiredProperty = 33
+ // TypeB.anotherProperty = 0
+ // TypeB.desiredProperty = 1
+ SectionId typea_desired_prop_id = 0;
+ SectionId typea_undesired_prop_id = 33;
+ SectionId typeb_another_prop_id = 0;
+ SectionId typeb_desired_prop_id = 1;
+ std::string desired_property = "desiredProperty";
+ std::string undesired_property = "undesiredProperty";
+ std::string another_property = "anotherProperty";
+
+ // Put 11 docs of "TypeA" into the document store.
+ DocumentProto doc =
+ DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build()));
+
+ // Put 10 docs of "TypeB" into the document store.
+ doc = DocumentBuilder(doc).SetUri("uri11").SetSchema("TypeB").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri14").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri15").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri16").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri17").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri18").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri19").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri20").Build()));
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_));
+
+ // Index numeric content for other properties to force our property into the
+ // wildcard storage.
+ std::string other_property_path = "otherProperty";
+ for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) {
+ Index(integer_index.get(),
+ absl_ports::StrCat(other_property_path, std::to_string(i)),
+ /*document_id=*/0, /*section_id=*/i, /*keys=*/{i});
+ }
+
+ // Index numeric content for TypeA.desiredProperty
+ Index(integer_index.get(), desired_property, /*document_id=*/0,
+ typea_desired_prop_id, /*keys=*/{1});
+ Index(integer_index.get(), desired_property, /*document_id=*/1,
+ typea_desired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), desired_property, /*document_id=*/2,
+ typea_desired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), desired_property, /*document_id=*/3,
+ typea_desired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), desired_property, /*document_id=*/4,
+ typea_desired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), desired_property, /*document_id=*/5,
+ typea_desired_prop_id, /*keys=*/{2});
+
+ // Index the same numeric content for TypeA.undesiredProperty
+ Index(integer_index.get(), undesired_property, /*document_id=*/6,
+ typea_undesired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), undesired_property, /*document_id=*/7,
+ typea_undesired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), undesired_property, /*document_id=*/8,
+ typea_undesired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), undesired_property, /*document_id=*/9,
+ typea_undesired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), undesired_property, /*document_id=*/10,
+ typea_undesired_prop_id, /*keys=*/{2});
+
+ // Index the same numeric content for TypeB.undesiredProperty
+ Index(integer_index.get(), another_property, /*document_id=*/11,
+ typeb_another_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), another_property, /*document_id=*/12,
+ typeb_another_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), another_property, /*document_id=*/13,
+ typeb_another_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), another_property, /*document_id=*/14,
+ typeb_another_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), another_property, /*document_id=*/15,
+ typeb_another_prop_id, /*keys=*/{2});
+
+ // Finally, index the same numeric content for TypeB.desiredProperty
+ Index(integer_index.get(), desired_property, /*document_id=*/16,
+ typeb_desired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), desired_property, /*document_id=*/17,
+ typeb_desired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), desired_property, /*document_id=*/18,
+ typeb_desired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), desired_property, /*document_id=*/19,
+ typeb_desired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), desired_property, /*document_id=*/20,
+ typeb_desired_prop_id, /*keys=*/{2});
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_));
+
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(33));
+
+ // Only the hits for 'desired_prop_id' should be returned.
+ std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id};
+ std::vector<SectionId> expected_sections_typeb = {typeb_desired_prop_id};
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/2, /*key_upper=*/2),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea))));
+
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/1, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/16, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea))));
+}
+
TEST_F(IntegerIndexTest,
IntegerIndexShouldWorkAfterOptimizeAndReinitialization) {
constexpr std::string_view kPropertyPath1 = "prop1";
@@ -1183,6 +1756,550 @@ TEST_F(IntegerIndexTest,
}
}
+TEST_F(IntegerIndexTest, WildcardStorageWorksAfterOptimize) {
+ // This test sets its schema assuming that max property storages == 32.
+ ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32));
+
+ PropertyConfigProto int_property_config =
+ PropertyConfigBuilder()
+ .SetName("otherProperty1")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .Build();
+ // Create a schema with two types:
+ // - TypeA has 34 properties:
+ // 'desiredProperty', 'otherProperty'*, 'undesiredProperty'
+ // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty'
+ // 1. The 32 'otherProperty's will consume all of the individual storages
+ // 2. TypeA.desiredProperty and TypeB.anotherProperty will both be assigned
+ // SectionId = 0 for their respective types.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(int_property_config)
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty2"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty3"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty4"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty5"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty6"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty7"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty8"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty9"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty10"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty11"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty12"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty13"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty14"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty15"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty16"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty17"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty18"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty19"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty20"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty21"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty22"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty23"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty24"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty25"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty26"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty27"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty28"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty29"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty30"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty31"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty32"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("desiredProperty"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("undesiredProperty")))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeB")
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("anotherProperty"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("desiredProperty")))
+ .Build();
+ ICING_ASSERT_OK(this->schema_store_->SetSchema(schema));
+
+ // Ids are assigned alphabetically, so the property ids are:
+ // TypeA.desiredProperty = 0
+ // TypeA.otherPropertyN = N
+ // TypeA.undesiredProperty = 33
+ // TypeB.anotherProperty = 0
+ // TypeB.desiredProperty = 1
+ SectionId typea_desired_prop_id = 0;
+ SectionId typea_undesired_prop_id = 33;
+ SectionId typeb_another_prop_id = 0;
+ SectionId typeb_desired_prop_id = 1;
+ std::string desired_property = "desiredProperty";
+ std::string undesired_property = "undesiredProperty";
+ std::string another_property = "anotherProperty";
+
+ // Only the hits for 'desired_prop_id' should be returned.
+ std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id};
+ std::vector<SectionId> expected_sections_typeb = {typeb_desired_prop_id};
+
+ // Put 11 docs of "TypeA" into the document store.
+ DocumentProto doc =
+ DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build()));
+
+ // Put 10 docs of "TypeB" into the document store.
+ doc = DocumentBuilder(doc).SetUri("uri11").SetSchema("TypeB").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri14").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri15").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri16").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri17").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri18").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri19").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri20").Build()));
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_));
+
+ // Index numeric content for other properties to force our property into the
+ // wildcard storage.
+ std::string other_property_path = "otherProperty";
+ for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) {
+ Index(integer_index.get(),
+ absl_ports::StrCat(other_property_path, std::to_string(i)),
+ /*document_id=*/0, /*section_id=*/i, /*keys=*/{i});
+ }
+
+ // Index numeric content for TypeA.desiredProperty
+ Index(integer_index.get(), desired_property, /*document_id=*/0,
+ typea_desired_prop_id, /*keys=*/{1});
+ Index(integer_index.get(), desired_property, /*document_id=*/1,
+ typea_desired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), desired_property, /*document_id=*/2,
+ typea_desired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), desired_property, /*document_id=*/3,
+ typea_desired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), desired_property, /*document_id=*/4,
+ typea_desired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), desired_property, /*document_id=*/5,
+ typea_desired_prop_id, /*keys=*/{2});
+
+ // Index the same numeric content for TypeA.undesiredProperty
+ Index(integer_index.get(), undesired_property, /*document_id=*/6,
+ typea_undesired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), undesired_property, /*document_id=*/7,
+ typea_undesired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), undesired_property, /*document_id=*/8,
+ typea_undesired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), undesired_property, /*document_id=*/9,
+ typea_undesired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), undesired_property, /*document_id=*/10,
+ typea_undesired_prop_id, /*keys=*/{2});
+
+ // Index the same numeric content for TypeB.undesiredProperty
+ Index(integer_index.get(), another_property, /*document_id=*/11,
+ typeb_another_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), another_property, /*document_id=*/12,
+ typeb_another_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), another_property, /*document_id=*/13,
+ typeb_another_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), another_property, /*document_id=*/14,
+ typeb_another_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), another_property, /*document_id=*/15,
+ typeb_another_prop_id, /*keys=*/{2});
+
+ // Finally, index the same numeric content for TypeB.desiredProperty
+ Index(integer_index.get(), desired_property, /*document_id=*/16,
+ typeb_desired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), desired_property, /*document_id=*/17,
+ typeb_desired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), desired_property, /*document_id=*/18,
+ typeb_desired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), desired_property, /*document_id=*/19,
+ typeb_desired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), desired_property, /*document_id=*/20,
+ typeb_desired_prop_id, /*keys=*/{2});
+
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/3));
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/5));
+ // Delete doc id = 3, 5, compress and keep the rest.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<DocumentId> document_id_old_to_new,
+ CompactDocStore());
+
+ DocumentId new_last_added_document_id = 18;
+ EXPECT_THAT(integer_index->Optimize(document_id_old_to_new,
+ new_last_added_document_id),
+ IsOk());
+ EXPECT_THAT(integer_index->last_added_document_id(),
+ Eq(new_last_added_document_id));
+
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/2, /*key_upper=*/2),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea))));
+
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/1, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/16 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea))));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_));
+
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(33));
+
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/2, /*key_upper=*/2),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea))));
+
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/1, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/16 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea))));
+}
+
+// This test covers the situation where Optimize causes us to throw out some of
+// the individual index storages (because they don't have any hits anymore).
+// In this case, any properties that added content to the wildcard storage (even
+// if all of their content was also deleted) should still be placed in the
+// wilcard storage.
+TEST_F(IntegerIndexTest, WildcardStorageAvailableIndicesAfterOptimize) {
+ // This test sets its schema assuming that max property storages == 32.
+ ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32));
+
+ PropertyConfigProto int_property_config =
+ PropertyConfigBuilder()
+ .SetName("otherProperty1")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .Build();
+ // Create a schema with two types:
+ // - TypeA has 34 properties:
+ // 'desiredProperty', 'otherProperty'*, 'undesiredProperty'
+ // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty'
+ // 1. The 32 'otherProperty's will consume all of the individual storages
+ // 2. TypeA.desiredProperty and TypeB.anotherProperty will both be assigned
+ // SectionId = 0 for their respective types.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(int_property_config)
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty2"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty3"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty4"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty5"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty6"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty7"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty8"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty9"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty10"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty11"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty12"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty13"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty14"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty15"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty16"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty17"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty18"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty19"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty20"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty21"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty22"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty23"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty24"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty25"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty26"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty27"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty28"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty29"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty30"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty31"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty32"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("desiredProperty"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("undesiredProperty")))
+ .Build();
+ ICING_ASSERT_OK(this->schema_store_->SetSchema(schema));
+
+ // Ids are assigned alphabetically, so the property ids are:
+ // TypeA.desiredProperty = 0
+ // TypeA.otherPropertyN = N
+ // TypeA.undesiredProperty = 33
+ // TypeB.anotherProperty = 0
+ // TypeB.desiredProperty = 1
+ SectionId typea_desired_prop_id = 0;
+ SectionId typea_undesired_prop_id = 33;
+ SectionId typea_other1_prop_id = 1;
+ std::string desired_property = "desiredProperty";
+ std::string undesired_property = "undesiredProperty";
+ std::string another_property = "anotherProperty";
+ std::string other_property_1 = "otherProperty1";
+
+ // Only the hits for 'desired_prop_id' should be returned.
+ std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id};
+
+ // Put 11 docs of "TypeA" into the document store.
+ DocumentProto doc =
+ DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build()));
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_));
+
+ // Index numeric content for other properties to force our property into the
+ // wildcard storage.
+ std::string other_property_path = "otherProperty";
+ for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) {
+ Index(integer_index.get(),
+ absl_ports::StrCat(other_property_path, std::to_string(i)),
+ /*document_id=*/0, /*section_id=*/i, /*keys=*/{i});
+ }
+
+ // Index numeric content for TypeA.desiredProperty
+ Index(integer_index.get(), desired_property, /*document_id=*/0,
+ typea_desired_prop_id, /*keys=*/{1});
+ Index(integer_index.get(), desired_property, /*document_id=*/1,
+ typea_desired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), desired_property, /*document_id=*/2,
+ typea_desired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), desired_property, /*document_id=*/3,
+ typea_desired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), desired_property, /*document_id=*/4,
+ typea_desired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), desired_property, /*document_id=*/5,
+ typea_desired_prop_id, /*keys=*/{2});
+
+ // Index the same numeric content for TypeA.undesiredProperty
+ Index(integer_index.get(), undesired_property, /*document_id=*/6,
+ typea_undesired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), undesired_property, /*document_id=*/7,
+ typea_undesired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), undesired_property, /*document_id=*/8,
+ typea_undesired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), undesired_property, /*document_id=*/9,
+ typea_undesired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), undesired_property, /*document_id=*/10,
+ typea_undesired_prop_id, /*keys=*/{2});
+
+ // Delete all the docs that had hits in otherProperty* and
+ // undesiredProperty.
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/0));
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/6));
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/7));
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/8));
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/9));
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/10));
+ // Delete doc id = 0, 6, 7, 8, 9, 10. Compress and keep the rest.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<DocumentId> document_id_old_to_new,
+ CompactDocStore());
+
+ DocumentId new_last_added_document_id = 5 - 1;
+ EXPECT_THAT(integer_index->Optimize(document_id_old_to_new,
+ new_last_added_document_id),
+ IsOk());
+ EXPECT_THAT(integer_index->last_added_document_id(),
+ Eq(new_last_added_document_id));
+
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/2, /*key_upper=*/2),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/5 - 1, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/2 - 1, expected_sections_typea))));
+
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/1, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/5 - 1, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/2 - 1, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/1 - 1, expected_sections_typea))));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_));
+
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(1));
+
+ // Add a new doc (docid==5) and a hit for desiredProperty. This should still
+ // be placed into the wildcard integer storage.
+ doc = DocumentBuilder().SetKey("ns1", "uri11").SetSchema("TypeA").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ Index(integer_index.get(), desired_property, /*document_id=*/5,
+ typea_desired_prop_id, /*keys=*/{12});
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(1));
+
+ EXPECT_THAT(Query(integer_index.get(), desired_property,
+ /*key_lower=*/12, /*key_upper=*/12),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/5, expected_sections_typea))));
+
+ // Add a new doc (docid==6) and a hit for undesiredProperty. This should still
+ // be placed into the wildcard integer storage.
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build()));
+ Index(integer_index.get(), undesired_property, /*document_id=*/6,
+ typea_undesired_prop_id, /*keys=*/{3});
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(1));
+
+ expected_sections_typea = {typea_undesired_prop_id};
+ EXPECT_THAT(Query(integer_index.get(), undesired_property,
+ /*key_lower=*/3, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/6, expected_sections_typea))));
+
+ // Add a new doc (docid==7) and a hit for otherProperty1. This should be given
+ // its own individual storage.
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build()));
+ Index(integer_index.get(), other_property_1, /*document_id=*/7,
+ typea_other1_prop_id, /*keys=*/{3});
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(2));
+
+ expected_sections_typea = {typea_other1_prop_id};
+ EXPECT_THAT(Query(integer_index.get(), other_property_1,
+ /*key_lower=*/3, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/7, expected_sections_typea))));
+}
+
} // namespace
} // namespace lib
diff --git a/icing/index/numeric/numeric-index.h b/icing/index/numeric/numeric-index.h
index 347260a..28640ca 100644
--- a/icing/index/numeric/numeric-index.h
+++ b/icing/index/numeric/numeric-index.h
@@ -23,8 +23,10 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/file/persistent-storage.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
namespace icing {
namespace lib {
@@ -126,8 +128,9 @@ class NumericIndex : public PersistentStorage {
// - INVALID_ARGUMENT_ERROR if key_lower > key_upper
// - Any other errors, depending on the actual implementation
virtual libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
- GetIterator(std::string_view property_path, T key_lower,
- T key_upper) const = 0;
+ GetIterator(std::string_view property_path, T key_lower, T key_upper,
+ const DocumentStore& document_store,
+ const SchemaStore& schema_store) const = 0;
// Reduces internal file sizes by reclaiming space and ids of deleted
// documents. Numeric index will convert all data (hits) to the new document
@@ -162,6 +165,10 @@ class NumericIndex : public PersistentStorage {
// last_added_document_id() or last_added_document_id() is invalid.
virtual void set_last_added_document_id(DocumentId document_id) = 0;
+ // The number of individual indices that the NumericIndex has created to
+ // search over all indexed properties thus far.
+ virtual int num_property_indices() const = 0;
+
protected:
explicit NumericIndex(const Filesystem& filesystem,
std::string&& working_path,
diff --git a/icing/index/numeric/posting-list-integer-index-accessor.cc b/icing/index/numeric/posting-list-integer-index-accessor.cc
index 220b240..af2aea4 100644
--- a/icing/index/numeric/posting-list-integer-index-accessor.cc
+++ b/icing/index/numeric/posting-list-integer-index-accessor.cc
@@ -64,6 +64,58 @@ PostingListIntegerIndexAccessor::CreateFromExisting(
// Returns the next batch of integer index data for the provided posting list.
libtextclassifier3::StatusOr<std::vector<IntegerIndexData>>
PostingListIntegerIndexAccessor::GetNextDataBatch() {
+ return GetNextDataBatchImpl(/*free_posting_list=*/false);
+}
+
+libtextclassifier3::StatusOr<std::vector<IntegerIndexData>>
+PostingListIntegerIndexAccessor::GetAllDataAndFree() {
+ if (preexisting_posting_list_ == nullptr) {
+ return absl_ports::FailedPreconditionError(
+ "Cannot retrieve data from a PostingListIntegerIndexAccessor that "
+ "was not created from a preexisting posting list.");
+ }
+
+ std::vector<IntegerIndexData> all_data;
+ while (true) {
+ ICING_ASSIGN_OR_RETURN(std::vector<IntegerIndexData> batch,
+ GetNextDataBatchImpl(/*free_posting_list=*/true));
+ if (batch.empty()) {
+ break;
+ }
+ std::move(batch.begin(), batch.end(), std::back_inserter(all_data));
+ }
+
+ return all_data;
+}
+
+libtextclassifier3::Status PostingListIntegerIndexAccessor::PrependData(
+ const IntegerIndexData& data) {
+ PostingListUsed& active_pl = (preexisting_posting_list_ != nullptr)
+ ? preexisting_posting_list_->posting_list
+ : in_memory_posting_list_;
+ libtextclassifier3::Status status =
+ serializer_->PrependData(&active_pl, data);
+ if (!absl_ports::IsResourceExhausted(status)) {
+ return status;
+ }
+ // There is no more room to add data to this current posting list! Therefore,
+ // we need to either move those data to a larger posting list or flush this
+ // posting list and create another max-sized posting list in the chain.
+ if (preexisting_posting_list_ != nullptr) {
+ ICING_RETURN_IF_ERROR(FlushPreexistingPostingList());
+ } else {
+ ICING_RETURN_IF_ERROR(FlushInMemoryPostingList());
+ }
+
+ // Re-add data. Should always fit since we just cleared
+ // in_memory_posting_list_. It's fine to explicitly reference
+ // in_memory_posting_list_ here because there's no way of reaching this line
+ // while preexisting_posting_list_ is still in use.
+ return serializer_->PrependData(&in_memory_posting_list_, data);
+}
+
+libtextclassifier3::StatusOr<std::vector<IntegerIndexData>>
+PostingListIntegerIndexAccessor::GetNextDataBatchImpl(bool free_posting_list) {
if (preexisting_posting_list_ == nullptr) {
if (has_reached_posting_list_chain_end_) {
return std::vector<IntegerIndexData>();
@@ -85,6 +137,11 @@ PostingListIntegerIndexAccessor::GetNextDataBatch() {
next_block_index = preexisting_posting_list_->next_block_index;
}
+ if (free_posting_list) {
+ ICING_RETURN_IF_ERROR(
+ storage_->FreePostingList(std::move(*preexisting_posting_list_)));
+ }
+
if (next_block_index != kInvalidBlockIndex) {
// Since we only have to deal with next block for max-sized posting list
// block, max_num_posting_lists is 1 and posting_list_index_bits is
@@ -103,31 +160,5 @@ PostingListIntegerIndexAccessor::GetNextDataBatch() {
return batch;
}
-libtextclassifier3::Status PostingListIntegerIndexAccessor::PrependData(
- const IntegerIndexData& data) {
- PostingListUsed& active_pl = (preexisting_posting_list_ != nullptr)
- ? preexisting_posting_list_->posting_list
- : in_memory_posting_list_;
- libtextclassifier3::Status status =
- serializer_->PrependData(&active_pl, data);
- if (!absl_ports::IsResourceExhausted(status)) {
- return status;
- }
- // There is no more room to add data to this current posting list! Therefore,
- // we need to either move those data to a larger posting list or flush this
- // posting list and create another max-sized posting list in the chain.
- if (preexisting_posting_list_ != nullptr) {
- ICING_RETURN_IF_ERROR(FlushPreexistingPostingList());
- } else {
- ICING_RETURN_IF_ERROR(FlushInMemoryPostingList());
- }
-
- // Re-add data. Should always fit since we just cleared
- // in_memory_posting_list_. It's fine to explicitly reference
- // in_memory_posting_list_ here because there's no way of reaching this line
- // while preexisting_posting_list_ is still in use.
- return serializer_->PrependData(&in_memory_posting_list_, data);
-}
-
} // namespace lib
} // namespace icing
diff --git a/icing/index/numeric/posting-list-integer-index-accessor.h b/icing/index/numeric/posting-list-integer-index-accessor.h
index 4c1eced..f0d3d25 100644
--- a/icing/index/numeric/posting-list-integer-index-accessor.h
+++ b/icing/index/numeric/posting-list-integer-index-accessor.h
@@ -50,7 +50,7 @@ class PostingListIntegerIndexAccessor : public PostingListAccessor {
Create(FlashIndexStorage* storage,
PostingListIntegerIndexSerializer* serializer);
- // Create a PostingListIntegerIndexAccessor with an existing posting list
+ // Creates a PostingListIntegerIndexAccessor with an existing posting list
// identified by existing_posting_list_id.
//
// RETURNS:
@@ -64,17 +64,30 @@ class PostingListIntegerIndexAccessor : public PostingListAccessor {
PostingListSerializer* GetSerializer() override { return serializer_; }
- // Retrieve the next batch of data in the posting list chain
+ // Retrieves the next batch of data in the posting list chain.
//
// RETURNS:
// - On success, a vector of integer index data in the posting list chain
- // - INTERNAL if called on an instance that was created via Create, if
- // unable to read the next posting list in the chain or if the posting
- // list has been corrupted somehow.
+ // - FAILED_PRECONDITION_ERROR if called on an instance that was created via
+ // Create.
+ // - INTERNAL_ERROR if unable to read the next posting list in the chain or
+ // if the posting list has been corrupted somehow.
libtextclassifier3::StatusOr<std::vector<IntegerIndexData>>
GetNextDataBatch();
- // Prepend one data. This may result in flushing the posting list to disk (if
+ // Retrieves all data from the posting list chain and frees all posting
+ // list(s).
+ //
+ // RETURNS:
+ // - On success, a vector of integer index data in the posting list chain
+ // - FAILED_PRECONDITION_ERROR if called on an instance that was created via
+ // Create.
+ // - INTERNAL_ERROR if unable to read the next posting list in the chain or
+ // if the posting list has been corrupted somehow.
+ libtextclassifier3::StatusOr<std::vector<IntegerIndexData>>
+ GetAllDataAndFree();
+
+ // Prepends one data. This may result in flushing the posting list to disk (if
// the PostingListIntegerIndexAccessor holds a max-sized posting list that
// is full) or freeing a pre-existing posting list if it is too small to fit
// all data necessary.
@@ -87,7 +100,15 @@ class PostingListIntegerIndexAccessor : public PostingListAccessor {
// posting list.
libtextclassifier3::Status PrependData(const IntegerIndexData& data);
- // TODO(b/259743562): [Optimization 1] add GetAndClear, IsFull for split
+ bool WantsSplit() const {
+ const PostingListUsed* current_pl =
+ preexisting_posting_list_ != nullptr
+ ? &preexisting_posting_list_->posting_list
+ : &in_memory_posting_list_;
+ // Only max-sized PLs get split. Smaller PLs just get copied to larger PLs.
+ return current_pl->size_in_bytes() == storage_->max_posting_list_bytes() &&
+ serializer_->IsFull(current_pl);
+ }
private:
explicit PostingListIntegerIndexAccessor(
@@ -96,6 +117,20 @@ class PostingListIntegerIndexAccessor : public PostingListAccessor {
: PostingListAccessor(storage, std::move(in_memory_posting_list)),
serializer_(serializer) {}
+ // Retrieves the next batch of data in the posting list chain.
+ //
+ // - free_posting_list: a boolean flag indicating whether freeing all posting
+ // lists after retrieving batch data.
+ //
+ // RETURNS:
+ // - On success, a vector of integer index data in the posting list chain
+ // - FAILED_PRECONDITION_ERROR if called on an instance that was created via
+ // Create.
+ // - INTERNAL_ERROR if unable to read the next posting list in the chain or
+ // if the posting list has been corrupted somehow.
+ libtextclassifier3::StatusOr<std::vector<IntegerIndexData>>
+ GetNextDataBatchImpl(bool free_posting_list);
+
PostingListIntegerIndexSerializer* serializer_; // Does not own.
};
diff --git a/icing/index/numeric/posting-list-integer-index-accessor_test.cc b/icing/index/numeric/posting-list-integer-index-accessor_test.cc
index 48221b9..f655fea 100644
--- a/icing/index/numeric/posting-list-integer-index-accessor_test.cc
+++ b/icing/index/numeric/posting-list-integer-index-accessor_test.cc
@@ -25,6 +25,7 @@
#include "gtest/gtest.h"
#include "icing/file/filesystem.h"
#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/posting-list-common.h"
#include "icing/file/posting_list/posting-list-identifier.h"
#include "icing/index/numeric/integer-index-data.h"
#include "icing/index/numeric/posting-list-integer-index-serializer.h"
@@ -42,6 +43,7 @@ using ::testing::ElementsAre;
using ::testing::ElementsAreArray;
using ::testing::Eq;
using ::testing::Lt;
+using ::testing::Ne;
using ::testing::SizeIs;
class PostingListIntegerIndexAccessorTest : public ::testing::Test {
@@ -402,6 +404,131 @@ TEST_F(PostingListIntegerIndexAccessorTest,
EXPECT_THAT(result2.status, IsOk());
}
+TEST_F(PostingListIntegerIndexAccessorTest, GetAllDataAndFree) {
+ IntegerIndexData data1(/*section_id=*/3, /*document_id=*/1, /*key=*/123);
+ IntegerIndexData data2(/*section_id=*/3, /*document_id=*/2, /*key=*/456);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor1,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // Add 2 data.
+ ICING_ASSERT_OK(pl_accessor1->PrependData(data1));
+ ICING_ASSERT_OK(pl_accessor1->PrependData(data2));
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor1).Finalize();
+ ICING_ASSERT_OK(result1.status);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor2,
+ PostingListIntegerIndexAccessor::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), result1.id));
+ EXPECT_THAT(pl_accessor2->GetAllDataAndFree(),
+ IsOkAndHolds(ElementsAre(data2, data1)));
+
+ // Allocate a new posting list with same size again.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor3,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // Add 2 data.
+ ICING_ASSERT_OK(pl_accessor3->PrependData(data1));
+ ICING_ASSERT_OK(pl_accessor3->PrependData(data2));
+ PostingListAccessor::FinalizeResult result3 =
+ std::move(*pl_accessor3).Finalize();
+ ICING_ASSERT_OK(result3.status);
+ // We should get the same id if the previous one has been freed correctly by
+ // GetAllDataAndFree.
+ EXPECT_THAT(result3.id, Eq(result1.id));
+}
+
+TEST_F(PostingListIntegerIndexAccessorTest, GetAllDataAndFreePostingListChain) {
+ uint32_t block_size = FlashIndexStorage::SelectBlockSize();
+ uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
+ block_size, serializer_->GetDataTypeBytes());
+ uint32_t max_num_data_single_posting_list =
+ max_posting_list_bytes / serializer_->GetDataTypeBytes();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor1,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+
+ // Prepend max_num_data_single_posting_list + 1 data.
+ std::vector<IntegerIndexData> data_vec;
+ for (uint32_t i = 0; i < max_num_data_single_posting_list + 1; ++i) {
+ IntegerIndexData data(/*section_id=*/3, static_cast<DocumentId>(i),
+ /*key=*/i);
+ ICING_ASSERT_OK(pl_accessor1->PrependData(data));
+ data_vec.push_back(data);
+ }
+
+ // This will cause:
+ // - Allocate the first max-sized posting list at block index = 1, storing
+ // max_num_data_single_posting_list data.
+ // - Allocate the second max-sized posting list at block index = 2, storing 1
+ // data. Also its next_block_index is 1.
+ // - IOW, we will get 2 -> 1 and result1.id points to 2.
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor1).Finalize();
+ ICING_ASSERT_OK(result1.status);
+
+ uint32_t first_pl_block_index = kInvalidBlockIndex;
+ {
+ // result1.id points at the second (max-sized) PL, and next_block_index of
+ // the second PL points to the first PL's block. Fetch the first PL's block
+ // index manually.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(result1.id));
+ first_pl_block_index = pl_holder.next_block_index;
+ }
+ ASSERT_THAT(first_pl_block_index, Ne(kInvalidBlockIndex));
+
+ // Call GetAllDataAndFree. This will free block 2 and block 1.
+ // Free block list: 1 -> 2 (since free block list is LIFO).
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor2,
+ PostingListIntegerIndexAccessor::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), result1.id));
+ EXPECT_THAT(
+ pl_accessor2->GetAllDataAndFree(),
+ IsOkAndHolds(ElementsAreArray(data_vec.rbegin(), data_vec.rend())));
+ pl_accessor2.reset();
+
+ // Allocate a new posting list with same size again.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor3,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // Add same set of data.
+ for (uint32_t i = 0; i < max_num_data_single_posting_list + 1; ++i) {
+ ICING_ASSERT_OK(pl_accessor3->PrependData(data_vec[i]));
+ }
+
+ // This will cause:
+ // - Allocate the first max-sized posting list from the free block list, which
+ // is block index = 1, storing max_num_data_single_posting_list data.
+ // - Allocate the second max-sized posting list from the next block in free
+ // block list, which is block index = 2, storing 1 data. Also its
+ // next_block_index should be 1.
+ PostingListAccessor::FinalizeResult result3 =
+ std::move(*pl_accessor3).Finalize();
+ ICING_ASSERT_OK(result3.status);
+ // We should get the same id if the previous one has been freed correctly by
+ // GetAllDataAndFree.
+ EXPECT_THAT(result3.id, Eq(result1.id));
+ // Also the first PL should be the same if it has been freed correctly by
+ // GetAllDataAndFree. Since it is a max-sized posting list, we just need to
+ // verify the block index.
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(result3.id));
+ EXPECT_THAT(pl_holder.next_block_index, Eq(first_pl_block_index));
+ }
+}
+
} // namespace
} // namespace lib
diff --git a/icing/index/numeric/posting-list-integer-index-serializer.h b/icing/index/numeric/posting-list-integer-index-serializer.h
index 9cfdb7a..ea2f2da 100644
--- a/icing/index/numeric/posting-list-integer-index-serializer.h
+++ b/icing/index/numeric/posting-list-integer-index-serializer.h
@@ -111,6 +111,12 @@ class PostingListIntegerIndexSerializer : public PostingListSerializer {
libtextclassifier3::Status PopFrontData(PostingListUsed* posting_list_used,
uint32_t num_data) const;
+ // Helper function to determine if posting list is full.
+ bool IsFull(const PostingListUsed* posting_list_used) const {
+ return GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
+ GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
+ }
+
private:
// Posting list layout formats:
//
@@ -228,11 +234,6 @@ class PostingListIntegerIndexSerializer : public PostingListSerializer {
// +-----------------+-----------------+---+--------+-----+--------+--------+
// Helpers to determine what state the posting list is in.
- bool IsFull(const PostingListUsed* posting_list_used) const {
- return GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
- GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
- }
-
bool IsAlmostFull(const PostingListUsed* posting_list_used) const {
return !GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();