diff options
Diffstat (limited to 'icing/index/numeric/integer-index-bucket-util.cc')
-rw-r--r-- | icing/index/numeric/integer-index-bucket-util.cc | 205 |
1 files changed, 205 insertions, 0 deletions
diff --git a/icing/index/numeric/integer-index-bucket-util.cc b/icing/index/numeric/integer-index-bucket-util.cc new file mode 100644 index 0000000..a05baab --- /dev/null +++ b/icing/index/numeric/integer-index-bucket-util.cc @@ -0,0 +1,205 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/numeric/integer-index-bucket-util.h" + +#include <algorithm> +#include <cstdint> +#include <iterator> +#include <limits> +#include <utility> +#include <vector> + +#include "icing/index/numeric/integer-index-data.h" + +namespace icing { +namespace lib { + +namespace integer_index_bucket_util { + +namespace { + +// Helper function to determine if data slice [start, end) forms a "full +// single-range bucket". +// +// Full single-range bucket: keys of all data are identical and # of them exceed +// num_data_threshold. +// +// REQUIRES: data slice [start, end) are sorted by key. +inline bool WouldBeFullSingleRangeBucket( + const std::vector<IntegerIndexData>::iterator& start, + const std::vector<IntegerIndexData>::iterator& end, + int32_t num_data_threshold) { + return std::distance(start, end) > num_data_threshold && + start->key() == (end - 1)->key(); +} + +// Helper function to determine if a bucket is full single-range. +// +// REQUIRES: +// bucket.key_lower <= [bucket.start, bucket.end)->key() <= bucket.key_upper +inline bool IsFullSingleRangeBucket(const DataRangeAndBucketInfo& bucket, + int32_t num_data_threshold) { + return bucket.key_lower == bucket.key_upper && + WouldBeFullSingleRangeBucket(bucket.start, bucket.end, + num_data_threshold); +} + +// Helper function to append new bucket(s) with corresponding data slice for +// range [curr_key_lower, last_key] where last_key = (it_end - 1)->key(). +// +// Also it handles an edge case: +// If data slice [it_start, it_end) forms a "full single-range bucket" (see +// WouldBeFullSingleRangeBucket for definition), then we have to put them into a +// single range bucket [last_key, last_key] instead of [curr_key_lower, +// last_key]. Also we have to deal with range [curr_key_lower, last_key - 1]: +// - If the previous bucket exists and it is not a "full single-range bucket", +// then merge [curr_key_lower, last_key - 1] into the previous bucket, i.e. +// change the previous bucket's key_upper to (last_key - 1). Then we will end +// up having: +// - [prev_bucket.key_lower, last_key - 1] +// - [last_key, last_key] +// - Otherwise, we have to create [curr_key_lower, last_key - 1] with +// empty data. Then we will end up having (Note: prev_bucket.key_upper == +// curr_key_lower - 1): +// - [prev_bucket.key_lower, curr_key_lower - 1] +// - [curr_key_lower, last_key - 1] +// - [last_key, last_key] +// This will avoid split bucket being called too frequently. +// For example, original_key_lower = 0, original_key_upper = 50. If we have +// (num_data_threshold + 1) data with key = 20 and another data with key = 40: +// - Without this part, we will split them into [[0, 20], [21, 50]]. Then when +// adding data with key = 10 next round, we will invoke split again and split +// [0, 20] to [[0, 10], [11, 20]]. +// - With this part, we will split them into [[0, 19], [20, 20], [21, 50]], +// which will avoid splitting in the next round for key = 20. +// +// REQUIRES: it_start < it_end +void AppendNewBuckets(const std::vector<IntegerIndexData>::iterator& it_start, + const std::vector<IntegerIndexData>::iterator& it_end, + int64_t curr_key_lower, int32_t num_data_threshold, + std::vector<DataRangeAndBucketInfo>& results) { + int64_t last_key = (it_end - 1)->key(); + if (curr_key_lower < last_key && + WouldBeFullSingleRangeBucket(it_start, it_end, num_data_threshold)) { + if (!results.empty() && + !IsFullSingleRangeBucket(results.back(), num_data_threshold)) { + // Previous bucket is not full single-range, so merge it to now hold the + // range [prev_bucket.key_lower, last_key - 1]. + results.back().key_upper = last_key - 1; + } else { + // There is either no previous bucket or the previous bucket is full + // single-range. So add an empty bucket for the range [curr_key_lower, + // last_key - 1]. + results.push_back(DataRangeAndBucketInfo(it_start, it_start, + curr_key_lower, last_key - 1)); + } + curr_key_lower = last_key; + } + results.push_back( + DataRangeAndBucketInfo(it_start, it_end, curr_key_lower, last_key)); +} + +} // namespace + +std::vector<DataRangeAndBucketInfo> Split(std::vector<IntegerIndexData>& data, + int64_t original_key_lower, + int64_t original_key_upper, + int32_t num_data_threshold) { + // Early return if there is no need to split. + if (data.size() <= num_data_threshold) { + return {DataRangeAndBucketInfo(data.begin(), data.end(), original_key_lower, + original_key_upper)}; + } + + // Sort data by key. + std::sort( + data.begin(), data.end(), + [](const IntegerIndexData& lhs, const IntegerIndexData& rhs) -> bool { + return lhs.key() < rhs.key(); + }); + + std::vector<DataRangeAndBucketInfo> results; + int64_t curr_key_lower = original_key_lower; + // Sliding window [it_start, it_end) to separate data into different buckets. + auto it_start = data.begin(); + auto it_end = data.begin(); + while (it_end != data.end()) { + // Attempt to extend it_end by 1, but we have to include all data with the + // same key since they cannot be separated into different buckets. Also use + // extend_it_end to avoid modifying it_end directly. For some edge cases, + // the extension in a single round is extremely large (i.e. a lot of data + // have the same key), and we want to separate them. For example: + // - key = 0: 5 data + // - key = 1: num_data_threshold - 1 data + // In the second round, # of data in the sliding window will exceed the + // threshold. We want to separate all data with key = 0 into a single bucket + // instead of putting key = 0 and key = 1 together. Therefore, using + // extend_it_end allow us to preserve it_end of the previous round and be + // able to deal with this case. + auto extend_it_end = it_end + 1; + while (extend_it_end != data.end() && + it_end->key() == extend_it_end->key()) { + ++extend_it_end; + } + + if (std::distance(it_start, extend_it_end) > num_data_threshold && + it_start != it_end) { + // Split data between [it_start, it_end) into range [curr_key_lower, + // (it_end - 1)->key()]. + AppendNewBuckets(it_start, it_end, curr_key_lower, num_data_threshold, + results); + + // it_end at this moment won't be data.end(), so the last element of the + // new bucket can't have key == INT64_MAX. Therefore, it is safe to set + // curr_key_lower as ((it_end - 1)->key() + 1). + curr_key_lower = (it_end - 1)->key() + 1; + it_start = it_end; + } + it_end = extend_it_end; + } + + // Handle the final range [curr_key_lower, original_key_upper]. + if (curr_key_lower <= original_key_upper) { + if (it_start != it_end) { + AppendNewBuckets(it_start, it_end, curr_key_lower, num_data_threshold, + results); + + // AppendNewBuckets only handles range [curr_key_lower, (it_end - + // 1)->key()], so we have to handle range [(it_end - 1)->key() + 1, + // original_key_upper] if needed. + int64_t last_key = (it_end - 1)->key(); + if (last_key != std::numeric_limits<int64_t>::max() && + last_key + 1 <= original_key_upper) { + if (!results.empty() && + !IsFullSingleRangeBucket(results.back(), num_data_threshold)) { + results.back().key_upper = original_key_upper; + } else { + results.push_back(DataRangeAndBucketInfo( + it_start, it_start, last_key + 1, original_key_upper)); + } + } + } else { + results.push_back(DataRangeAndBucketInfo(it_start, it_end, curr_key_lower, + original_key_upper)); + } + } + + return results; +} + +} // namespace integer_index_bucket_util + +} // namespace lib +} // namespace icing |