1 files changed, 205 insertions, 0 deletions
diff --git a/icing/index/numeric/integer-index-bucket-util.cc b/icing/index/numeric/integer-index-bucket-util.cc
new file mode 100644
index 0000000..a05baab
--- /dev/null
+++ b/icing/index/numeric/integer-index-bucket-util.cc
@@ -0,0 +1,205 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/numeric/integer-index-bucket-util.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
+#include "icing/index/numeric/integer-index-data.h"
+
+namespace icing {
+namespace lib {
+
+namespace integer_index_bucket_util {
+
+namespace {
+
+// Helper function to determine if data slice [start, end) forms a "full
+// single-range bucket".
+//
+// Full single-range bucket: keys of all data are identical and # of them exceed
+// num_data_threshold.
+//
+// REQUIRES: data slice [start, end) are sorted by key.
+inline bool WouldBeFullSingleRangeBucket(
+    const std::vector<IntegerIndexData>::iterator& start,
+    const std::vector<IntegerIndexData>::iterator& end,
+    int32_t num_data_threshold) {
+  return std::distance(start, end) > num_data_threshold &&
+         start->key() == (end - 1)->key();
+}
+
+// Helper function to determine if a bucket is full single-range.
+//
+// REQUIRES:
+//   bucket.key_lower <= [bucket.start, bucket.end)->key() <= bucket.key_upper
+inline bool IsFullSingleRangeBucket(const DataRangeAndBucketInfo& bucket,
+                                    int32_t num_data_threshold) {
+  return bucket.key_lower == bucket.key_upper &&
+         WouldBeFullSingleRangeBucket(bucket.start, bucket.end,
+                                      num_data_threshold);
+}
+
+// Helper function to append new bucket(s) with corresponding data slice for
+// range [curr_key_lower, last_key] where last_key = (it_end - 1)->key().
+//
+// Also it handles an edge case:
+// If data slice [it_start, it_end) forms a "full single-range bucket" (see
+// WouldBeFullSingleRangeBucket for definition), then we have to put them into a
+// single range bucket [last_key, last_key] instead of [curr_key_lower,
+// last_key]. Also we have to deal with range [curr_key_lower, last_key - 1]:
+// - If the previous bucket exists and it is not a "full single-range bucket",
+//   then merge [curr_key_lower, last_key - 1] into the previous bucket, i.e.
+//   change the previous bucket's key_upper to (last_key - 1). Then we will end
+//   up having:
+//   - [prev_bucket.key_lower, last_key - 1]
+//   - [last_key, last_key]
+// - Otherwise, we have to create [curr_key_lower, last_key - 1] with
+//   empty data. Then we will end up having (Note: prev_bucket.key_upper ==
+//   curr_key_lower - 1):
+//   - [prev_bucket.key_lower, curr_key_lower - 1]
+//   - [curr_key_lower, last_key - 1]
+//   - [last_key, last_key]
+// This will avoid split bucket being called too frequently.
+// For example, original_key_lower = 0, original_key_upper = 50. If we have
+// (num_data_threshold + 1) data with key = 20 and another data with key = 40:
+// - Without this part, we will split them into [[0, 20], [21, 50]]. Then when
+//   adding data with key = 10 next round, we will invoke split again and split
+//   [0, 20] to [[0, 10], [11, 20]].
+// - With this part, we will split them into [[0, 19], [20, 20], [21, 50]],
+//   which will avoid splitting in the next round for key = 20.
+//
+// REQUIRES: it_start < it_end
+void AppendNewBuckets(const std::vector<IntegerIndexData>::iterator& it_start,
+                      const std::vector<IntegerIndexData>::iterator& it_end,
+                      int64_t curr_key_lower, int32_t num_data_threshold,
+                      std::vector<DataRangeAndBucketInfo>& results) {
+  int64_t last_key = (it_end - 1)->key();
+  if (curr_key_lower < last_key &&
+      WouldBeFullSingleRangeBucket(it_start, it_end, num_data_threshold)) {
+    if (!results.empty() &&
+        !IsFullSingleRangeBucket(results.back(), num_data_threshold)) {
+      // Previous bucket is not full single-range, so merge it to now hold the
+      // range [prev_bucket.key_lower, last_key - 1].
+      results.back().key_upper = last_key - 1;
+    } else {
+      // There is either no previous bucket or the previous bucket is full
+      // single-range. So add an empty bucket for the range [curr_key_lower,
+      // last_key - 1].
+      results.push_back(DataRangeAndBucketInfo(it_start, it_start,
+                                               curr_key_lower, last_key - 1));
+    }
+    curr_key_lower = last_key;
+  }
+  results.push_back(
+      DataRangeAndBucketInfo(it_start, it_end, curr_key_lower, last_key));
+}
+
+}  // namespace
+
+std::vector<DataRangeAndBucketInfo> Split(std::vector<IntegerIndexData>& data,
+                                          int64_t original_key_lower,
+                                          int64_t original_key_upper,
+                                          int32_t num_data_threshold) {
+  // Early return if there is no need to split.
+  if (data.size() <= num_data_threshold) {
+    return {DataRangeAndBucketInfo(data.begin(), data.end(), original_key_lower,
+                                   original_key_upper)};
+  }
+
+  // Sort data by key.
+  std::sort(
+      data.begin(), data.end(),
+      [](const IntegerIndexData& lhs, const IntegerIndexData& rhs) -> bool {
+        return lhs.key() < rhs.key();
+      });
+
+  std::vector<DataRangeAndBucketInfo> results;
+  int64_t curr_key_lower = original_key_lower;
+  // Sliding window [it_start, it_end) to separate data into different buckets.
+  auto it_start = data.begin();
+  auto it_end = data.begin();
+  while (it_end != data.end()) {
+    // Attempt to extend it_end by 1, but we have to include all data with the
+    // same key since they cannot be separated into different buckets. Also use
+    // extend_it_end to avoid modifying it_end directly. For some edge cases,
+    // the extension in a single round is extremely large (i.e. a lot of data
+    // have the same key), and we want to separate them. For example:
+    // - key = 0: 5 data
+    // - key = 1: num_data_threshold - 1 data
+    // In the second round, # of data in the sliding window will exceed the
+    // threshold. We want to separate all data with key = 0 into a single bucket
+    // instead of putting key = 0 and key = 1 together. Therefore, using
+    // extend_it_end allow us to preserve it_end of the previous round and be
+    // able to deal with this case.
+    auto extend_it_end = it_end + 1;
+    while (extend_it_end != data.end() &&
+           it_end->key() == extend_it_end->key()) {
+      ++extend_it_end;
+    }
+
+    if (std::distance(it_start, extend_it_end) > num_data_threshold &&
+        it_start != it_end) {
+      // Split data between [it_start, it_end) into range [curr_key_lower,
+      // (it_end - 1)->key()].
+      AppendNewBuckets(it_start, it_end, curr_key_lower, num_data_threshold,
+                       results);
+
+      // it_end at this moment won't be data.end(), so the last element of the
+      // new bucket can't have key == INT64_MAX. Therefore, it is safe to set
+      // curr_key_lower as ((it_end - 1)->key() + 1).
+      curr_key_lower = (it_end - 1)->key() + 1;
+      it_start = it_end;
+    }
+    it_end = extend_it_end;
+  }
+
+  // Handle the final range [curr_key_lower, original_key_upper].
+  if (curr_key_lower <= original_key_upper) {
+    if (it_start != it_end) {
+      AppendNewBuckets(it_start, it_end, curr_key_lower, num_data_threshold,
+                       results);
+
+      // AppendNewBuckets only handles range [curr_key_lower, (it_end -
+      // 1)->key()], so we have to handle range [(it_end - 1)->key() + 1,
+      // original_key_upper] if needed.
+      int64_t last_key = (it_end - 1)->key();
+      if (last_key != std::numeric_limits<int64_t>::max() &&
+          last_key + 1 <= original_key_upper) {
+        if (!results.empty() &&
+            !IsFullSingleRangeBucket(results.back(), num_data_threshold)) {
+          results.back().key_upper = original_key_upper;
+        } else {
+          results.push_back(DataRangeAndBucketInfo(
+              it_start, it_start, last_key + 1, original_key_upper));
+        }
+      }
+    } else {
+      results.push_back(DataRangeAndBucketInfo(it_start, it_end, curr_key_lower,
+                                               original_key_upper));
+    }
+  }
+
+  return results;
+}
+
+}  // namespace integer_index_bucket_util
+
+}  // namespace lib
+}  // namespace icing