diff options
author | Terry Wang <tytytyww@google.com> | 2023-01-23 14:54:45 -0800 |
---|---|---|
committer | Terry Wang <tytytyww@google.com> | 2023-01-23 14:54:45 -0800 |
commit | cccafab8dfcae94d7072eb49ea971e3c688bdfc4 (patch) | |
tree | c5658335e3a0287ab3a7b25a3f18585920515f75 /icing/index/numeric | |
parent | ec29b14d5a908748d6e0699df7c85842a95a7b3c (diff) | |
download | icing-cccafab8dfcae94d7072eb49ea971e3c688bdfc4.tar.gz |
Update Icing from upstream.
Descriptions:
======================================================================
Fix time complexity regression for snippet retriever
======================================================================
Add support for QueryTerms SectionRestrict Map in Advanced Query.
======================================================================
[NumericSearch][Storage][9/x] Create unit tests for AddKeys and
GetIterator
======================================================================
[NumericSearch][Storage][8/x] Add options
======================================================================
Fix the stack overflow issue for icing advanced scoring/query
parsers/visitors by limiting the number of tokens allowed
======================================================================
Improves the has joinspec check
======================================================================
Fix the bug of DocHitInfoIteratorTermMain found by monkey test
======================================================================
Rename tests defined in DestructibleDirectoryTest to include
"directory" instead of "file".
======================================================================
Support projection in icing monkey test
======================================================================
[NumericSearch][Storage][7/x] Implement iterator for query
======================================================================
[NumericSearch][Storage][6/x] Implement AddKeys
======================================================================
Add support for relevance scoring by enabling term_frequency retrieval
in DocHitInfoIterators and by populating the QueryTermIteratorsMap.
======================================================================
[JoinableCache][1/x] Compute SchemaDelta for joinable properties
======================================================================
[NumericSearch][Storage][5/x] PersistToDisk
======================================================================
[NumericSearch][Storage][4.1/x] Create unit tests for initialize
======================================================================
[NumericSearch][Storage][4.0/x] Implement Create and Initialize functions
======================================================================
Rename PostingListUsedSerializer related classes
Bug: 193244409
Bug: 208654892
Bug: 246984163
Bug: 249829533
Bug: 256022027
Bug: 265258364
Bug: 265834832
Change-Id: Ibc7011a793ef5ed09eace0fb05d168adf5b4faca
Diffstat (limited to 'icing/index/numeric')
-rw-r--r-- | icing/index/numeric/integer-index-storage.cc | 872 | ||||
-rw-r--r-- | icing/index/numeric/integer-index-storage.h | 214 | ||||
-rw-r--r-- | icing/index/numeric/integer-index-storage_test.cc | 1147 | ||||
-rw-r--r-- | icing/index/numeric/posting-list-integer-index-accessor.cc (renamed from icing/index/numeric/posting-list-integer-index-data-accessor.cc) | 30 | ||||
-rw-r--r-- | icing/index/numeric/posting-list-integer-index-accessor.h (renamed from icing/index/numeric/posting-list-integer-index-data-accessor.h) | 38 | ||||
-rw-r--r-- | icing/index/numeric/posting-list-integer-index-accessor_test.cc (renamed from icing/index/numeric/posting-list-integer-index-data-accessor_test.cc) | 94 | ||||
-rw-r--r-- | icing/index/numeric/posting-list-integer-index-serializer.cc (renamed from icing/index/numeric/posting-list-used-integer-index-data-serializer.cc) | 45 | ||||
-rw-r--r-- | icing/index/numeric/posting-list-integer-index-serializer.h (renamed from icing/index/numeric/posting-list-used-integer-index-data-serializer.h) | 9 | ||||
-rw-r--r-- | icing/index/numeric/posting-list-integer-index-serializer_test.cc (renamed from icing/index/numeric/posting-list-used-integer-index-data-serializer_test.cc) | 54 |
9 files changed, 2355 insertions, 148 deletions
diff --git a/icing/index/numeric/integer-index-storage.cc b/icing/index/numeric/integer-index-storage.cc new file mode 100644 index 0000000..fa8fa3e --- /dev/null +++ b/icing/index/numeric/integer-index-storage.cc @@ -0,0 +1,872 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/numeric/integer-index-storage.h" + +#include <algorithm> +#include <limits> +#include <memory> +#include <queue> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/file/file-backed-vector.h" +#include "icing/file/filesystem.h" +#include "icing/file/memory-mapped-file.h" +#include "icing/file/posting_list/flash-index-storage.h" +#include "icing/file/posting_list/posting-list-identifier.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/numeric/doc-hit-info-iterator-numeric.h" +#include "icing/index/numeric/integer-index-data.h" +#include "icing/index/numeric/numeric-index.h" +#include "icing/index/numeric/posting-list-integer-index-accessor.h" +#include "icing/index/numeric/posting-list-integer-index-serializer.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +// Helper function to PWrite crcs and info to metadata_file_path. +libtextclassifier3::Status WriteMetadata( + const Filesystem& filesystem, const std::string& metadata_file_path, + const IntegerIndexStorage::Crcs* crcs, + const IntegerIndexStorage::Info* info) { + ScopedFd sfd(filesystem.OpenForWrite(metadata_file_path.c_str())); + if (!sfd.is_valid()) { + return absl_ports::InternalError("Failed to create metadata file"); + } + + // Write crcs and info. File layout: <Crcs><Info> + ICING_RETURN_IF_ERROR(crcs->Serialize(filesystem, sfd.get())); + ICING_RETURN_IF_ERROR(info->Serialize(filesystem, sfd.get())); + + return libtextclassifier3::Status::OK; +} + +// Helper function to update checksums from info and storages to a Crcs +// instance. +libtextclassifier3::Status UpdateChecksums( + IntegerIndexStorage::Crcs* crcs, IntegerIndexStorage::Info* info, + FileBackedVector<IntegerIndexStorage::Bucket>* sorted_buckets, + FileBackedVector<IntegerIndexStorage::Bucket>* unsorted_buckets, + FlashIndexStorage* flash_index_storage) { + // Compute crcs + ICING_ASSIGN_OR_RETURN(Crc32 sorted_buckets_crc, + sorted_buckets->ComputeChecksum()); + ICING_ASSIGN_OR_RETURN(Crc32 unsorted_buckets_crc, + unsorted_buckets->ComputeChecksum()); + + crcs->component_crcs.info_crc = info->ComputeChecksum().Get(); + crcs->component_crcs.sorted_buckets_crc = sorted_buckets_crc.Get(); + crcs->component_crcs.unsorted_buckets_crc = unsorted_buckets_crc.Get(); + // TODO(b/259744228): implement and update flash_index_storage checksum + crcs->component_crcs.flash_index_storage_crc = 0; + crcs->all_crc = crcs->component_crcs.ComputeChecksum().Get(); + + return libtextclassifier3::Status::OK; +} + +// Helper function to validate checksums. +libtextclassifier3::Status ValidateChecksums( + const IntegerIndexStorage::Crcs* crcs, + const IntegerIndexStorage::Info* info, + FileBackedVector<IntegerIndexStorage::Bucket>* sorted_buckets, + FileBackedVector<IntegerIndexStorage::Bucket>* unsorted_buckets, + FlashIndexStorage* flash_index_storage) { + if (crcs->all_crc != crcs->component_crcs.ComputeChecksum().Get()) { + return absl_ports::FailedPreconditionError( + "Invalid all crc for IntegerIndexStorage"); + } + + if (crcs->component_crcs.info_crc != info->ComputeChecksum().Get()) { + return absl_ports::FailedPreconditionError( + "Invalid info crc for IntegerIndexStorage"); + } + + ICING_ASSIGN_OR_RETURN(Crc32 sorted_buckets_crc, + sorted_buckets->ComputeChecksum()); + if (crcs->component_crcs.sorted_buckets_crc != sorted_buckets_crc.Get()) { + return absl_ports::FailedPreconditionError( + "Mismatch crc with IntegerIndexStorage sorted buckets"); + } + + ICING_ASSIGN_OR_RETURN(Crc32 unsorted_buckets_crc, + unsorted_buckets->ComputeChecksum()); + if (crcs->component_crcs.unsorted_buckets_crc != unsorted_buckets_crc.Get()) { + return absl_ports::FailedPreconditionError( + "Mismatch crc with IntegerIndexStorage unsorted buckets"); + } + + // TODO(b/259744228): implement and verify flash_index_storage checksum + + return libtextclassifier3::Status::OK; +} + +// The following 4 methods are helper functions to get the correct file path of +// metadata/sorted_buckets/unsorted_buckets/flash_index_storage, according to +// the given working directory. +std::string GetMetadataFilePath(std::string_view working_dir) { + return absl_ports::StrCat(working_dir, "/", IntegerIndexStorage::kFilePrefix, + ".m"); +} + +std::string GetSortedBucketsFilePath(std::string_view working_dir) { + return absl_ports::StrCat(working_dir, "/", IntegerIndexStorage::kFilePrefix, + ".s"); +} + +std::string GetUnsortedBucketsFilePath(std::string_view working_dir) { + return absl_ports::StrCat(working_dir, "/", IntegerIndexStorage::kFilePrefix, + ".u"); +} + +std::string GetFlashIndexStorageFilePath(std::string_view working_dir) { + return absl_ports::StrCat(working_dir, "/", IntegerIndexStorage::kFilePrefix, + ".f"); +} + +} // namespace + +// We add (BasicHits, key) into a bucket in DocumentId descending and SectionId +// ascending order. When doing range query, we may access buckets and want to +// return BasicHits to callers sorted by DocumentId. Therefore, this problem is +// actually "merge K sorted lists". +// To implement this algorithm via priority_queue, we create this wrapper class +// to store PostingListIntegerIndexAccessor for iterating through the posting +// list chain. +// - Non-relevant (i.e. not in range [key_lower, key_upper]) will be skipped. +// - Relevant BasicHits will be returned. +class BucketPostingListIterator { + public: + class Comparator { + public: + // REQUIRES: 2 BucketPostingListIterator* instances (lhs, rhs) should be + // valid, i.e. the preceding AdvanceAndFilter() succeeded. + bool operator()(const BucketPostingListIterator* lhs, + const BucketPostingListIterator* rhs) const { + // std::priority_queue is a max heap and we should return BasicHits in + // DocumentId descending order. + // - BucketPostingListIterator::operator< should have the same order as + // DocumentId. + // - BasicHit encodes inverted document id and BasicHit::operator< + // compares the encoded raw value directly. + // - Therefore, BucketPostingListIterator::operator< should compare + // BasicHit reversely. + // - This will make priority_queue return buckets in DocumentId + // descending and SectionId ascending order. + // - Whatever direction we sort SectionId by (or pop by priority_queue) + // doesn't matter because all hits for the same DocumentId will be + // merged into a single DocHitInfo. + return rhs->GetCurrentBasicHit() < lhs->GetCurrentBasicHit(); + } + }; + + explicit BucketPostingListIterator( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor) + : pl_accessor_(std::move(pl_accessor)), + should_retrieve_next_batch_(true) {} + + // Advances to the next relevant data. The posting list of a bucket contains + // keys within range [bucket.key_lower, bucket.key_upper], but some of them + // may be out of [query_key_lower, query_key_upper], so when advancing we have + // to filter out those non-relevant keys. + // + // Returns: + // - OK on success + // - OUT_OF_RANGE_ERROR if reaching the end (i.e. no more relevant data) + // - Any other PostingListIntegerIndexAccessor errors + libtextclassifier3::Status AdvanceAndFilter(int64_t query_key_lower, + int64_t query_key_upper) { + // Move curr_ until reaching a relevant data (i.e. key in range + // [query_key_lower, query_key_upper]) + do { + if (!should_retrieve_next_batch_) { + ++curr_; + should_retrieve_next_batch_ = + curr_ >= cached_batch_integer_index_data_.cend(); + } + if (should_retrieve_next_batch_) { + ICING_RETURN_IF_ERROR(GetNextDataBatch()); + should_retrieve_next_batch_ = false; + } + } while (curr_->key() < query_key_lower || curr_->key() > query_key_upper); + + return libtextclassifier3::Status::OK; + } + + const BasicHit& GetCurrentBasicHit() const { return curr_->basic_hit(); } + + private: + // Gets next batch of data from the posting list chain, caches in + // cached_batch_integer_index_data_, and sets curr_ to the begin of the cache. + libtextclassifier3::Status GetNextDataBatch() { + auto cached_batch_integer_index_data_or = pl_accessor_->GetNextDataBatch(); + if (!cached_batch_integer_index_data_or.ok()) { + ICING_LOG(WARNING) + << "Fail to get next batch data from posting list due to: " + << cached_batch_integer_index_data_or.status().error_message(); + return std::move(cached_batch_integer_index_data_or).status(); + } + + cached_batch_integer_index_data_ = + std::move(cached_batch_integer_index_data_or).ValueOrDie(); + curr_ = cached_batch_integer_index_data_.cbegin(); + + if (cached_batch_integer_index_data_.empty()) { + return absl_ports::OutOfRangeError("End of iterator"); + } + + return libtextclassifier3::Status::OK; + } + + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor_; + std::vector<IntegerIndexData> cached_batch_integer_index_data_; + std::vector<IntegerIndexData>::const_iterator curr_; + bool should_retrieve_next_batch_; +}; + +// Wrapper class to iterate through IntegerIndexStorage to get relevant data. +// It uses multiple BucketPostingListIterator instances from different candidate +// buckets and merges all relevant BasicHits from these buckets by +// std::priority_queue in DocumentId descending order. Also different SectionIds +// of the same DocumentId will be merged into SectionIdMask and returned as a +// single DocHitInfo. +class IntegerIndexStorageIterator : public NumericIndex<int64_t>::Iterator { + public: + explicit IntegerIndexStorageIterator( + int64_t query_key_lower, int64_t query_key_upper, + std::vector<std::unique_ptr<BucketPostingListIterator>>&& bucket_pl_iters) + : NumericIndex<int64_t>::Iterator(query_key_lower, query_key_upper) { + std::vector<BucketPostingListIterator*> bucket_pl_iters_raw_ptrs; + for (std::unique_ptr<BucketPostingListIterator>& bucket_pl_itr : + bucket_pl_iters) { + // Before adding BucketPostingListIterator* into the priority queue, we + // have to advance the bucket iterator to the first valid data since the + // priority queue needs valid data to compare the order. + // Note: it is possible that the bucket iterator fails to advance for the + // first round, because data could be filtered out by [query_key_lower, + // query_key_upper]. In this case, just discard the iterator. + if (bucket_pl_itr->AdvanceAndFilter(query_key_lower, query_key_upper) + .ok()) { + bucket_pl_iters_raw_ptrs.push_back(bucket_pl_itr.get()); + bucket_pl_iters_.push_back(std::move(bucket_pl_itr)); + } + } + + pq_ = std::priority_queue<BucketPostingListIterator*, + std::vector<BucketPostingListIterator*>, + BucketPostingListIterator::Comparator>( + comparator_, std::move(bucket_pl_iters_raw_ptrs)); + } + + ~IntegerIndexStorageIterator() override = default; + + // Advances to the next DocHitInfo. Note: several BucketPostingListIterator + // instances may be advanced if they point to data with the same DocumentId. + // + // Returns: + // - OK on success + // - OUT_OF_RANGE_ERROR if reaching the end (i.e. no more relevant data) + // - Any BucketPostingListIterator errors + libtextclassifier3::Status Advance() override; + + DocHitInfo GetDocHitInfo() const override { return doc_hit_info_; } + + private: + BucketPostingListIterator::Comparator comparator_; + + // We have to fetch and pop the top BucketPostingListIterator from + // std::priority_queue to perform "merge K sorted lists algorithm". + // - Since std::priority_queue::pop() doesn't return the top element, we have + // to call top() and pop() together. + // - std::move the top() element by const_cast is not an appropriate way + // because it introduces transient unstable state for std::priority_queue. + // - We don't want to copy BucketPostingListIterator, either. + // - Therefore, add bucket_pl_iters_ for the ownership of all + // BucketPostingListIterator instances and std::priority_queue uses the raw + // pointer. So when calling top(), we can simply copy the raw pointer via + // top() and avoid transient unstable state. + std::vector<std::unique_ptr<BucketPostingListIterator>> bucket_pl_iters_; + std::priority_queue<BucketPostingListIterator*, + std::vector<BucketPostingListIterator*>, + BucketPostingListIterator::Comparator> + pq_; + + DocHitInfo doc_hit_info_; +}; + +libtextclassifier3::Status IntegerIndexStorageIterator::Advance() { + if (pq_.empty()) { + return absl_ports::OutOfRangeError("End of iterator"); + } + + DocumentId document_id = pq_.top()->GetCurrentBasicHit().document_id(); + doc_hit_info_ = DocHitInfo(document_id); + // Merge sections with same document_id into a single DocHitInfo + while (!pq_.empty() && + pq_.top()->GetCurrentBasicHit().document_id() == document_id) { + doc_hit_info_.UpdateSection(pq_.top()->GetCurrentBasicHit().section_id()); + + BucketPostingListIterator* bucket_itr = pq_.top(); + pq_.pop(); + + if (bucket_itr->AdvanceAndFilter(key_lower_, key_upper_).ok()) { + pq_.push(bucket_itr); + } + } + + return libtextclassifier3::Status::OK; +} + +bool IntegerIndexStorage::Options::IsValid() const { + if (!HasCustomInitBuckets()) { + return true; + } + + // Verify if the range of buckets are disjoint and the range union is + // [INT64_MIN, INT64_MAX]. + std::vector<Bucket> buckets; + buckets.reserve(custom_init_sorted_buckets.size() + + custom_init_unsorted_buckets.size()); + buckets.insert(buckets.end(), custom_init_sorted_buckets.begin(), + custom_init_sorted_buckets.end()); + buckets.insert(buckets.end(), custom_init_unsorted_buckets.begin(), + custom_init_unsorted_buckets.end()); + if (buckets.empty()) { + return false; + } + std::sort(buckets.begin(), buckets.end()); + int64_t expected_lower = std::numeric_limits<int64_t>::min(); + for (int i = 0; i < buckets.size(); ++i) { + // key_lower should not be greater than key_upper and init bucket should + // have invalid posting list identifier. + if (buckets[i].key_lower() > buckets[i].key_upper() || + buckets[i].posting_list_identifier().is_valid()) { + return false; + } + + if (buckets[i].key_lower() != expected_lower) { + return false; + } + + // If it is the last bucket, then key_upper should be INT64_MAX. Otherwise + // it should not be INT64_MAX. Use XOR for this logic. + if ((buckets[i].key_upper() == std::numeric_limits<int64_t>::max()) ^ + (i == buckets.size() - 1)) { + return false; + } + expected_lower = buckets[i].key_upper() + 1; + } + + return true; +} + +/* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> +IntegerIndexStorage::Create( + const Filesystem& filesystem, std::string_view base_dir, Options options, + PostingListIntegerIndexSerializer* posting_list_serializer) { + if (!options.IsValid()) { + return absl_ports::InvalidArgumentError( + "Invalid IntegerIndexStorage options"); + } + + std::string working_dir = absl_ports::StrCat(base_dir, "/", kSubDirectory); + if (!filesystem.FileExists(GetMetadataFilePath(working_dir).c_str()) || + !filesystem.FileExists(GetSortedBucketsFilePath(working_dir).c_str()) || + !filesystem.FileExists(GetUnsortedBucketsFilePath(working_dir).c_str()) || + !filesystem.FileExists( + GetFlashIndexStorageFilePath(working_dir).c_str())) { + // Delete working_dir if any of them is missing, and reinitialize. + if (!filesystem.DeleteDirectoryRecursively(working_dir.c_str())) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to delete directory: ", working_dir)); + } + return InitializeNewFiles(filesystem, std::move(working_dir), + std::move(options), posting_list_serializer); + } + return InitializeExistingFiles(filesystem, std::move(working_dir), + std::move(options), posting_list_serializer); +} + +IntegerIndexStorage::~IntegerIndexStorage() { + if (!PersistToDisk().ok()) { + ICING_LOG(WARNING) + << "Failed to persist hash map to disk while destructing " + << working_dir_; + } +} + +class IntegerIndexStorageComparator { + public: + bool operator()(const IntegerIndexStorage::Bucket& lhs, int64_t rhs) const { + return lhs.key_upper() < rhs; + } +} kComparator; + +libtextclassifier3::Status IntegerIndexStorage::AddKeys( + DocumentId document_id, SectionId section_id, + std::vector<int64_t>&& new_keys) { + if (new_keys.empty()) { + return libtextclassifier3::Status::OK; + } + + std::sort(new_keys.begin(), new_keys.end()); + + // Dedupe + auto last = std::unique(new_keys.begin(), new_keys.end()); + new_keys.erase(last, new_keys.end()); + + // When adding keys into a bucket, we potentially split it into 2 new buckets + // and one of them will be added into the unsorted bucket array. + // When handling keys belonging to buckets in the unsorted bucket array, we + // don't have to (and must not) handle these newly split buckets. Therefore, + // collect all newly split buckets in another vector and append them into the + // unsorted bucket array after adding all keys. + std::vector<Bucket> new_buckets; + + // Binary search range of the sorted bucket array. + const Bucket* sorted_bucket_arr_begin = sorted_buckets_->array(); + const Bucket* sorted_bucket_arr_end = + sorted_buckets_->array() + sorted_buckets_->num_elements(); + + // Step 1: handle keys belonging to buckets in the sorted bucket array. Skip + // keys belonging to the unsorted bucket array and deal with them in + // the next step. + // - Iterate through new_keys by it_start. + // - Binary search (std::lower_bound comparing key with bucket.key_upper()) to + // find the first bucket in the sorted bucket array with key_upper is not + // smaller than (>=) the key. + // - Skip (and advance it_start) all keys smaller than the target bucket's + // key_lower. It means these keys belong to buckets in the unsorted bucket + // array and we will deal with them later. + // - Find it_end such that all keys within range [it_start, it_end) belong to + // the target bucket. + // - Batch add keys within range [it_start, it_end) into the target bucket. + auto it_start = new_keys.cbegin(); + while (it_start != new_keys.cend() && + sorted_bucket_arr_begin < sorted_bucket_arr_end) { + // Use std::lower_bound to find the first bucket in the sorted bucket array + // with key_upper >= *it_start. + const Bucket* target_bucket = std::lower_bound( + sorted_bucket_arr_begin, sorted_bucket_arr_end, *it_start, kComparator); + if (target_bucket >= sorted_bucket_arr_end) { + // Keys in range [it_start, new_keys.cend()) are greater than all sorted + // buckets' key_upper, so we can end step 1. In fact, they belong to + // buckets in the unsorted bucket array and we will deal with them in + // step 2. + break; + } + + // Sequential instead of binary search to advance it_start and it_end for + // several reasons: + // - Eventually we have to iterate through all keys within range [it_start, + // it_end) and add them into the posting list, so binary search doesn't + // improve the overall time complexity. + // - Binary search may jump to far-away indices, which potentially + // downgrades the cache performance. + + // After binary search, we've ensured *it_start <= + // target_bucket->key_upper(), but it is still possible that *it_start (and + // the next several keys) is still smaller than target_bucket->key_lower(), + // so we have to skip them. In fact, they belong to buckets in the unsorted + // bucket array. + // + // For example: + // - sorted bucket array: [(INT_MIN, 0), (1, 5), (100, 300), (301, 550)] + // - unsorted bucket array: [(550, INT_MAX), (6, 99)] + // - new_keys: [10, 20, 40, 102, 150, 200, 500, 600] + // std::lower_bound (target = 10) will get target_bucket = (100, 300), but + // we have to skip 10, 20, 40 because they are smaller than 100 (the + // bucket's key_lower). We should move it_start pointing to key 102. + while (it_start != new_keys.cend() && + *it_start < target_bucket->key_lower()) { + ++it_start; + } + + // Locate it_end such that all keys within range [it_start, it_end) belong + // to target_bucket and all keys outside this range don't belong to + // target_bucket. + // + // For example (continue above), we should locate it_end to point to key + // 500. + auto it_end = it_start; + while (it_end != new_keys.cend() && *it_end <= target_bucket->key_upper()) { + ++it_end; + } + + // Now, keys within range [it_start, it_end) belong to target_bucket, so + // construct IntegerIndexData and add them into the bucket's posting list. + if (it_start != it_end) { + ICING_ASSIGN_OR_RETURN( + FileBackedVector<Bucket>::MutableView mutable_bucket, + sorted_buckets_->GetMutable(target_bucket - + sorted_buckets_->array())); + ICING_ASSIGN_OR_RETURN( + std::vector<Bucket> round_new_buckets, + AddKeysIntoBucketAndSplitIfNecessary( + document_id, section_id, it_start, it_end, mutable_bucket)); + new_buckets.insert(new_buckets.end(), round_new_buckets.begin(), + round_new_buckets.end()); + } + + it_start = it_end; + sorted_bucket_arr_begin = target_bucket + 1; + } + + // Step 2: handle keys belonging to buckets in the unsorted bucket array. They + // were skipped in step 1. + // For each bucket in the unsorted bucket array, find [it_start, it_end) such + // that all keys within this range belong to the bucket and add them. + // - Binary search (std::lower_bound comparing bucket.key_lower() with key) to + // find it_start. + // - Sequential advance (start from it_start) to find it_end. Same reason as + // above for choosing sequential advance instead of binary search. + // - Add keys within range [it_start, it_end) into the bucket. + for (int32_t i = 0; i < unsorted_buckets_->num_elements(); ++i) { + ICING_ASSIGN_OR_RETURN(FileBackedVector<Bucket>::MutableView mutable_bucket, + unsorted_buckets_->GetMutable(i)); + auto it_start = std::lower_bound(new_keys.cbegin(), new_keys.cend(), + mutable_bucket.Get().key_lower()); + if (it_start == new_keys.cend()) { + continue; + } + + // Sequential advance instead of binary search to find the correct position + // of it_end for the same reasons mentioned above in step 1. + auto it_end = it_start; + while (it_end != new_keys.cend() && + *it_end <= mutable_bucket.Get().key_upper()) { + ++it_end; + } + + // Now, key within range [it_start, it_end) belong to the bucket, so + // construct IntegerIndexData and add them into the bucket's posting list. + if (it_start != it_end) { + ICING_ASSIGN_OR_RETURN( + std::vector<Bucket> round_new_buckets, + AddKeysIntoBucketAndSplitIfNecessary( + document_id, section_id, it_start, it_end, mutable_bucket)); + new_buckets.insert(new_buckets.end(), round_new_buckets.begin(), + round_new_buckets.end()); + } + } + + // Step 3: append new buckets into the unsorted bucket array. + if (!new_buckets.empty()) { + ICING_ASSIGN_OR_RETURN( + typename FileBackedVector<Bucket>::MutableArrayView mutable_new_arr, + unsorted_buckets_->Allocate(new_buckets.size())); + mutable_new_arr.SetArray(/*idx=*/0, new_buckets.data(), new_buckets.size()); + } + + // Step 4: merge the unsorted bucket array into the sorted bucket array if the + // length of the unsorted bucket array exceeds the threshold. + // TODO(b/259743562): [Optimization 1] implement merge + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> +IntegerIndexStorage::GetIterator(int64_t query_key_lower, + int64_t query_key_upper) const { + if (query_key_lower > query_key_upper) { + return absl_ports::InvalidArgumentError( + "key_lower should not be greater than key_upper"); + } + + std::vector<std::unique_ptr<BucketPostingListIterator>> bucket_pl_iters; + + // Sorted bucket array + const Bucket* sorted_bucket_arr_begin = sorted_buckets_->array(); + const Bucket* sorted_bucket_arr_end = + sorted_buckets_->array() + sorted_buckets_->num_elements(); + for (const Bucket* bucket = + std::lower_bound(sorted_bucket_arr_begin, sorted_bucket_arr_end, + query_key_lower, kComparator); + bucket < sorted_bucket_arr_end && bucket->key_lower() <= query_key_upper; + ++bucket) { + if (!bucket->posting_list_identifier().is_valid()) { + continue; + } + + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor, + PostingListIntegerIndexAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_serializer_, + bucket->posting_list_identifier())); + bucket_pl_iters.push_back( + std::make_unique<BucketPostingListIterator>(std::move(pl_accessor))); + } + + // Unsorted bucket array + for (int32_t i = 0; i < unsorted_buckets_->num_elements(); ++i) { + ICING_ASSIGN_OR_RETURN(const Bucket* bucket, unsorted_buckets_->Get(i)); + if (query_key_upper < bucket->key_lower() || + query_key_lower > bucket->key_upper() || + !bucket->posting_list_identifier().is_valid()) { + // Skip bucket whose range doesn't overlap with [query_key_lower, + // query_key_upper] or posting_list_identifier is invalid. + continue; + } + + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor, + PostingListIntegerIndexAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_serializer_, + bucket->posting_list_identifier())); + bucket_pl_iters.push_back( + std::make_unique<BucketPostingListIterator>(std::move(pl_accessor))); + } + + return std::make_unique<DocHitInfoIteratorNumeric<int64_t>>( + std::make_unique<IntegerIndexStorageIterator>( + query_key_lower, query_key_upper, std::move(bucket_pl_iters))); +} + +libtextclassifier3::Status IntegerIndexStorage::PersistToDisk() { + ICING_RETURN_IF_ERROR(sorted_buckets_->PersistToDisk()); + ICING_RETURN_IF_ERROR(unsorted_buckets_->PersistToDisk()); + if (!flash_index_storage_->PersistToDisk()) { + return absl_ports::InternalError( + "Fail to persist FlashIndexStorage to disk"); + } + + ICING_RETURN_IF_ERROR(UpdateChecksums(crcs(), info(), sorted_buckets_.get(), + unsorted_buckets_.get(), + flash_index_storage_.get())); + // Changes should have been applied to the underlying file when using + // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an + // extra safety step to ensure they are written out. + ICING_RETURN_IF_ERROR(metadata_mmapped_file_->PersistToDisk()); + + return libtextclassifier3::Status::OK; +} + +/* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> +IntegerIndexStorage::InitializeNewFiles( + const Filesystem& filesystem, std::string&& working_dir, Options&& options, + PostingListIntegerIndexSerializer* posting_list_serializer) { + // Create working directory. + if (!filesystem.CreateDirectoryRecursively(working_dir.c_str())) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to create directory: ", working_dir)); + } + + // TODO(b/259743562): [Optimization 1] decide max # buckets, unsorted buckets + // threshold + // Initialize sorted_buckets + int32_t pre_mapping_mmap_size = sizeof(Bucket) * (1 << 10); + int32_t max_file_size = + pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize; + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem, GetSortedBucketsFilePath(working_dir), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size, + pre_mapping_mmap_size)); + + // Initialize unsorted_buckets + pre_mapping_mmap_size = sizeof(Bucket) * 100; + max_file_size = + pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize; + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem, GetUnsortedBucketsFilePath(working_dir), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size, + pre_mapping_mmap_size)); + + // Initialize flash_index_storage + ICING_ASSIGN_OR_RETURN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_dir), + &filesystem, posting_list_serializer)); + + if (options.HasCustomInitBuckets()) { + // Insert custom init buckets. + std::sort(options.custom_init_sorted_buckets.begin(), + options.custom_init_sorted_buckets.end()); + ICING_ASSIGN_OR_RETURN( + typename FileBackedVector<Bucket>::MutableArrayView + mutable_new_sorted_bucket_arr, + sorted_buckets->Allocate(options.custom_init_sorted_buckets.size())); + mutable_new_sorted_bucket_arr.SetArray( + /*idx=*/0, options.custom_init_sorted_buckets.data(), + options.custom_init_sorted_buckets.size()); + + ICING_ASSIGN_OR_RETURN(typename FileBackedVector<Bucket>::MutableArrayView + mutable_new_unsorted_bucket_arr, + unsorted_buckets->Allocate( + options.custom_init_unsorted_buckets.size())); + mutable_new_unsorted_bucket_arr.SetArray( + /*idx=*/0, options.custom_init_unsorted_buckets.data(), + options.custom_init_unsorted_buckets.size()); + + // After inserting buckets, we can clear vectors since there is no need to + // cache them. + options.custom_init_sorted_buckets.clear(); + options.custom_init_unsorted_buckets.clear(); + } else { + // Insert one bucket with range [INT64_MIN, INT64_MAX]. + ICING_RETURN_IF_ERROR(sorted_buckets->Append(Bucket( + /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()))); + } + ICING_RETURN_IF_ERROR(sorted_buckets->PersistToDisk()); + + // Create and initialize new info + Info new_info; + new_info.magic = Info::kMagic; + new_info.num_keys = 0; + + // Compute checksums + Crcs new_crcs; + ICING_RETURN_IF_ERROR( + UpdateChecksums(&new_crcs, &new_info, sorted_buckets.get(), + unsorted_buckets.get(), &flash_index_storage)); + + const std::string metadata_file_path = GetMetadataFilePath(working_dir); + // Write new metadata file + ICING_RETURN_IF_ERROR( + WriteMetadata(filesystem, metadata_file_path, &new_crcs, &new_info)); + + // Mmap the content of the crcs and info. + ICING_ASSIGN_OR_RETURN( + MemoryMappedFile metadata_mmapped_file, + MemoryMappedFile::Create(filesystem, metadata_file_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + /*max_file_size=*/kMetadataFileSize, + /*pre_mapping_file_offset=*/0, + /*pre_mapping_mmap_size=*/kMetadataFileSize)); + + return std::unique_ptr<IntegerIndexStorage>(new IntegerIndexStorage( + filesystem, std::move(working_dir), std::move(options), + posting_list_serializer, + std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)), + std::move(sorted_buckets), std::move(unsorted_buckets), + std::make_unique<FlashIndexStorage>(std::move(flash_index_storage)))); +} + +/* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> +IntegerIndexStorage::InitializeExistingFiles( + const Filesystem& filesystem, std::string&& working_dir, Options&& options, + PostingListIntegerIndexSerializer* posting_list_serializer) { + // Mmap the content of the crcs and info. + ICING_ASSIGN_OR_RETURN( + MemoryMappedFile metadata_mmapped_file, + MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_dir), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + /*max_file_size=*/kMetadataFileSize, + /*pre_mapping_file_offset=*/0, + /*pre_mapping_mmap_size=*/kMetadataFileSize)); + + // TODO(b/259743562): [Optimization 1] decide max # buckets, unsorted buckets + // threshold + // Initialize sorted_buckets + int32_t pre_mapping_mmap_size = sizeof(Bucket) * (1 << 10); + int32_t max_file_size = + pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize; + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem, GetSortedBucketsFilePath(working_dir), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size, + pre_mapping_mmap_size)); + + // Initialize unsorted_buckets + pre_mapping_mmap_size = sizeof(Bucket) * 100; + max_file_size = + pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize; + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem, GetUnsortedBucketsFilePath(working_dir), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size, + pre_mapping_mmap_size)); + + // Initialize flash_index_storage + ICING_ASSIGN_OR_RETURN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_dir), + &filesystem, posting_list_serializer)); + + Crcs* crcs_ptr = reinterpret_cast<Crcs*>( + metadata_mmapped_file.mutable_region() + Crcs::kFileOffset); + Info* info_ptr = reinterpret_cast<Info*>( + metadata_mmapped_file.mutable_region() + Info::kFileOffset); + // Validate checksums of info and 3 storages. + ICING_RETURN_IF_ERROR( + ValidateChecksums(crcs_ptr, info_ptr, sorted_buckets.get(), + unsorted_buckets.get(), &flash_index_storage)); + + return std::unique_ptr<IntegerIndexStorage>(new IntegerIndexStorage( + filesystem, std::move(working_dir), std::move(options), + posting_list_serializer, + std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)), + std::move(sorted_buckets), std::move(unsorted_buckets), + std::make_unique<FlashIndexStorage>(std::move(flash_index_storage)))); +} + +libtextclassifier3::StatusOr<std::vector<IntegerIndexStorage::Bucket>> +IntegerIndexStorage::AddKeysIntoBucketAndSplitIfNecessary( + DocumentId document_id, SectionId section_id, + const std::vector<int64_t>::const_iterator& it_start, + const std::vector<int64_t>::const_iterator& it_end, + FileBackedVector<Bucket>::MutableView& mutable_bucket) { + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor; + if (mutable_bucket.Get().posting_list_identifier().is_valid()) { + ICING_ASSIGN_OR_RETURN( + pl_accessor, PostingListIntegerIndexAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_serializer_, + mutable_bucket.Get().posting_list_identifier())); + } else { + ICING_ASSIGN_OR_RETURN( + pl_accessor, PostingListIntegerIndexAccessor::Create( + flash_index_storage_.get(), posting_list_serializer_)); + } + + for (auto it = it_start; it != it_end; ++it) { + // TODO(b/259743562): [Optimization 1] implement split bucket if pl is full + // and the bucket is splittable + ICING_RETURN_IF_ERROR(pl_accessor->PrependData( + IntegerIndexData(section_id, document_id, *it))); + } + + // TODO(b/259743562): [Optimization 1] implement split and return new buckets. + // We will change the original bucket (mutable_bucket) + // in-place to one of the new buckets, and the rest will + // be returned and added into unsorted buckets in AddKeys. + PostingListAccessor::FinalizeResult result = + std::move(*pl_accessor).Finalize(); + if (!result.status.ok()) { + return result.status; + } + + mutable_bucket.Get().set_posting_list_identifier(result.id); + + return std::vector<Bucket>(); +} +} // namespace lib +} // namespace icing diff --git a/icing/index/numeric/integer-index-storage.h b/icing/index/numeric/integer-index-storage.h index 2048e76..562060b 100644 --- a/icing/index/numeric/integer-index-storage.h +++ b/icing/index/numeric/integer-index-storage.h @@ -19,13 +19,19 @@ #include <memory> #include <string> #include <string_view> +#include <vector> +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" #include "icing/file/posting_list/flash-index-storage.h" #include "icing/file/posting_list/posting-list-identifier.h" -#include "icing/index/numeric/posting-list-used-integer-index-data-serializer.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/numeric/posting-list-integer-index-serializer.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" #include "icing/util/crc32.h" namespace icing { @@ -91,6 +97,14 @@ class IntegerIndexStorage { } } __attribute__((packed)); + libtextclassifier3::Status Serialize(const Filesystem& filesystem, + int fd) const { + if (!filesystem.PWrite(fd, kFileOffset, this, sizeof(*this))) { + return absl_ports::InternalError("Failed to write crcs into file"); + } + return libtextclassifier3::Status::OK; + } + bool operator==(const Crcs& other) const { return all_crc == other.all_crc && component_crcs == other.component_crcs; } @@ -108,6 +122,14 @@ class IntegerIndexStorage { int32_t magic; int32_t num_keys; + libtextclassifier3::Status Serialize(const Filesystem& filesystem, + int fd) const { + if (!filesystem.PWrite(fd, kFileOffset, this, sizeof(*this))) { + return absl_ports::InternalError("Failed to write info into file"); + } + return libtextclassifier3::Status::OK; + } + Crc32 ComputeChecksum() const { return Crc32( std::string_view(reinterpret_cast<const char*>(this), sizeof(Info))); @@ -115,6 +137,9 @@ class IntegerIndexStorage { } __attribute__((packed)); static_assert(sizeof(Info) == 8, ""); + static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info); + static_assert(kMetadataFileSize == 28); + // Bucket class Bucket { public: @@ -126,17 +151,26 @@ class IntegerIndexStorage { static constexpr int32_t kMaxNumBuckets = 1 << 23; explicit Bucket(int64_t key_lower, int64_t key_upper, - PostingListIdentifier posting_list_identifier) + PostingListIdentifier posting_list_identifier = + PostingListIdentifier::kInvalid) : key_lower_(key_lower), key_upper_(key_upper), posting_list_identifier_(posting_list_identifier) {} + bool operator<(const Bucket& other) const { + return key_lower_ < other.key_lower_; + } + // For FileBackedVector bool operator==(const Bucket& other) const { return key_lower_ == other.key_lower_ && key_upper_ == other.key_upper_ && posting_list_identifier_ == other.posting_list_identifier_; } + int64_t key_lower() const { return key_lower_; } + + int64_t key_upper() const { return key_upper_; } + PostingListIdentifier posting_list_identifier() const { return posting_list_identifier_; } @@ -160,19 +194,185 @@ class IntegerIndexStorage { FileBackedVector<Bucket>::kElementTypeSize, "Max # of buckets cannot fit into FileBackedVector"); + struct Options { + explicit Options() {} + + explicit Options(std::vector<Bucket> custom_init_sorted_buckets_in, + std::vector<Bucket> custom_init_unsorted_buckets_in) + : custom_init_sorted_buckets(std::move(custom_init_sorted_buckets_in)), + custom_init_unsorted_buckets( + std::move(custom_init_unsorted_buckets_in)) {} + + bool IsValid() const; + + bool HasCustomInitBuckets() const { + return !custom_init_sorted_buckets.empty() || + !custom_init_unsorted_buckets.empty(); + } + + // Custom buckets when initializing new files. If both are empty, then the + // initial bucket is (INT64_MIN, INT64_MAX). Usually we only set them in the + // unit test. Note that all buckets in custom_init_sorted_buckets and + // custom_init_unsorted_buckets should be disjoint and the range union + // should be [INT64_MIN, INT64_MAX]. + std::vector<Bucket> custom_init_sorted_buckets; + std::vector<Bucket> custom_init_unsorted_buckets; + }; + + static constexpr std::string_view kSubDirectory = "storage_dir"; + static constexpr std::string_view kFilePrefix = "integer_index_storage"; + + // Creates a new IntegerIndexStorage instance to index integers. For directory + // management purpose, we define working_dir as "<base_dir>/storage_dir", and + // all underlying files will be stored under it. If any of the underlying file + // is missing, then delete the whole working_dir and (re)initialize with new + // ones. Otherwise initialize and create the instance by existing files. + // + // filesystem: Object to make system level calls + // base_dir: Specifies the base directory for all integer index data related + // files to be stored. As mentioned above, all files will be stored + // under working_dir (which is "<base_dir>/storage_dir"). + // options: Options instance. + // posting_list_serializer: a PostingListIntegerIndexSerializer instance to + // serialize/deserialize integer index data to/from + // posting lists. + // + // Returns: + // - INVALID_ARGUMENT_ERROR if any value in options is invalid. + // - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored + // checksum. + // - INTERNAL_ERROR on I/O errors. + // - Any FileBackedVector/FlashIndexStorage errors. + static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> + Create(const Filesystem& filesystem, std::string_view base_dir, + Options options, + PostingListIntegerIndexSerializer* posting_list_serializer); + + // Delete copy and move constructor/assignment operator. + IntegerIndexStorage(const IntegerIndexStorage&) = delete; + IntegerIndexStorage& operator=(const IntegerIndexStorage&) = delete; + + IntegerIndexStorage(IntegerIndexStorage&&) = delete; + IntegerIndexStorage& operator=(IntegerIndexStorage&&) = delete; + + ~IntegerIndexStorage(); + + // Batch adds new keys (of the same DocumentId and SectionId) into the integer + // index storage. + // Note that since we separate different property names into different integer + // index storages, it is impossible to have keys in a single document across + // multiple sections to add into the same integer index storage. + // + // Returns: + // - OK on success + // - Any FileBackedVector or PostingList errors + libtextclassifier3::Status AddKeys(DocumentId document_id, + SectionId section_id, + std::vector<int64_t>&& new_keys); + + // Returns a DocHitInfoIteratorNumeric<int64_t> (in DocHitInfoIterator + // interface type format) for iterating through all docs which have the + // specified (integer) property contents in range [query_key_lower, + // query_key_upper]. + // When iterating through all relevant doc hits, it: + // - Merges multiple SectionIds of doc hits with same DocumentId into a single + // SectionIdMask and constructs DocHitInfo. + // - Returns DocHitInfo in descending DocumentId order. + // + // Returns: + // - On success: a DocHitInfoIterator(Numeric) + // - INVALID_ARGUMENT_ERROR if query_key_lower > query_key_upper + // - Any FileBackedVector or PostingList errors + libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator( + int64_t query_key_lower, int64_t query_key_upper) const; + + // Flushes content to underlying files. + // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error + libtextclassifier3::Status PersistToDisk(); + private: + static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> + InitializeNewFiles( + const Filesystem& filesystem, std::string&& working_dir, + Options&& options, + PostingListIntegerIndexSerializer* posting_list_serializer); + + static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> + InitializeExistingFiles( + const Filesystem& filesystem, std::string&& working_dir, + Options&& options, + PostingListIntegerIndexSerializer* posting_list_serializer); + explicit IntegerIndexStorage( - const Filesystem& filesystem, std::string_view base_dir, - PostingListUsedIntegerIndexDataSerializer* serializer, + const Filesystem& filesystem, std::string&& working_dir, + Options&& options, + PostingListIntegerIndexSerializer* posting_list_serializer, std::unique_ptr<MemoryMappedFile> metadata_mmapped_file, std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets, std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets, - std::unique_ptr<FlashIndexStorage> flash_index_storage); + std::unique_ptr<FlashIndexStorage> flash_index_storage) + : filesystem_(filesystem), + working_dir_(std::move(working_dir)), + options_(std::move(options)), + posting_list_serializer_(posting_list_serializer), + metadata_mmapped_file_(std::move(metadata_mmapped_file)), + sorted_buckets_(std::move(sorted_buckets)), + unsorted_buckets_(std::move(unsorted_buckets)), + flash_index_storage_(std::move(flash_index_storage)) {} + + // Helper function to add keys in range [it_start, it_end) into the given + // bucket. It handles the bucket and its corresponding posting list(s) to make + // searching and indexing efficient. + // + // When the (single) posting list of the bucket is full: + // - If the size of posting list hasn't reached the max size, then just simply + // add a new key into it, and PostingListAccessor mechanism will + // automatically double the size of the posting list. + // - Else: + // - If the bucket is splittable (i.e. key_lower < key_upper), then split it + // into several new buckets with new ranges, and split the data (according + // to their keys and the range of new buckets) of the original posting + // list into several new posting lists. + // TODO(b/259743562): [Optimization 1] implement split + // - Otherwise, just simply add a new key into it, and PostingListAccessor + // mechanism will automatically create a new max size posting list and + // chain them. + // + // Returns: + // - On success: a vector of new Buckets (to add into the unsorted bucket + // array later) + // - Any FileBackedVector or PostingList errors + libtextclassifier3::StatusOr<std::vector<Bucket>> + AddKeysIntoBucketAndSplitIfNecessary( + DocumentId document_id, SectionId section_id, + const std::vector<int64_t>::const_iterator& it_start, + const std::vector<int64_t>::const_iterator& it_end, + FileBackedVector<Bucket>::MutableView& mutable_bucket); + + Crcs* crcs() { + return reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() + + Crcs::kFileOffset); + } + + Info* info() { + return reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() + + Info::kFileOffset); + } + + const Info* info() const { + return reinterpret_cast<const Info*>(metadata_mmapped_file_->region() + + Info::kFileOffset); + } const Filesystem& filesystem_; - std::string base_dir_; + std::string working_dir_; + + Options options_; - PostingListUsedIntegerIndexDataSerializer* serializer_; // Does not own. + PostingListIntegerIndexSerializer* posting_list_serializer_; // Does not own. std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_; std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets_; diff --git a/icing/index/numeric/integer-index-storage_test.cc b/icing/index/numeric/integer-index-storage_test.cc new file mode 100644 index 0000000..0afc96b --- /dev/null +++ b/icing/index/numeric/integer-index-storage_test.cc @@ -0,0 +1,1147 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/numeric/integer-index-storage.h" + +#include <cstdint> +#include <limits> +#include <memory> +#include <string> +#include <string_view> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/file/posting_list/posting-list-identifier.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/numeric/posting-list-integer-index-serializer.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" +#include "icing/util/crc32.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::ElementsAre; +using ::testing::ElementsAreArray; +using ::testing::Eq; +using ::testing::HasSubstr; +using ::testing::IsEmpty; +using ::testing::IsFalse; +using ::testing::IsTrue; +using ::testing::Ne; + +using Bucket = IntegerIndexStorage::Bucket; +using Crcs = IntegerIndexStorage::Crcs; +using Info = IntegerIndexStorage::Info; +using Options = IntegerIndexStorage::Options; + +static constexpr int32_t kCorruptedValueOffset = 3; +static constexpr DocumentId kDefaultDocumentId = 123; +static constexpr SectionId kDefaultSectionId = 31; + +class IntegerIndexStorageTest : public ::testing::Test { + protected: + void SetUp() override { + base_dir_ = GetTestTempDir() + "/integer_index_storage_test"; + + serializer_ = std::make_unique<PostingListIntegerIndexSerializer>(); + } + + void TearDown() override { + serializer_.reset(); + filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); + } + + Filesystem filesystem_; + std::string base_dir_; + std::unique_ptr<PostingListIntegerIndexSerializer> serializer_; +}; + +libtextclassifier3::StatusOr<std::vector<DocHitInfo>> Query( + const IntegerIndexStorage* storage, int64_t key_lower, int64_t key_upper) { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iter, + storage->GetIterator(key_lower, key_upper)); + std::vector<DocHitInfo> hits; + while (iter->Advance().ok()) { + hits.push_back(iter->doc_hit_info()); + } + return hits; +} + +TEST_F(IntegerIndexStorageTest, OptionsEmptyCustomInitBucketsShouldBeValid) { + EXPECT_THAT(Options().IsValid(), IsTrue()); +} + +TEST_F(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsRange) { + // Invalid custom init sorted bucket + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), 5), Bucket(9, 6)}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(10, std::numeric_limits<int64_t>::max())}) + .IsValid(), + IsFalse()); + + // Invalid custom init unsorted bucket + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(10, std::numeric_limits<int64_t>::max())}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), 5), Bucket(9, 6)}) + .IsValid(), + IsFalse()); +} + +TEST_F(IntegerIndexStorageTest, + OptionsInvalidCustomInitBucketsPostingListIdentifier) { + // Custom init buckets should contain invalid posting list identifier. + PostingListIdentifier valid_posting_list_identifier(0, 0, 0); + ASSERT_THAT(valid_posting_list_identifier.is_valid(), IsTrue()); + + // Invalid custom init sorted bucket + EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), + std::numeric_limits<int64_t>::max(), + valid_posting_list_identifier)}, + /*custom_init_unsorted_buckets_in=*/{}) + .IsValid(), + IsFalse()); + + // Invalid custom init unsorted bucket + EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), + std::numeric_limits<int64_t>::max(), + valid_posting_list_identifier)}) + .IsValid(), + IsFalse()); +} + +TEST_F(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsOverlapping) { + // sorted buckets overlap + EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), -100), + Bucket(-100, std::numeric_limits<int64_t>::max())}, + /*custom_init_unsorted_buckets_in=*/{}) + .IsValid(), + IsFalse()); + + // unsorted buckets overlap + EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(-100, std::numeric_limits<int64_t>::max()), + Bucket(std::numeric_limits<int64_t>::min(), -100)}) + .IsValid(), + IsFalse()); + + // Cross buckets overlap + EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), -100), + Bucket(-99, 0)}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(200, std::numeric_limits<int64_t>::max()), + Bucket(0, 50), Bucket(51, 199)}) + .IsValid(), + IsFalse()); +} + +TEST_F(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsUnion) { + // Missing INT64_MAX + EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), -100), + Bucket(-99, 0)}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(1, 1000)}) + .IsValid(), + IsFalse()); + + // Missing INT64_MIN + EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(-200, -100), Bucket(-99, 0)}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(1, std::numeric_limits<int64_t>::max())}) + .IsValid(), + IsFalse()); + + // Missing some intermediate ranges + EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), -100)}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(1, std::numeric_limits<int64_t>::max())}) + .IsValid(), + IsFalse()); +} + +TEST_F(IntegerIndexStorageTest, InvalidBaseDir) { + EXPECT_THAT(IntegerIndexStorage::Create(filesystem_, "/dev/null", Options(), + serializer_.get()), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); +} + +TEST_F(IntegerIndexStorageTest, CreateWithInvalidOptionsShouldFail) { + Options invalid_options( + /*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{ + Bucket(-100, std::numeric_limits<int64_t>::max()), + Bucket(std::numeric_limits<int64_t>::min(), -100)}); + ASSERT_THAT(invalid_options.IsValid(), IsFalse()); + + EXPECT_THAT(IntegerIndexStorage::Create(filesystem_, base_dir_, + invalid_options, serializer_.get()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(IntegerIndexStorageTest, InitializeNewFiles) { + { + // Create new integer index storage + ASSERT_FALSE(filesystem_.DirectoryExists(base_dir_.c_str())); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(filesystem_, base_dir_, Options(), + serializer_.get())); + + ICING_ASSERT_OK(storage->PersistToDisk()); + } + + // Metadata file should be initialized correctly for both info and crcs + // sections. + const std::string metadata_file_path = + absl_ports::StrCat(base_dir_, "/", IntegerIndexStorage::kSubDirectory, + "/", IntegerIndexStorage::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_TRUE(metadata_sfd.is_valid()); + + // Check info section + Info info; + ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info), + Info::kFileOffset)); + EXPECT_THAT(info.magic, Eq(Info::kMagic)); + EXPECT_THAT(info.num_keys, Eq(0)); + + // Check crcs section + Crcs crcs; + ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs), + Crcs::kFileOffset)); + // # of elements in sorted_buckets should be 1, so it should have non-zero + // crc value. + EXPECT_THAT(crcs.component_crcs.sorted_buckets_crc, Ne(0)); + // Other empty file backed vectors should have 0 crc value. + EXPECT_THAT(crcs.component_crcs.unsorted_buckets_crc, Eq(0)); + EXPECT_THAT(crcs.component_crcs.flash_index_storage_crc, Eq(0)); + EXPECT_THAT(crcs.component_crcs.info_crc, + Eq(Crc32(std::string_view(reinterpret_cast<const char*>(&info), + sizeof(Info))) + .Get())); + EXPECT_THAT(crcs.all_crc, + Eq(Crc32(std::string_view( + reinterpret_cast<const char*>(&crcs.component_crcs), + sizeof(Crcs::ComponentCrcs))) + .Get())); +} + +TEST_F(IntegerIndexStorageTest, + InitializationShouldFailWithoutPersistToDiskOrDestruction) { + // Create new integer index storage + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(filesystem_, base_dir_, Options(), + serializer_.get())); + + // Insert some data. + ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/0, /*section_id=*/20, + /*new_keys=*/{0, 100, -100})); + ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/1, /*section_id=*/2, + /*new_keys=*/{3, -1000, 500})); + ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/2, /*section_id=*/15, + /*new_keys=*/{-6, 321, 98})); + + // Without calling PersistToDisk, checksums will not be recomputed or synced + // to disk, so initializing another instance on the same files should fail. + EXPECT_THAT(IntegerIndexStorage::Create(filesystem_, base_dir_, Options(), + serializer_.get()), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); +} + +TEST_F(IntegerIndexStorageTest, InitializationShouldSucceedWithPersistToDisk) { + // Create new integer index storage + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage1, + IntegerIndexStorage::Create(filesystem_, base_dir_, Options(), + serializer_.get())); + + // Insert some data. + ICING_ASSERT_OK(storage1->AddKeys(/*document_id=*/0, /*section_id=*/20, + /*new_keys=*/{0, 100, -100})); + ICING_ASSERT_OK(storage1->AddKeys(/*document_id=*/1, /*section_id=*/2, + /*new_keys=*/{3, -1000, 500})); + ICING_ASSERT_OK(storage1->AddKeys(/*document_id=*/2, /*section_id=*/15, + /*new_keys=*/{-6, 321, 98})); + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<DocHitInfo> doc_hit_info_vec, + Query(storage1.get(), + /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max())); + + // After calling PersistToDisk, all checksums should be recomputed and synced + // correctly to disk, so initializing another instance on the same files + // should succeed, and we should be able to get the same contents. + ICING_EXPECT_OK(storage1->PersistToDisk()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage2, + IntegerIndexStorage::Create(filesystem_, base_dir_, Options(), + serializer_.get())); + EXPECT_THAT( + Query(storage2.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), + IsOkAndHolds( + ElementsAreArray(doc_hit_info_vec.begin(), doc_hit_info_vec.end()))); +} + +TEST_F(IntegerIndexStorageTest, InitializationShouldSucceedAfterDestruction) { + std::vector<DocHitInfo> doc_hit_info_vec; + { + // Create new integer index storage + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(filesystem_, base_dir_, Options(), + serializer_.get())); + + ICING_ASSERT_OK_AND_ASSIGN( + doc_hit_info_vec, + Query(storage.get(), + /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max())); + } + + { + // The previous instance went out of scope and was destructed. Although we + // didn't call PersistToDisk explicitly, the destructor should invoke it and + // thus initializing another instance on the same files should succeed, and + // we should be able to get the same contents. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(filesystem_, base_dir_, Options(), + serializer_.get())); + EXPECT_THAT( + Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), + IsOkAndHolds(ElementsAreArray(doc_hit_info_vec.begin(), + doc_hit_info_vec.end()))); + } +} + +TEST_F(IntegerIndexStorageTest, + InitializeExistingFilesWithWrongAllCrcShouldFail) { + { + // Create new integer index storage + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(filesystem_, base_dir_, Options(), + serializer_.get())); + ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId, + /*new_keys=*/{0, 100, -100})); + + ICING_ASSERT_OK(storage->PersistToDisk()); + } + + const std::string metadata_file_path = + absl_ports::StrCat(base_dir_, "/", IntegerIndexStorage::kSubDirectory, + "/", IntegerIndexStorage::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_TRUE(metadata_sfd.is_valid()); + + Crcs crcs; + ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs), + Crcs::kFileOffset)); + + // Manually corrupt all_crc + crcs.all_crc += kCorruptedValueOffset; + ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs, + sizeof(Crcs))); + metadata_sfd.reset(); + + { + // Attempt to create the integer index storage with metadata containing + // corrupted all_crc. This should fail. + libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> + storage_or = IntegerIndexStorage::Create(filesystem_, base_dir_, + Options(), serializer_.get()); + EXPECT_THAT(storage_or, + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(storage_or.status().error_message(), + HasSubstr("Invalid all crc for IntegerIndexStorage")); + } +} + +TEST_F(IntegerIndexStorageTest, + InitializeExistingFilesWithCorruptedInfoShouldFail) { + { + // Create new integer index storage + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(filesystem_, base_dir_, Options(), + serializer_.get())); + ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId, + /*new_keys=*/{0, 100, -100})); + + ICING_ASSERT_OK(storage->PersistToDisk()); + } + + const std::string metadata_file_path = + absl_ports::StrCat(base_dir_, "/", IntegerIndexStorage::kSubDirectory, + "/", IntegerIndexStorage::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_TRUE(metadata_sfd.is_valid()); + + Info info; + ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info), + Info::kFileOffset)); + + // Modify info, but don't update the checksum. This would be similar to + // corruption of info. + info.num_keys += kCorruptedValueOffset; + ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Info::kFileOffset, &info, + sizeof(Info))); + { + // Attempt to create the integer index storage with info that doesn't match + // its checksum and confirm that it fails. + libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> + storage_or = IntegerIndexStorage::Create(filesystem_, base_dir_, + Options(), serializer_.get()); + EXPECT_THAT(storage_or, + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(storage_or.status().error_message(), + HasSubstr("Invalid info crc for IntegerIndexStorage")); + } +} + +TEST_F(IntegerIndexStorageTest, + InitializeExistingFilesWithWrongSortedBucketsCrcShouldFail) { + { + // Create new integer index storage + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(filesystem_, base_dir_, Options(), + serializer_.get())); + ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId, + /*new_keys=*/{0, 100, -100})); + + ICING_ASSERT_OK(storage->PersistToDisk()); + } + + const std::string metadata_file_path = + absl_ports::StrCat(base_dir_, "/", IntegerIndexStorage::kSubDirectory, + "/", IntegerIndexStorage::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_TRUE(metadata_sfd.is_valid()); + + Crcs crcs; + ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs), + Crcs::kFileOffset)); + + // Manually corrupt sorted_buckets_crc + crcs.component_crcs.sorted_buckets_crc += kCorruptedValueOffset; + crcs.all_crc = crcs.component_crcs.ComputeChecksum().Get(); + ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs, + sizeof(Crcs))); + { + // Attempt to create the integer index storage with metadata containing + // corrupted sorted_buckets_crc. This should fail. + libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> + storage_or = IntegerIndexStorage::Create(filesystem_, base_dir_, + Options(), serializer_.get()); + EXPECT_THAT(storage_or, + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT( + storage_or.status().error_message(), + HasSubstr("Mismatch crc with IntegerIndexStorage sorted buckets")); + } +} + +TEST_F(IntegerIndexStorageTest, + InitializeExistingFilesWithWrongUnsortedBucketsCrcShouldFail) { + { + // Create new integer index storage + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(filesystem_, base_dir_, Options(), + serializer_.get())); + ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId, + /*new_keys=*/{0, 100, -100})); + + ICING_ASSERT_OK(storage->PersistToDisk()); + } + + const std::string metadata_file_path = + absl_ports::StrCat(base_dir_, "/", IntegerIndexStorage::kSubDirectory, + "/", IntegerIndexStorage::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_TRUE(metadata_sfd.is_valid()); + + Crcs crcs; + ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs), + Crcs::kFileOffset)); + + // Manually corrupt unsorted_buckets_crc + crcs.component_crcs.unsorted_buckets_crc += kCorruptedValueOffset; + crcs.all_crc = crcs.component_crcs.ComputeChecksum().Get(); + ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs, + sizeof(Crcs))); + { + // Attempt to create the integer index storage with metadata containing + // corrupted unsorted_buckets_crc. This should fail. + libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> + storage_or = IntegerIndexStorage::Create(filesystem_, base_dir_, + Options(), serializer_.get()); + EXPECT_THAT(storage_or, + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT( + storage_or.status().error_message(), + HasSubstr("Mismatch crc with IntegerIndexStorage unsorted buckets")); + } +} + +// TODO(b/259744228): add test for corrupted flash_index_storage_crc + +TEST_F(IntegerIndexStorageTest, InvalidQuery) { + // Create new integer index storage + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(filesystem_, base_dir_, Options(), + serializer_.get())); + EXPECT_THAT( + storage->GetIterator(/*query_key_lower=*/0, /*query_key_upper=*/-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(IntegerIndexStorageTest, ExactQuerySortedBuckets) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. + std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, base_dir_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets)), + serializer_.get())); + + // Add some keys into sorted buckets [(-1000,-100), (200,300)]. + EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId, + /*new_keys=*/{-500}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId, + /*new_keys=*/{208}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/2, kDefaultSectionId, + /*new_keys=*/{-200}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/3, kDefaultSectionId, + /*new_keys=*/{-1000}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/4, kDefaultSectionId, + /*new_keys=*/{300}), + IsOk()); + + std::vector<SectionId> expected_sections = {kDefaultSectionId}; + // Exact query on key in each sorted bucket should get the correct result. + EXPECT_THAT(Query(storage.get(), /*key_lower=*/-500, /*key_upper=*/-500), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/208, /*key_upper=*/208), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/1, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/-200, /*key_upper=*/-200), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/2, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/-1000, /*key_upper=*/-1000), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/3, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/300, /*key_upper=*/300), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/4, expected_sections)))); +} + +TEST_F(IntegerIndexStorageTest, ExactQueryUnsortedBuckets) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. + std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, base_dir_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets)), + serializer_.get())); + + // Add some keys into unsorted buckets [(1000,INT64_MAX), (INT64_MIN,-1001)]. + EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId, + /*new_keys=*/{1024}), + IsOk()); + EXPECT_THAT( + storage->AddKeys(/*document_id=*/1, kDefaultSectionId, + /*new_keys=*/{std::numeric_limits<int64_t>::max()}), + IsOk()); + EXPECT_THAT( + storage->AddKeys(/*document_id=*/2, kDefaultSectionId, + /*new_keys=*/{std::numeric_limits<int64_t>::min()}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/3, kDefaultSectionId, + /*new_keys=*/{-1500}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/4, kDefaultSectionId, + /*new_keys=*/{2000}), + IsOk()); + + std::vector<SectionId> expected_sections = {kDefaultSectionId}; + // Exact query on key in each unsorted bucket should get the correct result. + EXPECT_THAT(Query(storage.get(), /*key_lower=*/1024, /*key_upper=*/1024), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); + EXPECT_THAT( + Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::max(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), + IsOkAndHolds( + ElementsAre(EqualsDocHitInfo(/*document_id=*/1, expected_sections)))); + EXPECT_THAT( + Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::min()), + IsOkAndHolds( + ElementsAre(EqualsDocHitInfo(/*document_id=*/2, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/-1500, /*key_upper=*/-1500), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/3, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/2000, /*key_upper=*/2000), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/4, expected_sections)))); +} + +TEST_F(IntegerIndexStorageTest, ExactQueryIdenticalKeys) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. + std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, base_dir_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets)), + serializer_.get())); + + // Add some keys into buckets [(0,100), (1000,INT64_MAX)]. + EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId, + /*new_keys=*/{1024}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId, + /*new_keys=*/{1024}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/2, kDefaultSectionId, + /*new_keys=*/{20}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/3, kDefaultSectionId, + /*new_keys=*/{20}), + IsOk()); + + std::vector<SectionId> expected_sections = {kDefaultSectionId}; + // Exact query on key with multiple hits should get the correct result. + EXPECT_THAT(Query(storage.get(), /*key_lower=*/1024, /*key_upper=*/1024), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/1, expected_sections), + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/20, /*key_upper=*/20), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/3, expected_sections), + EqualsDocHitInfo(/*document_id=*/2, expected_sections)))); +} + +TEST_F(IntegerIndexStorageTest, RangeQueryEmptyIntegerIndexStorage) { + std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, base_dir_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets)), + serializer_.get())); + + EXPECT_THAT( + Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), + IsOkAndHolds(IsEmpty())); +} + +TEST_F(IntegerIndexStorageTest, RangeQuerySingleEntireSortedBucket) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. + std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, base_dir_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets)), + serializer_.get())); + + // Add some keys into sorted buckets [(-1000,-100), (200,300)]. + EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId, + /*new_keys=*/{-500}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId, + /*new_keys=*/{208}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/2, kDefaultSectionId, + /*new_keys=*/{-200}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/3, kDefaultSectionId, + /*new_keys=*/{-1000}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/4, kDefaultSectionId, + /*new_keys=*/{300}), + IsOk()); + + std::vector<SectionId> expected_sections = {kDefaultSectionId}; + // Range query on each sorted bucket boundary should get the correct result. + EXPECT_THAT(Query(storage.get(), /*key_lower=*/-1000, /*key_upper=*/-100), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/3, expected_sections), + EqualsDocHitInfo(/*document_id=*/2, expected_sections), + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/0, /*key_upper=*/100), + IsOkAndHolds(IsEmpty())); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/150, /*key_upper=*/199), + IsOkAndHolds(IsEmpty())); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/200, /*key_upper=*/300), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/4, expected_sections), + EqualsDocHitInfo(/*document_id=*/1, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/301, /*key_upper=*/999), + IsOkAndHolds(IsEmpty())); +} + +TEST_F(IntegerIndexStorageTest, RangeQuerySingleEntireUnsortedBucket) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. + std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, base_dir_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets)), + serializer_.get())); + + // Add some keys into unsorted buckets [(1000,INT64_MAX), (INT64_MIN,-1001)]. + EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId, + /*new_keys=*/{1024}), + IsOk()); + EXPECT_THAT( + storage->AddKeys(/*document_id=*/1, kDefaultSectionId, + /*new_keys=*/{std::numeric_limits<int64_t>::max()}), + IsOk()); + EXPECT_THAT( + storage->AddKeys(/*document_id=*/2, kDefaultSectionId, + /*new_keys=*/{std::numeric_limits<int64_t>::min()}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/3, kDefaultSectionId, + /*new_keys=*/{-1500}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/4, kDefaultSectionId, + /*new_keys=*/{2000}), + IsOk()); + + std::vector<SectionId> expected_sections = {kDefaultSectionId}; + // Range query on each unsorted bucket boundary should get the correct result. + EXPECT_THAT(Query(storage.get(), /*key_lower=*/1000, + /*key_upper=*/std::numeric_limits<int64_t>::max()), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/4, expected_sections), + EqualsDocHitInfo(/*document_id=*/1, expected_sections), + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/-99, /*key_upper=*/-1), + IsOkAndHolds(IsEmpty())); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/101, /*key_upper=*/149), + IsOkAndHolds(IsEmpty())); + EXPECT_THAT( + Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/-1001), + IsOkAndHolds( + ElementsAre(EqualsDocHitInfo(/*document_id=*/3, expected_sections), + EqualsDocHitInfo(/*document_id=*/2, expected_sections)))); +} + +TEST_F(IntegerIndexStorageTest, RangeQuerySinglePartialSortedBucket) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. + std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, base_dir_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets)), + serializer_.get())); + + // Add some keys into sorted bucket (0,100). + EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId, + /*new_keys=*/{43}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId, + /*new_keys=*/{30}), + IsOk()); + + std::vector<SectionId> expected_sections = {kDefaultSectionId}; + // Range query on partial range of each sorted bucket should get the correct + // result. + EXPECT_THAT(Query(storage.get(), /*key_lower=*/25, /*key_upper=*/200), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/1, expected_sections), + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/-1000, /*key_upper=*/49), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/1, expected_sections), + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/25, /*key_upper=*/49), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/1, expected_sections), + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/31, /*key_upper=*/49), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/25, /*key_upper=*/31), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/1, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/3, /*key_upper=*/5), + IsOkAndHolds(IsEmpty())); +} + +TEST_F(IntegerIndexStorageTest, RangeQuerySinglePartialUnsortedBucket) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. + std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, base_dir_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets)), + serializer_.get())); + + // Add some keys into unsorted buckets (-99,-1). + EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId, + /*new_keys=*/{-19}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId, + /*new_keys=*/{-72}), + IsOk()); + + std::vector<SectionId> expected_sections = {kDefaultSectionId}; + // Range query on partial range of each unsorted bucket should get the correct + // result. + EXPECT_THAT(Query(storage.get(), /*key_lower=*/-1000, /*key_upper=*/-15), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/1, expected_sections), + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/-80, /*key_upper=*/149), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/1, expected_sections), + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/-80, /*key_upper=*/-15), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/1, expected_sections), + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/-38, /*key_upper=*/-15), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/-80, /*key_upper=*/-38), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/1, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/-95, /*key_upper=*/-92), + IsOkAndHolds(IsEmpty())); +} + +TEST_F(IntegerIndexStorageTest, RangeQueryMultipleBuckets) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. + std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, base_dir_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets)), + serializer_.get())); + + // Add some keys into buckets [(-1000,-100), (200,300), (1000,INT64_MAX), + // (INT64_MIN,-1001)] + EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId, + /*new_keys=*/{-500}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId, + /*new_keys=*/{1024}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/2, kDefaultSectionId, + /*new_keys=*/{-200}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/3, kDefaultSectionId, + /*new_keys=*/{208}), + IsOk()); + EXPECT_THAT( + storage->AddKeys(/*document_id=*/4, kDefaultSectionId, + /*new_keys=*/{std::numeric_limits<int64_t>::max()}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/5, kDefaultSectionId, + /*new_keys=*/{-1000}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/6, kDefaultSectionId, + /*new_keys=*/{300}), + IsOk()); + EXPECT_THAT( + storage->AddKeys(/*document_id=*/7, kDefaultSectionId, + /*new_keys=*/{std::numeric_limits<int64_t>::min()}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/8, kDefaultSectionId, + /*new_keys=*/{-1500}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/9, kDefaultSectionId, + /*new_keys=*/{2000}), + IsOk()); + + std::vector<SectionId> expected_sections = {kDefaultSectionId}; + // Range query should get the correct result. + EXPECT_THAT( + Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), + IsOkAndHolds( + ElementsAre(EqualsDocHitInfo(/*document_id=*/9, expected_sections), + EqualsDocHitInfo(/*document_id=*/8, expected_sections), + EqualsDocHitInfo(/*document_id=*/7, expected_sections), + EqualsDocHitInfo(/*document_id=*/6, expected_sections), + EqualsDocHitInfo(/*document_id=*/5, expected_sections), + EqualsDocHitInfo(/*document_id=*/4, expected_sections), + EqualsDocHitInfo(/*document_id=*/3, expected_sections), + EqualsDocHitInfo(/*document_id=*/2, expected_sections), + EqualsDocHitInfo(/*document_id=*/1, expected_sections), + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); + EXPECT_THAT(Query(storage.get(), /*key_lower=*/-199, + /*key_upper=*/std::numeric_limits<int64_t>::max()), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/9, expected_sections), + EqualsDocHitInfo(/*document_id=*/6, expected_sections), + EqualsDocHitInfo(/*document_id=*/4, expected_sections), + EqualsDocHitInfo(/*document_id=*/3, expected_sections), + EqualsDocHitInfo(/*document_id=*/1, expected_sections)))); + EXPECT_THAT( + Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/-200), + IsOkAndHolds( + ElementsAre(EqualsDocHitInfo(/*document_id=*/8, expected_sections), + EqualsDocHitInfo(/*document_id=*/7, expected_sections), + EqualsDocHitInfo(/*document_id=*/5, expected_sections), + EqualsDocHitInfo(/*document_id=*/2, expected_sections), + EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); +} + +TEST_F(IntegerIndexStorageTest, BatchAdd) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. + std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, base_dir_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets)), + serializer_.get())); + + // Sorted buckets: [(-1000,-100), (0,100), (150,199), (200,300), (301,999)] + // Unsorted buckets: [(1000,INT64_MAX), (-99,-1), (101,149), + // (INT64_MIN,-1001)] + // Batch add the following keys (including some edge cases) to test the + // correctness of the sort and binary search logic in AddKeys(). + // clang-format off + std::vector<int64_t> keys = {4000, 3000, 2000, 300, 201, 200, 106, 104, + 100, 3, 2, 1, 0, -97, -98, -99, + -100, -200, -1000, -1001, -1500, -2000, + std::numeric_limits<int64_t>::max(), + std::numeric_limits<int64_t>::min()}; + // clang-format on + EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId, + std::vector<int64_t>(keys)), + IsOk()); + + std::vector<SectionId> expected_sections = {kDefaultSectionId}; + for (int64_t key : keys) { + EXPECT_THAT(Query(storage.get(), /*key_lower=*/key, /*key_upper=*/key), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(kDefaultDocumentId, expected_sections)))); + } +} + +TEST_F(IntegerIndexStorageTest, MultipleKeysShouldMergeAndDedupeDocHitInfo) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. + std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, base_dir_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets)), + serializer_.get())); + + // Add some keys with same document id and section id. + EXPECT_THAT( + storage->AddKeys( + /*document_id=*/0, kDefaultSectionId, /*new_keys=*/ + {-500, 1024, -200, 208, std::numeric_limits<int64_t>::max(), -1000, + 300, std::numeric_limits<int64_t>::min(), -1500, 2000}), + IsOk()); + + std::vector<SectionId> expected_sections = {kDefaultSectionId}; + EXPECT_THAT( + Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), + IsOkAndHolds( + ElementsAre(EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); +} + +TEST_F(IntegerIndexStorageTest, + MultipleSectionsShouldMergeSectionsAndDedupeDocHitInfo) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. + std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, base_dir_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets)), + serializer_.get())); + + // Add some keys with same document id but different section ids. + EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/63, + /*new_keys=*/{-500}), + IsOk()); + EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/62, + /*new_keys=*/{1024}), + IsOk()); + EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/61, + /*new_keys=*/{-200}), + IsOk()); + EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/60, + /*new_keys=*/{208}), + IsOk()); + EXPECT_THAT( + storage->AddKeys(kDefaultDocumentId, /*section_id=*/59, + /*new_keys=*/{std::numeric_limits<int64_t>::max()}), + IsOk()); + EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/58, + /*new_keys=*/{-1000}), + IsOk()); + EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/57, + /*new_keys=*/{300}), + IsOk()); + EXPECT_THAT( + storage->AddKeys(kDefaultDocumentId, /*section_id=*/56, + /*new_keys=*/{std::numeric_limits<int64_t>::min()}), + IsOk()); + EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/55, + /*new_keys=*/{-1500}), + IsOk()); + EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/54, + /*new_keys=*/{2000}), + IsOk()); + + std::vector<SectionId> expected_sections = {63, 62, 61, 60, 59, + 58, 57, 56, 55, 54}; + EXPECT_THAT( + Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(kDefaultDocumentId, expected_sections)))); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/index/numeric/posting-list-integer-index-data-accessor.cc b/icing/index/numeric/posting-list-integer-index-accessor.cc index 73b48e2..271f8dc 100644 --- a/icing/index/numeric/posting-list-integer-index-data-accessor.cc +++ b/icing/index/numeric/posting-list-integer-index-accessor.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/index/numeric/posting-list-integer-index-data-accessor.h" +#include "icing/index/numeric/posting-list-integer-index-accessor.h" #include <cstdint> #include <memory> @@ -26,17 +26,16 @@ #include "icing/file/posting_list/posting-list-identifier.h" #include "icing/file/posting_list/posting-list-used.h" #include "icing/index/numeric/integer-index-data.h" -#include "icing/index/numeric/posting-list-used-integer-index-data-serializer.h" +#include "icing/index/numeric/posting-list-integer-index-serializer.h" #include "icing/util/status-macros.h" namespace icing { namespace lib { /* static */ libtextclassifier3::StatusOr< - std::unique_ptr<PostingListIntegerIndexDataAccessor>> -PostingListIntegerIndexDataAccessor::Create( - FlashIndexStorage* storage, - PostingListUsedIntegerIndexDataSerializer* serializer) { + std::unique_ptr<PostingListIntegerIndexAccessor>> +PostingListIntegerIndexAccessor::Create( + FlashIndexStorage* storage, PostingListIntegerIndexSerializer* serializer) { uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( storage->block_size(), serializer->GetDataTypeBytes()); std::unique_ptr<uint8_t[]> posting_list_buffer_array = @@ -45,20 +44,19 @@ PostingListIntegerIndexDataAccessor::Create( PostingListUsed posting_list_buffer, PostingListUsed::CreateFromUnitializedRegion( serializer, posting_list_buffer_array.get(), max_posting_list_bytes)); - return std::unique_ptr<PostingListIntegerIndexDataAccessor>( - new PostingListIntegerIndexDataAccessor( + return std::unique_ptr<PostingListIntegerIndexAccessor>( + new PostingListIntegerIndexAccessor( storage, std::move(posting_list_buffer_array), std::move(posting_list_buffer), serializer)); } /* static */ libtextclassifier3::StatusOr< - std::unique_ptr<PostingListIntegerIndexDataAccessor>> -PostingListIntegerIndexDataAccessor::CreateFromExisting( - FlashIndexStorage* storage, - PostingListUsedIntegerIndexDataSerializer* serializer, + std::unique_ptr<PostingListIntegerIndexAccessor>> +PostingListIntegerIndexAccessor::CreateFromExisting( + FlashIndexStorage* storage, PostingListIntegerIndexSerializer* serializer, PostingListIdentifier existing_posting_list_id) { ICING_ASSIGN_OR_RETURN( - std::unique_ptr<PostingListIntegerIndexDataAccessor> pl_accessor, + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor, Create(storage, serializer)); ICING_ASSIGN_OR_RETURN(PostingListHolder holder, storage->GetPostingList(existing_posting_list_id)); @@ -69,13 +67,13 @@ PostingListIntegerIndexDataAccessor::CreateFromExisting( // Returns the next batch of integer index data for the provided posting list. libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> -PostingListIntegerIndexDataAccessor::GetNextDataBatch() { +PostingListIntegerIndexAccessor::GetNextDataBatch() { if (preexisting_posting_list_ == nullptr) { if (has_reached_posting_list_chain_end_) { return std::vector<IntegerIndexData>(); } return absl_ports::FailedPreconditionError( - "Cannot retrieve data from a PostingListIntegerIndexDataAccessor that " + "Cannot retrieve data from a PostingListIntegerIndexAccessor that " "was not created from a preexisting posting list."); } ICING_ASSIGN_OR_RETURN( @@ -106,7 +104,7 @@ PostingListIntegerIndexDataAccessor::GetNextDataBatch() { return batch; } -libtextclassifier3::Status PostingListIntegerIndexDataAccessor::PrependData( +libtextclassifier3::Status PostingListIntegerIndexAccessor::PrependData( const IntegerIndexData& data) { PostingListUsed& active_pl = (preexisting_posting_list_ != nullptr) ? preexisting_posting_list_->posting_list diff --git a/icing/index/numeric/posting-list-integer-index-data-accessor.h b/icing/index/numeric/posting-list-integer-index-accessor.h index 7835bf9..64a8901 100644 --- a/icing/index/numeric/posting-list-integer-index-data-accessor.h +++ b/icing/index/numeric/posting-list-integer-index-accessor.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_DATA_ACCESSOR_H_ -#define ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_DATA_ACCESSOR_H_ +#ifndef ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_ACCESSOR_H_ +#define ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_ACCESSOR_H_ #include <cstdint> #include <memory> @@ -26,7 +26,7 @@ #include "icing/file/posting_list/posting-list-identifier.h" #include "icing/file/posting_list/posting-list-used.h" #include "icing/index/numeric/integer-index-data.h" -#include "icing/index/numeric/posting-list-used-integer-index-data-serializer.h" +#include "icing/index/numeric/posting-list-integer-index-serializer.h" namespace icing { namespace lib { @@ -34,35 +34,35 @@ namespace lib { // TODO(b/259743562): Refactor PostingListAccessor derived classes // This class is used to provide a simple abstraction for adding integer index -// data to posting lists. PostingListIntegerIndexDataAccessor handles: +// data to posting lists. PostingListIntegerIndexAccessor handles: // 1) selection of properly-sized posting lists for the accumulated integer // index data during Finalize() // 2) chaining of max-sized posting lists. -class PostingListIntegerIndexDataAccessor : public PostingListAccessor { +class PostingListIntegerIndexAccessor : public PostingListAccessor { public: - // Creates an empty PostingListIntegerIndexDataAccessor. + // Creates an empty PostingListIntegerIndexAccessor. // // RETURNS: - // - On success, a valid instance of PostingListIntegerIndexDataAccessor + // - On success, a valid instance of PostingListIntegerIndexAccessor // - INVALID_ARGUMENT error if storage has an invalid block_size. static libtextclassifier3::StatusOr< - std::unique_ptr<PostingListIntegerIndexDataAccessor>> + std::unique_ptr<PostingListIntegerIndexAccessor>> Create(FlashIndexStorage* storage, - PostingListUsedIntegerIndexDataSerializer* serializer); + PostingListIntegerIndexSerializer* serializer); - // Create a PostingListIntegerIndexDataAccessor with an existing posting list + // Create a PostingListIntegerIndexAccessor with an existing posting list // identified by existing_posting_list_id. // // RETURNS: - // - On success, a valid instance of PostingListIntegerIndexDataAccessor + // - On success, a valid instance of PostingListIntegerIndexAccessor // - INVALID_ARGUMENT if storage has an invalid block_size. static libtextclassifier3::StatusOr< - std::unique_ptr<PostingListIntegerIndexDataAccessor>> + std::unique_ptr<PostingListIntegerIndexAccessor>> CreateFromExisting(FlashIndexStorage* storage, - PostingListUsedIntegerIndexDataSerializer* serializer, + PostingListIntegerIndexSerializer* serializer, PostingListIdentifier existing_posting_list_id); - PostingListUsedSerializer* GetSerializer() override { return serializer_; } + PostingListSerializer* GetSerializer() override { return serializer_; } // Retrieve the next batch of data in the posting list chain // @@ -75,7 +75,7 @@ class PostingListIntegerIndexDataAccessor : public PostingListAccessor { GetNextDataBatch(); // Prepend one data. This may result in flushing the posting list to disk (if - // the PostingListIntegerIndexDataAccessor holds a max-sized posting list that + // the PostingListIntegerIndexAccessor holds a max-sized posting list that // is full) or freeing a pre-existing posting list if it is too small to fit // all data necessary. // @@ -90,19 +90,19 @@ class PostingListIntegerIndexDataAccessor : public PostingListAccessor { // TODO(b/259743562): [Optimization 1] add GetAndClear, IsFull for split private: - explicit PostingListIntegerIndexDataAccessor( + explicit PostingListIntegerIndexAccessor( FlashIndexStorage* storage, std::unique_ptr<uint8_t[]> posting_list_buffer_array, PostingListUsed posting_list_buffer, - PostingListUsedIntegerIndexDataSerializer* serializer) + PostingListIntegerIndexSerializer* serializer) : PostingListAccessor(storage, std::move(posting_list_buffer_array), std::move(posting_list_buffer)), serializer_(serializer) {} - PostingListUsedIntegerIndexDataSerializer* serializer_; // Does not own. + PostingListIntegerIndexSerializer* serializer_; // Does not own. }; } // namespace lib } // namespace icing -#endif // ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_DATA_ACCESSOR_H_ +#endif // ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_ACCESSOR_H_ diff --git a/icing/index/numeric/posting-list-integer-index-data-accessor_test.cc b/icing/index/numeric/posting-list-integer-index-accessor_test.cc index ca0804e..f5fd693 100644 --- a/icing/index/numeric/posting-list-integer-index-data-accessor_test.cc +++ b/icing/index/numeric/posting-list-integer-index-accessor_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/index/numeric/posting-list-integer-index-data-accessor.h" +#include "icing/index/numeric/posting-list-integer-index-accessor.h" #include <cstdint> #include <memory> @@ -27,7 +27,7 @@ #include "icing/file/posting_list/flash-index-storage.h" #include "icing/file/posting_list/posting-list-identifier.h" #include "icing/index/numeric/integer-index-data.h" -#include "icing/index/numeric/posting-list-used-integer-index-data-serializer.h" +#include "icing/index/numeric/posting-list-integer-index-serializer.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" @@ -44,7 +44,7 @@ using ::testing::Eq; using ::testing::Lt; using ::testing::SizeIs; -class PostingListIntegerIndexDataAccessorTest : public ::testing::Test { +class PostingListIntegerIndexAccessorTest : public ::testing::Test { protected: void SetUp() override { test_dir_ = GetTestTempDir() + "/test_dir"; @@ -53,7 +53,7 @@ class PostingListIntegerIndexDataAccessorTest : public ::testing::Test { ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str())); ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str())); - serializer_ = std::make_unique<PostingListUsedIntegerIndexDataSerializer>(); + serializer_ = std::make_unique<PostingListIntegerIndexSerializer>(); ICING_ASSERT_OK_AND_ASSIGN( FlashIndexStorage flash_index_storage, @@ -71,7 +71,7 @@ class PostingListIntegerIndexDataAccessorTest : public ::testing::Test { Filesystem filesystem_; std::string test_dir_; std::string file_name_; - std::unique_ptr<PostingListUsedIntegerIndexDataSerializer> serializer_; + std::unique_ptr<PostingListIntegerIndexSerializer> serializer_; std::unique_ptr<FlashIndexStorage> flash_index_storage_; }; @@ -96,11 +96,11 @@ std::vector<IntegerIndexData> CreateData(int num_data, return data; } -TEST_F(PostingListIntegerIndexDataAccessorTest, DataAddAndRetrieveProperly) { +TEST_F(PostingListIntegerIndexAccessorTest, DataAddAndRetrieveProperly) { ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<PostingListIntegerIndexDataAccessor> pl_accessor, - PostingListIntegerIndexDataAccessor::Create(flash_index_storage_.get(), - serializer_.get())); + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); // Add some integer index data std::vector<IntegerIndexData> data_vec = CreateData(/*num_data=*/5, /*start_document_id=*/0, /*start_key=*/819); @@ -122,11 +122,11 @@ TEST_F(PostingListIntegerIndexDataAccessorTest, DataAddAndRetrieveProperly) { EXPECT_THAT(pl_holder.block.next_block_index(), Eq(kInvalidBlockIndex)); } -TEST_F(PostingListIntegerIndexDataAccessorTest, PreexistingPLKeepOnSameBlock) { +TEST_F(PostingListIntegerIndexAccessorTest, PreexistingPLKeepOnSameBlock) { ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<PostingListIntegerIndexDataAccessor> pl_accessor, - PostingListIntegerIndexDataAccessor::Create(flash_index_storage_.get(), - serializer_.get())); + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); // Add a single data. This will fit in a min-sized posting list. IntegerIndexData data1(/*section_id=*/1, /*document_id=*/0, /*key=*/12345); ICING_ASSERT_OK(pl_accessor->PrependData(data1)); @@ -141,7 +141,7 @@ TEST_F(PostingListIntegerIndexDataAccessorTest, PreexistingPLKeepOnSameBlock) { // two data, so this should NOT cause the previous pl to be reallocated. ICING_ASSERT_OK_AND_ASSIGN( pl_accessor, - PostingListIntegerIndexDataAccessor::CreateFromExisting( + PostingListIntegerIndexAccessor::CreateFromExisting( flash_index_storage_.get(), serializer_.get(), result1.id)); IntegerIndexData data2(/*section_id=*/1, /*document_id=*/1, /*key=*/23456); ICING_ASSERT_OK(pl_accessor->PrependData(data2)); @@ -159,12 +159,11 @@ TEST_F(PostingListIntegerIndexDataAccessorTest, PreexistingPLKeepOnSameBlock) { IsOkAndHolds(ElementsAre(data2, data1))); } -TEST_F(PostingListIntegerIndexDataAccessorTest, - PreexistingPLReallocateToLargerPL) { +TEST_F(PostingListIntegerIndexAccessorTest, PreexistingPLReallocateToLargerPL) { ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<PostingListIntegerIndexDataAccessor> pl_accessor, - PostingListIntegerIndexDataAccessor::Create(flash_index_storage_.get(), - serializer_.get())); + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); // Adding 3 data should cause Finalize allocating a 48-byte posting list, // which can store at most 4 data. std::vector<IntegerIndexData> data_vec1 = @@ -182,7 +181,7 @@ TEST_F(PostingListIntegerIndexDataAccessorTest, // Now add more data. ICING_ASSERT_OK_AND_ASSIGN( pl_accessor, - PostingListIntegerIndexDataAccessor::CreateFromExisting( + PostingListIntegerIndexAccessor::CreateFromExisting( flash_index_storage_.get(), serializer_.get(), result1.id)); // The current posting list can fit 1 more data. Adding 12 more data should // result in these data being moved to a larger posting list. Also the total @@ -217,12 +216,11 @@ TEST_F(PostingListIntegerIndexDataAccessorTest, all_data_vec.rend()))); } -TEST_F(PostingListIntegerIndexDataAccessorTest, - MultiBlockChainsBlocksProperly) { +TEST_F(PostingListIntegerIndexAccessorTest, MultiBlockChainsBlocksProperly) { ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<PostingListIntegerIndexDataAccessor> pl_accessor, - PostingListIntegerIndexDataAccessor::Create(flash_index_storage_.get(), - serializer_.get())); + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); // Block size is 4096, sizeof(BlockHeader) is 12 and sizeof(IntegerIndexData) // is 12, so the max size posting list can store (4096 - 12) / 12 = 340 data. // Adding 341 data should cause: @@ -268,12 +266,12 @@ TEST_F(PostingListIntegerIndexDataAccessorTest, IsOkAndHolds(ElementsAreArray(first_block_data_start, data_vec.rend()))); } -TEST_F(PostingListIntegerIndexDataAccessorTest, +TEST_F(PostingListIntegerIndexAccessorTest, PreexistingMultiBlockReusesBlocksProperly) { ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<PostingListIntegerIndexDataAccessor> pl_accessor, - PostingListIntegerIndexDataAccessor::Create(flash_index_storage_.get(), - serializer_.get())); + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); // Block size is 4096, sizeof(BlockHeader) is 12 and sizeof(IntegerIndexData) // is 12, so the max size posting list can store (4096 - 12) / 12 = 340 data. // Adding 341 data will cause: @@ -296,7 +294,7 @@ TEST_F(PostingListIntegerIndexDataAccessorTest, // fill it up. ICING_ASSERT_OK_AND_ASSIGN( pl_accessor, - PostingListIntegerIndexDataAccessor::CreateFromExisting( + PostingListIntegerIndexAccessor::CreateFromExisting( flash_index_storage_.get(), serializer_.get(), first_add_id)); std::vector<IntegerIndexData> data_vec2 = CreateData( /*num_data=*/10, @@ -342,23 +340,23 @@ TEST_F(PostingListIntegerIndexDataAccessorTest, all_data_vec.rend()))); } -TEST_F(PostingListIntegerIndexDataAccessorTest, +TEST_F(PostingListIntegerIndexAccessorTest, InvalidDataShouldReturnInvalidArgument) { ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<PostingListIntegerIndexDataAccessor> pl_accessor, - PostingListIntegerIndexDataAccessor::Create(flash_index_storage_.get(), - serializer_.get())); + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); IntegerIndexData invalid_data; EXPECT_THAT(pl_accessor->PrependData(invalid_data), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(PostingListIntegerIndexDataAccessorTest, +TEST_F(PostingListIntegerIndexAccessorTest, BasicHitIncreasingShouldReturnInvalidArgument) { ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<PostingListIntegerIndexDataAccessor> pl_accessor, - PostingListIntegerIndexDataAccessor::Create(flash_index_storage_.get(), - serializer_.get())); + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); IntegerIndexData data1(/*section_id=*/3, /*document_id=*/1, /*key=*/12345); ICING_ASSERT_OK(pl_accessor->PrependData(data1)); @@ -371,24 +369,24 @@ TEST_F(PostingListIntegerIndexDataAccessorTest, StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(PostingListIntegerIndexDataAccessorTest, +TEST_F(PostingListIntegerIndexAccessorTest, NewPostingListNoDataAddedShouldReturnInvalidArgument) { ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<PostingListIntegerIndexDataAccessor> pl_accessor, - PostingListIntegerIndexDataAccessor::Create(flash_index_storage_.get(), - serializer_.get())); + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); PostingListAccessor::FinalizeResult result = std::move(*pl_accessor).Finalize(); EXPECT_THAT(result.status, StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(PostingListIntegerIndexDataAccessorTest, +TEST_F(PostingListIntegerIndexAccessorTest, PreexistingPostingListNoDataAddedShouldSucceed) { ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<PostingListIntegerIndexDataAccessor> pl_accessor1, - PostingListIntegerIndexDataAccessor::Create(flash_index_storage_.get(), - serializer_.get())); + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor1, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); IntegerIndexData data1(/*section_id=*/3, /*document_id=*/1, /*key=*/12345); ICING_ASSERT_OK(pl_accessor1->PrependData(data1)); PostingListAccessor::FinalizeResult result1 = @@ -396,8 +394,8 @@ TEST_F(PostingListIntegerIndexDataAccessorTest, ICING_ASSERT_OK(result1.status); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<PostingListIntegerIndexDataAccessor> pl_accessor2, - PostingListIntegerIndexDataAccessor::CreateFromExisting( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor2, + PostingListIntegerIndexAccessor::CreateFromExisting( flash_index_storage_.get(), serializer_.get(), result1.id)); PostingListAccessor::FinalizeResult result2 = std::move(*pl_accessor2).Finalize(); diff --git a/icing/index/numeric/posting-list-used-integer-index-data-serializer.cc b/icing/index/numeric/posting-list-integer-index-serializer.cc index 800fd6b..6556451 100644 --- a/icing/index/numeric/posting-list-used-integer-index-data-serializer.cc +++ b/icing/index/numeric/posting-list-integer-index-serializer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/index/numeric/posting-list-used-integer-index-data-serializer.h" +#include "icing/index/numeric/posting-list-integer-index-serializer.h" #include <cstdint> #include <vector> @@ -29,7 +29,7 @@ namespace icing { namespace lib { -uint32_t PostingListUsedIntegerIndexDataSerializer::GetBytesUsed( +uint32_t PostingListIntegerIndexSerializer::GetBytesUsed( const PostingListUsed* posting_list_used) const { // The special data will be included if they represent actual data. If they // represent the data start offset or the invalid data sentinel, they are not @@ -38,7 +38,7 @@ uint32_t PostingListUsedIntegerIndexDataSerializer::GetBytesUsed( GetStartByteOffset(posting_list_used); } -uint32_t PostingListUsedIntegerIndexDataSerializer::GetMinPostingListSizeToFit( +uint32_t PostingListIntegerIndexSerializer::GetMinPostingListSizeToFit( const PostingListUsed* posting_list_used) const { if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) { // If in either the FULL state or ALMOST_FULL state, this posting list *is* @@ -57,7 +57,7 @@ uint32_t PostingListUsedIntegerIndexDataSerializer::GetMinPostingListSizeToFit( return GetBytesUsed(posting_list_used) + GetDataTypeBytes(); } -void PostingListUsedIntegerIndexDataSerializer::Clear( +void PostingListIntegerIndexSerializer::Clear( PostingListUsed* posting_list_used) const { // Safe to ignore return value because posting_list_used->size_in_bytes() is // a valid argument. @@ -65,7 +65,7 @@ void PostingListUsedIntegerIndexDataSerializer::Clear( /*offset=*/posting_list_used->size_in_bytes()); } -libtextclassifier3::Status PostingListUsedIntegerIndexDataSerializer::MoveFrom( +libtextclassifier3::Status PostingListIntegerIndexSerializer::MoveFrom( PostingListUsed* dst, PostingListUsed* src) const { ICING_RETURN_ERROR_IF_NULL(dst); ICING_RETURN_ERROR_IF_NULL(src); @@ -121,7 +121,7 @@ libtextclassifier3::Status PostingListUsedIntegerIndexDataSerializer::MoveFrom( } libtextclassifier3::Status -PostingListUsedIntegerIndexDataSerializer::PrependDataToAlmostFull( +PostingListIntegerIndexSerializer::PrependDataToAlmostFull( PostingListUsed* posting_list_used, const IntegerIndexData& data) const { SpecialDataType special_data = GetSpecialData(posting_list_used, /*index=*/1); if (special_data.data().basic_hit() < data.basic_hit()) { @@ -139,7 +139,7 @@ PostingListUsedIntegerIndexDataSerializer::PrependDataToAlmostFull( return libtextclassifier3::Status::OK; } -void PostingListUsedIntegerIndexDataSerializer::PrependDataToEmpty( +void PostingListIntegerIndexSerializer::PrependDataToEmpty( PostingListUsed* posting_list_used, const IntegerIndexData& data) const { // First data to be added. Just add verbatim, no compression. if (posting_list_used->size_in_bytes() == kSpecialDataSize) { @@ -165,7 +165,7 @@ void PostingListUsedIntegerIndexDataSerializer::PrependDataToEmpty( } libtextclassifier3::Status -PostingListUsedIntegerIndexDataSerializer::PrependDataToNotFull( +PostingListIntegerIndexSerializer::PrependDataToNotFull( PostingListUsed* posting_list_used, const IntegerIndexData& data, uint32_t offset) const { IntegerIndexData cur; @@ -193,8 +193,7 @@ PostingListUsedIntegerIndexDataSerializer::PrependDataToNotFull( return libtextclassifier3::Status::OK; } -libtextclassifier3::Status -PostingListUsedIntegerIndexDataSerializer::PrependData( +libtextclassifier3::Status PostingListIntegerIndexSerializer::PrependData( PostingListUsed* posting_list_used, const IntegerIndexData& data) const { static_assert( sizeof(BasicHit::Value) <= sizeof(uint64_t), @@ -223,7 +222,7 @@ PostingListUsedIntegerIndexDataSerializer::PrependData( } } -uint32_t PostingListUsedIntegerIndexDataSerializer::PrependDataArray( +uint32_t PostingListIntegerIndexSerializer::PrependDataArray( PostingListUsed* posting_list_used, const IntegerIndexData* array, uint32_t num_data, bool keep_prepended) const { if (!IsPostingListValid(posting_list_used)) { @@ -248,14 +247,14 @@ uint32_t PostingListUsedIntegerIndexDataSerializer::PrependDataArray( } libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> -PostingListUsedIntegerIndexDataSerializer::GetData( +PostingListIntegerIndexSerializer::GetData( const PostingListUsed* posting_list_used) const { std::vector<IntegerIndexData> data_arr_out; ICING_RETURN_IF_ERROR(GetData(posting_list_used, &data_arr_out)); return data_arr_out; } -libtextclassifier3::Status PostingListUsedIntegerIndexDataSerializer::GetData( +libtextclassifier3::Status PostingListIntegerIndexSerializer::GetData( const PostingListUsed* posting_list_used, std::vector<IntegerIndexData>* data_arr_out) const { return GetDataInternal(posting_list_used, @@ -263,8 +262,7 @@ libtextclassifier3::Status PostingListUsedIntegerIndexDataSerializer::GetData( /*pop=*/false, data_arr_out); } -libtextclassifier3::Status -PostingListUsedIntegerIndexDataSerializer::PopFrontData( +libtextclassifier3::Status PostingListIntegerIndexSerializer::PopFrontData( PostingListUsed* posting_list_used, uint32_t num_data) const { if (num_data == 1 && IsFull(posting_list_used)) { // The PL is in FULL state which means that we save 2 uncompressed data in @@ -345,8 +343,7 @@ PostingListUsedIntegerIndexDataSerializer::PopFrontData( return libtextclassifier3::Status::OK; } -libtextclassifier3::Status -PostingListUsedIntegerIndexDataSerializer::GetDataInternal( +libtextclassifier3::Status PostingListIntegerIndexSerializer::GetDataInternal( const PostingListUsed* posting_list_used, uint32_t limit, bool pop, std::vector<IntegerIndexData>* out) const { // TODO(b/259743562): [Optimization 2] handle compressed data @@ -404,8 +401,8 @@ PostingListUsedIntegerIndexDataSerializer::GetDataInternal( return libtextclassifier3::Status::OK; } -PostingListUsedIntegerIndexDataSerializer::SpecialDataType -PostingListUsedIntegerIndexDataSerializer::GetSpecialData( +PostingListIntegerIndexSerializer::SpecialDataType +PostingListIntegerIndexSerializer::GetSpecialData( const PostingListUsed* posting_list_used, uint32_t index) const { // It is ok to temporarily construct a SpecialData with offset = 0 since we're // going to overwrite it by memcpy. @@ -417,7 +414,7 @@ PostingListUsedIntegerIndexDataSerializer::GetSpecialData( return special_data; } -void PostingListUsedIntegerIndexDataSerializer::SetSpecialData( +void PostingListIntegerIndexSerializer::SetSpecialData( PostingListUsed* posting_list_used, uint32_t index, const SpecialDataType& special_data) const { memcpy(posting_list_used->posting_list_buffer() + @@ -425,7 +422,7 @@ void PostingListUsedIntegerIndexDataSerializer::SetSpecialData( &special_data, sizeof(SpecialDataType)); } -bool PostingListUsedIntegerIndexDataSerializer::IsPostingListValid( +bool PostingListIntegerIndexSerializer::IsPostingListValid( const PostingListUsed* posting_list_used) const { if (IsAlmostFull(posting_list_used)) { // Special data 1 should hold a valid data. @@ -449,7 +446,7 @@ bool PostingListUsedIntegerIndexDataSerializer::IsPostingListValid( return true; } -uint32_t PostingListUsedIntegerIndexDataSerializer::GetStartByteOffset( +uint32_t PostingListIntegerIndexSerializer::GetStartByteOffset( const PostingListUsed* posting_list_used) const { if (IsFull(posting_list_used)) { return 0; @@ -460,7 +457,7 @@ uint32_t PostingListUsedIntegerIndexDataSerializer::GetStartByteOffset( } } -bool PostingListUsedIntegerIndexDataSerializer::SetStartByteOffset( +bool PostingListIntegerIndexSerializer::SetStartByteOffset( PostingListUsed* posting_list_used, uint32_t offset) const { if (offset > posting_list_used->size_in_bytes()) { ICING_LOG(ERROR) << "offset cannot be a value greater than size " @@ -497,7 +494,7 @@ bool PostingListUsedIntegerIndexDataSerializer::SetStartByteOffset( } libtextclassifier3::StatusOr<uint32_t> -PostingListUsedIntegerIndexDataSerializer::PrependDataUncompressed( +PostingListIntegerIndexSerializer::PrependDataUncompressed( PostingListUsed* posting_list_used, const IntegerIndexData& data, uint32_t offset) const { if (offset < kSpecialDataSize + sizeof(IntegerIndexData)) { diff --git a/icing/index/numeric/posting-list-used-integer-index-data-serializer.h b/icing/index/numeric/posting-list-integer-index-serializer.h index 49007e3..9cfdb7a 100644 --- a/icing/index/numeric/posting-list-used-integer-index-data-serializer.h +++ b/icing/index/numeric/posting-list-integer-index-serializer.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_INDEX_NUMERIC_POSTING_LIST_USED_INTEGER_INDEX_DATA_SERIALIZER_H_ -#define ICING_INDEX_NUMERIC_POSTING_LIST_USED_INTEGER_INDEX_DATA_SERIALIZER_H_ +#ifndef ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_SERIALIZER_H_ +#define ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_SERIALIZER_H_ #include <cstdint> #include <vector> @@ -28,8 +28,7 @@ namespace icing { namespace lib { // A serializer class to serialize IntegerIndexData to PostingListUsed. -class PostingListUsedIntegerIndexDataSerializer - : public PostingListUsedSerializer { +class PostingListIntegerIndexSerializer : public PostingListSerializer { public: using SpecialDataType = SpecialData<IntegerIndexData>; static_assert(sizeof(SpecialDataType) == sizeof(IntegerIndexData), ""); @@ -335,4 +334,4 @@ class PostingListUsedIntegerIndexDataSerializer } // namespace lib } // namespace icing -#endif // ICING_INDEX_NUMERIC_POSTING_LIST_USED_INTEGER_INDEX_DATA_SERIALIZER_H_ +#endif // ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_SERIALIZER_H_ diff --git a/icing/index/numeric/posting-list-used-integer-index-data-serializer_test.cc b/icing/index/numeric/posting-list-integer-index-serializer_test.cc index c270137..d3d54ef 100644 --- a/icing/index/numeric/posting-list-used-integer-index-data-serializer_test.cc +++ b/icing/index/numeric/posting-list-integer-index-serializer_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/index/numeric/posting-list-used-integer-index-data-serializer.h" +#include "icing/index/numeric/posting-list-integer-index-serializer.h" #include <memory> #include <vector> @@ -39,9 +39,8 @@ namespace { // without ALMOST_FULL) test cases, including for // PopFrontData. -TEST(PostingListUsedIntegerIndexDataSerializerTest, - GetMinPostingListSizeToFitNotNull) { - PostingListUsedIntegerIndexDataSerializer serializer; +TEST(PostingListIntegerIndexSerializerTest, GetMinPostingListSizeToFitNotNull) { + PostingListIntegerIndexSerializer serializer; int size = 2551 * sizeof(IntegerIndexData); std::unique_ptr<char[]> buf = std::make_unique<char[]>(size); @@ -65,9 +64,9 @@ TEST(PostingListUsedIntegerIndexDataSerializerTest, Eq(3 * sizeof(IntegerIndexData))); } -TEST(PostingListUsedIntegerIndexDataSerializerTest, +TEST(PostingListIntegerIndexSerializerTest, GetMinPostingListSizeToFitAlmostFull) { - PostingListUsedIntegerIndexDataSerializer serializer; + PostingListIntegerIndexSerializer serializer; int size = 3 * sizeof(IntegerIndexData); std::unique_ptr<char[]> buf = std::make_unique<char[]>(size); @@ -87,9 +86,8 @@ TEST(PostingListUsedIntegerIndexDataSerializerTest, EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used), Eq(size)); } -TEST(PostingListUsedIntegerIndexDataSerializerTest, - GetMinPostingListSizeToFitFull) { - PostingListUsedIntegerIndexDataSerializer serializer; +TEST(PostingListIntegerIndexSerializerTest, GetMinPostingListSizeToFitFull) { + PostingListIntegerIndexSerializer serializer; int size = 3 * sizeof(IntegerIndexData); std::unique_ptr<char[]> buf = std::make_unique<char[]>(size); @@ -113,8 +111,8 @@ TEST(PostingListUsedIntegerIndexDataSerializerTest, EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used), Eq(size)); } -TEST(PostingListUsedIntegerIndexDataSerializerTest, PrependDataNotFull) { - PostingListUsedIntegerIndexDataSerializer serializer; +TEST(PostingListIntegerIndexSerializerTest, PrependDataNotFull) { + PostingListIntegerIndexSerializer serializer; int size = 2551 * sizeof(IntegerIndexData); std::unique_ptr<char[]> buf = std::make_unique<char[]>(size); @@ -151,8 +149,8 @@ TEST(PostingListUsedIntegerIndexDataSerializerTest, PrependDataNotFull) { IsOkAndHolds(ElementsAre(data2, data1, data0))); } -TEST(PostingListUsedIntegerIndexDataSerializerTest, PrependDataAlmostFull) { - PostingListUsedIntegerIndexDataSerializer serializer; +TEST(PostingListIntegerIndexSerializerTest, PrependDataAlmostFull) { + PostingListIntegerIndexSerializer serializer; int size = 4 * sizeof(IntegerIndexData); std::unique_ptr<char[]> buf = std::make_unique<char[]>(size); @@ -195,9 +193,8 @@ TEST(PostingListUsedIntegerIndexDataSerializerTest, PrependDataAlmostFull) { StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); } -TEST(PostingListUsedIntegerIndexDataSerializerTest, - PrependDataPostingListUsedMinSize) { - PostingListUsedIntegerIndexDataSerializer serializer; +TEST(PostingListIntegerIndexSerializerTest, PrependDataPostingListUsedMinSize) { + PostingListIntegerIndexSerializer serializer; int size = serializer.GetMinPostingListSize(); std::unique_ptr<char[]> buf = std::make_unique<char[]>(size); @@ -233,9 +230,9 @@ TEST(PostingListUsedIntegerIndexDataSerializerTest, StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); } -TEST(PostingListUsedIntegerIndexDataSerializerTest, +TEST(PostingListIntegerIndexSerializerTest, PrependDataArrayDoNotKeepPrepended) { - PostingListUsedIntegerIndexDataSerializer serializer; + PostingListIntegerIndexSerializer serializer; int size = 6 * sizeof(IntegerIndexData); std::unique_ptr<char[]> buf = std::make_unique<char[]>(size); @@ -314,9 +311,8 @@ TEST(PostingListUsedIntegerIndexDataSerializerTest, IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend()))); } -TEST(PostingListUsedIntegerIndexDataSerializerTest, - PrependDataArrayKeepPrepended) { - PostingListUsedIntegerIndexDataSerializer serializer; +TEST(PostingListIntegerIndexSerializerTest, PrependDataArrayKeepPrepended) { + PostingListIntegerIndexSerializer serializer; int size = 6 * sizeof(IntegerIndexData); std::unique_ptr<char[]> buf = std::make_unique<char[]>(size); @@ -371,8 +367,8 @@ TEST(PostingListUsedIntegerIndexDataSerializerTest, IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend()))); } -TEST(PostingListUsedIntegerIndexDataSerializerTest, MoveFrom) { - PostingListUsedIntegerIndexDataSerializer serializer; +TEST(PostingListIntegerIndexSerializerTest, MoveFrom) { + PostingListIntegerIndexSerializer serializer; int size = 3 * serializer.GetMinPostingListSize(); std::unique_ptr<char[]> buf1 = std::make_unique<char[]>(size); @@ -412,9 +408,9 @@ TEST(PostingListUsedIntegerIndexDataSerializerTest, MoveFrom) { EXPECT_THAT(serializer.GetData(&pl_used1), IsOkAndHolds(IsEmpty())); } -TEST(PostingListUsedIntegerIndexDataSerializerTest, +TEST(PostingListIntegerIndexSerializerTest, MoveToNullReturnsFailedPrecondition) { - PostingListUsedIntegerIndexDataSerializer serializer; + PostingListIntegerIndexSerializer serializer; int size = 3 * serializer.GetMinPostingListSize(); std::unique_ptr<char[]> buf = std::make_unique<char[]>(size); @@ -443,8 +439,8 @@ TEST(PostingListUsedIntegerIndexDataSerializerTest, IsOkAndHolds(ElementsAreArray(data_arr.rbegin(), data_arr.rend()))); } -TEST(PostingListUsedIntegerIndexDataSerializerTest, MoveToPostingListTooSmall) { - PostingListUsedIntegerIndexDataSerializer serializer; +TEST(PostingListIntegerIndexSerializerTest, MoveToPostingListTooSmall) { + PostingListIntegerIndexSerializer serializer; int size1 = 3 * serializer.GetMinPostingListSize(); std::unique_ptr<char[]> buf1 = std::make_unique<char[]>(size1); @@ -486,8 +482,8 @@ TEST(PostingListUsedIntegerIndexDataSerializerTest, MoveToPostingListTooSmall) { IsOkAndHolds(ElementsAreArray(data_arr2.rbegin(), data_arr2.rend()))); } -TEST(PostingListUsedIntegerIndexDataSerializerTest, PopFrontData) { - PostingListUsedIntegerIndexDataSerializer serializer; +TEST(PostingListIntegerIndexSerializerTest, PopFrontData) { + PostingListIntegerIndexSerializer serializer; int size = 2 * serializer.GetMinPostingListSize(); std::unique_ptr<char[]> buf = std::make_unique<char[]>(size); |