diff options
author | Yen-Chieh Sung <sungyc@google.com> | 2023-07-06 18:01:13 -0700 |
---|---|---|
committer | Yen-Chieh Sung <sungyc@google.com> | 2023-07-06 19:39:11 -0700 |
commit | 4ac48569941d4f48b1a9bffdaf4abb2a3e8e9269 (patch) | |
tree | 076d8d961026f2800a6ca41106aa11d54527a29b | |
parent | f1f564329ad560f580c03f9e928057ad4e4fadc9 (diff) | |
download | icing-4ac48569941d4f48b1a9bffdaf4abb2a3e8e9269.tar.gz |
Update Icing from upstream.
Descriptions:
========================================================================
[Icing] Extend document store uri mapper
========================================================================
Implement dirty flag in PersistentStorage
========================================================================
[Icing][version 2][1/x] Implement ShouldRebuildDerivedFiles
========================================================================
[Icing][version 2][2/x] Bump kVersion to 2
========================================================================
[NumericSearch][optimization] Change numeric index bucket splitting threshold
========================================================================
[NumericSearch][optimization] Flag guard integer index bucket splitting threshold
========================================================================
Bug: 193919210
Bug: 280094535
Bug: 288969109
Bug: 259743562
NO_IFTTT="Path is only valid in G3."
Change-Id: Ie1e6f5969549f6c945ea32d03c724850ff5600a9
54 files changed, 2027 insertions, 740 deletions
diff --git a/icing/file/persistent-hash-map.cc b/icing/file/persistent-hash-map.cc index 729b09a..558c242 100644 --- a/icing/file/persistent-hash-map.cc +++ b/icing/file/persistent-hash-map.cc @@ -27,6 +27,7 @@ #include "icing/absl_ports/str_cat.h" #include "icing/file/file-backed-vector.h" #include "icing/file/memory-mapped-file.h" +#include "icing/file/persistent-storage.h" #include "icing/util/crc32.h" #include "icing/util/status-macros.h" @@ -167,6 +168,8 @@ PersistentHashMap::~PersistentHashMap() { libtextclassifier3::Status PersistentHashMap::Put(std::string_view key, const void* value) { + SetDirty(); + ICING_RETURN_IF_ERROR(ValidateKey(key)); ICING_ASSIGN_OR_RETURN( int32_t bucket_idx, @@ -207,6 +210,7 @@ libtextclassifier3::Status PersistentHashMap::GetOrPut(std::string_view key, FindEntryIndexByKey(bucket_idx, key)); if (idx_pair.target_entry_index == Entry::kInvalidIndex) { // If not found, then insert new key value pair. + SetDirty(); return Insert(bucket_idx, key, next_value); } @@ -232,6 +236,8 @@ libtextclassifier3::Status PersistentHashMap::Get(std::string_view key, } libtextclassifier3::Status PersistentHashMap::Delete(std::string_view key) { + SetDirty(); + ICING_RETURN_IF_ERROR(ValidateKey(key)); ICING_ASSIGN_OR_RETURN( int32_t bucket_idx, @@ -514,6 +520,7 @@ PersistentHashMap::InitializeExistingFiles(const Filesystem& filesystem, << " to " << persistent_hash_map->options_.max_load_factor_percent; + persistent_hash_map->SetInfoDirty(); persistent_hash_map->info().max_load_factor_percent = persistent_hash_map->options_.max_load_factor_percent; ICING_RETURN_IF_ERROR( @@ -525,26 +532,50 @@ PersistentHashMap::InitializeExistingFiles(const Filesystem& filesystem, return persistent_hash_map; } -libtextclassifier3::Status PersistentHashMap::PersistStoragesToDisk() { +libtextclassifier3::Status PersistentHashMap::PersistStoragesToDisk( + bool force) { + if (!force && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + ICING_RETURN_IF_ERROR(bucket_storage_->PersistToDisk()); ICING_RETURN_IF_ERROR(entry_storage_->PersistToDisk()); ICING_RETURN_IF_ERROR(kv_storage_->PersistToDisk()); + is_storage_dirty_ = false; return libtextclassifier3::Status::OK; } -libtextclassifier3::Status PersistentHashMap::PersistMetadataToDisk() { +libtextclassifier3::Status PersistentHashMap::PersistMetadataToDisk( + bool force) { + // We can skip persisting metadata to disk only if both info and storage are + // clean. + if (!force && !is_info_dirty() && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + // Changes should have been applied to the underlying file when using // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an // extra safety step to ensure they are written out. - return metadata_mmapped_file_->PersistToDisk(); + ICING_RETURN_IF_ERROR(metadata_mmapped_file_->PersistToDisk()); + is_info_dirty_ = false; + return libtextclassifier3::Status::OK; } -libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeInfoChecksum() { +libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeInfoChecksum( + bool force) { + if (!force && !is_info_dirty()) { + return Crc32(crcs().component_crcs.info_crc); + } + return info().ComputeChecksum(); } -libtextclassifier3::StatusOr<Crc32> -PersistentHashMap::ComputeStoragesChecksum() { +libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeStoragesChecksum( + bool force) { + if (!force && !is_storage_dirty()) { + return Crc32(crcs().component_crcs.storages_crc); + } + // Compute crcs ICING_ASSIGN_OR_RETURN(Crc32 bucket_storage_crc, bucket_storage_->ComputeChecksum()); @@ -602,6 +633,8 @@ libtextclassifier3::Status PersistentHashMap::CopyEntryValue( libtextclassifier3::Status PersistentHashMap::Insert(int32_t bucket_idx, std::string_view key, const void* value) { + SetDirty(); + // If entry_storage_->num_elements() + 1 exceeds options_.max_num_entries, // then return error. // We compute max_file_size of 3 storages by options_.max_num_entries. Since @@ -655,6 +688,8 @@ libtextclassifier3::Status PersistentHashMap::RehashIfNecessary( return libtextclassifier3::Status::OK; } + SetDirty(); + // Resize and reset buckets. ICING_RETURN_IF_ERROR( bucket_storage_->Set(0, new_num_bucket, Bucket(Entry::kInvalidIndex))); diff --git a/icing/file/persistent-hash-map.h b/icing/file/persistent-hash-map.h index 845b22a..5f7999d 100644 --- a/icing/file/persistent-hash-map.h +++ b/icing/file/persistent-hash-map.h @@ -394,7 +394,9 @@ class PersistentHashMap : public PersistentStorage { std::move(metadata_mmapped_file))), bucket_storage_(std::move(bucket_storage)), entry_storage_(std::move(entry_storage)), - kv_storage_(std::move(kv_storage)) {} + kv_storage_(std::move(kv_storage)), + is_info_dirty_(false), + is_storage_dirty_(false) {} static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path, @@ -409,20 +411,20 @@ class PersistentHashMap : public PersistentStorage { // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistStoragesToDisk() override; + libtextclassifier3::Status PersistStoragesToDisk(bool force) override; // Flushes contents of metadata file. // // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistMetadataToDisk() override; + libtextclassifier3::Status PersistMetadataToDisk(bool force) override; // Computes and returns Info checksum. // // Returns: // - Crc of the Info on success - libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override; // Computes and returns all storages checksum. Checksums of bucket_storage_, // entry_storage_ and kv_storage_ will be combined together by XOR. @@ -430,7 +432,8 @@ class PersistentHashMap : public PersistentStorage { // Returns: // - Crc of all storages on success // - INTERNAL_ERROR if any data inconsistency - libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) override; // Find the index of the target entry (that contains the key) from a bucket // (specified by bucket index). Also return the previous entry index, since @@ -496,6 +499,17 @@ class PersistentHashMap : public PersistentStorage { kInfoMetadataFileOffset); } + void SetInfoDirty() { is_info_dirty_ = true; } + // When storage is dirty, we have to set info dirty as well. So just expose + // SetDirty to set both. + void SetDirty() { + is_info_dirty_ = true; + is_storage_dirty_ = true; + } + + bool is_info_dirty() const { return is_info_dirty_; } + bool is_storage_dirty() const { return is_storage_dirty_; } + Options options_; std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_; @@ -504,6 +518,9 @@ class PersistentHashMap : public PersistentStorage { std::unique_ptr<FileBackedVector<Bucket>> bucket_storage_; std::unique_ptr<FileBackedVector<Entry>> entry_storage_; std::unique_ptr<FileBackedVector<char>> kv_storage_; + + bool is_info_dirty_; + bool is_storage_dirty_; }; } // namespace lib diff --git a/icing/file/persistent-storage.h b/icing/file/persistent-storage.h index 727cae9..9cb5e4d 100644 --- a/icing/file/persistent-storage.h +++ b/icing/file/persistent-storage.h @@ -148,8 +148,9 @@ class PersistentStorage { return libtextclassifier3::Status::OK; } - ICING_RETURN_IF_ERROR(UpdateChecksumsInternal()); - ICING_RETURN_IF_ERROR(PersistMetadataToDisk()); + ICING_RETURN_IF_ERROR(UpdateChecksumsInternal(/*force=*/true)); + ICING_RETURN_IF_ERROR(PersistStoragesToDisk(/*force=*/true)); + ICING_RETURN_IF_ERROR(PersistMetadataToDisk(/*force=*/true)); is_initialized_ = true; return libtextclassifier3::Status::OK; @@ -184,38 +185,52 @@ class PersistentStorage { // 2) Updates all checksums by new data. // 3) Flushes metadata. // + // Force flag will be passed down to PersistMetadataToDisk, + // PersistStoragesToDisk, ComputeInfoChecksum, ComputeStoragesChecksum. + // - If force == true, then performs actual persisting operations/recomputes + // the checksum. + // - Otherwise, the derived class can decide itself whether skipping + // persisting operations/doing lazy checksum recomputing if the storage is + // not dirty. + // // Returns: // - OK on success // - FAILED_PRECONDITION_ERROR if PersistentStorage is uninitialized // - Any errors from PersistStoragesToDisk, UpdateChecksums, // PersistMetadataToDisk, depending on actual implementation - libtextclassifier3::Status PersistToDisk() { + libtextclassifier3::Status PersistToDisk(bool force = false) { if (!is_initialized_) { return absl_ports::FailedPreconditionError(absl_ports::StrCat( "PersistentStorage ", working_path_, " not initialized")); } - ICING_RETURN_IF_ERROR(PersistStoragesToDisk()); - ICING_RETURN_IF_ERROR(UpdateChecksums()); - ICING_RETURN_IF_ERROR(PersistMetadataToDisk()); + ICING_RETURN_IF_ERROR(UpdateChecksumsInternal(force)); + ICING_RETURN_IF_ERROR(PersistStoragesToDisk(force)); + ICING_RETURN_IF_ERROR(PersistMetadataToDisk(force)); return libtextclassifier3::Status::OK; } // Updates checksums of all components and returns the overall crc (all_crc) // of the persistent storage. // + // Force flag will be passed down ComputeInfoChecksum, + // ComputeStoragesChecksum. + // - If force == true, then recomputes the checksum. + // - Otherwise, the derived class can decide itself whether doing lazy + // checksum recomputing if the storage is not dirty. + // // Returns: // - Overall crc of the persistent storage on success // - FAILED_PRECONDITION_ERROR if PersistentStorage is uninitialized // - Any errors from ComputeInfoChecksum, ComputeStoragesChecksum, depending // on actual implementation - libtextclassifier3::StatusOr<Crc32> UpdateChecksums() { + libtextclassifier3::StatusOr<Crc32> UpdateChecksums(bool force = false) { if (!is_initialized_) { return absl_ports::FailedPreconditionError(absl_ports::StrCat( "PersistentStorage ", working_path_, " not initialized")); } - return UpdateChecksumsInternal(); + return UpdateChecksumsInternal(force); } protected: @@ -234,33 +249,41 @@ class PersistentStorage { // Returns: // - OK on success // - Any other errors, depending on actual implementation - virtual libtextclassifier3::Status PersistMetadataToDisk() = 0; + virtual libtextclassifier3::Status PersistMetadataToDisk(bool force) = 0; // Flushes contents of all storages to underlying files. // // Returns: // - OK on success // - Any other errors, depending on actual implementation - virtual libtextclassifier3::Status PersistStoragesToDisk() = 0; + virtual libtextclassifier3::Status PersistStoragesToDisk(bool force) = 0; // Computes and returns Info checksum. + // - If force = true, then recompute the entire checksum. + // - Otherwise, the derived class can decide itself whether doing lazy + // checksum computing if the storage is not dirty. // // This function will be mainly called by UpdateChecksums. // // Returns: // - Crc of the Info on success // - Any other errors, depending on actual implementation - virtual libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() = 0; + virtual libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum( + bool force) = 0; // Computes and returns all storages checksum. If there are multiple storages, // usually we XOR their checksums together to a single checksum. + // - If force = true, then recompute the entire checksum. + // - Otherwise, the derived class can decide itself whether doing lazy + // checksum computing if the storage is not dirty. // // This function will be mainly called by UpdateChecksums. // // Returns: // - Crc of all storages on success // - Any other errors from depending on actual implementation - virtual libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() = 0; + virtual libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) = 0; // Returns the Crcs instance reference. The derived class can either own a // concrete Crcs instance, or reinterpret_cast the memory-mapped region to @@ -292,11 +315,18 @@ class PersistentStorage { // - Overall crc of the persistent storage on success // - Any errors from ComputeInfoChecksum, ComputeStoragesChecksum, depending // on actual implementation - libtextclassifier3::StatusOr<Crc32> UpdateChecksumsInternal() { + libtextclassifier3::StatusOr<Crc32> UpdateChecksumsInternal(bool force) { Crcs& crcs_ref = crcs(); // Compute and update storages + info checksums. - ICING_ASSIGN_OR_RETURN(Crc32 info_crc, ComputeInfoChecksum()); - ICING_ASSIGN_OR_RETURN(Crc32 storages_crc, ComputeStoragesChecksum()); + ICING_ASSIGN_OR_RETURN(Crc32 info_crc, ComputeInfoChecksum(force)); + ICING_ASSIGN_OR_RETURN(Crc32 storages_crc, ComputeStoragesChecksum(force)); + if (crcs_ref.component_crcs.info_crc == info_crc.Get() && + crcs_ref.component_crcs.storages_crc == storages_crc.Get()) { + // If info and storages crc haven't changed, then we don't have to update + // checksums. + return Crc32(crcs_ref.all_crc); + } + crcs_ref.component_crcs.info_crc = info_crc.Get(); crcs_ref.component_crcs.storages_crc = storages_crc.Get(); @@ -318,12 +348,13 @@ class PersistentStorage { return absl_ports::FailedPreconditionError("Invalid all crc"); } - ICING_ASSIGN_OR_RETURN(Crc32 info_crc, ComputeInfoChecksum()); + ICING_ASSIGN_OR_RETURN(Crc32 info_crc, ComputeInfoChecksum(/*force=*/true)); if (crcs_ref.component_crcs.info_crc != info_crc.Get()) { return absl_ports::FailedPreconditionError("Invalid info crc"); } - ICING_ASSIGN_OR_RETURN(Crc32 storages_crc, ComputeStoragesChecksum()); + ICING_ASSIGN_OR_RETURN(Crc32 storages_crc, + ComputeStoragesChecksum(/*force=*/true)); if (crcs_ref.component_crcs.storages_crc != storages_crc.Get()) { return absl_ports::FailedPreconditionError("Invalid storages crc"); } diff --git a/icing/file/posting_list/flash-index-storage.cc b/icing/file/posting_list/flash-index-storage.cc index cd7ac12..21fea8a 100644 --- a/icing/file/posting_list/flash-index-storage.cc +++ b/icing/file/posting_list/flash-index-storage.cc @@ -487,6 +487,9 @@ libtextclassifier3::Status FlashIndexStorage::FreePostingList( PostingListHolder&& holder) { ICING_ASSIGN_OR_RETURN(IndexBlock block, GetIndexBlock(holder.id.block_index())); + if (block.posting_list_bytes() == max_posting_list_bytes()) { + block.SetNextBlockIndex(kInvalidBlockIndex); + } uint32_t posting_list_bytes = block.posting_list_bytes(); int best_block_info_index = FindBestIndexBlockInfo(posting_list_bytes); diff --git a/icing/file/version-util.cc b/icing/file/version-util.cc index f477072..7684262 100644 --- a/icing/file/version-util.cc +++ b/icing/file/version-util.cc @@ -102,6 +102,44 @@ StateChange GetVersionStateChange(const VersionInfo& existing_version_info, } } +bool ShouldRebuildDerivedFiles(const VersionInfo& existing_version_info, + int32_t curr_version) { + StateChange state_change = + GetVersionStateChange(existing_version_info, curr_version); + switch (state_change) { + case StateChange::kCompatible: + return false; + case StateChange::kUndetermined: + [[fallthrough]]; + case StateChange::kRollBack: + [[fallthrough]]; + case StateChange::kRollForward: + [[fallthrough]]; + case StateChange::kVersionZeroRollForward: + [[fallthrough]]; + case StateChange::kVersionZeroUpgrade: + return true; + case StateChange::kUpgrade: + break; + } + + bool should_rebuild = false; + int32_t existing_version = existing_version_info.version; + while (existing_version < curr_version) { + switch (existing_version) { + case 1: { + // version 1 -> version 2 upgrade, no need to rebuild + break; + } + default: + // This should not happen. Rebuild anyway if unsure. + should_rebuild |= true; + } + ++existing_version; + } + return should_rebuild; +} + } // namespace version_util } // namespace lib diff --git a/icing/file/version-util.h b/icing/file/version-util.h index 7fa7fbd..30c457d 100644 --- a/icing/file/version-util.h +++ b/icing/file/version-util.h @@ -28,9 +28,16 @@ namespace lib { namespace version_util { // - Version 0: Android T. Can be identified only by flash index magic. -// - Version 1: mainline release 2023-06. -inline static constexpr int32_t kVersion = 1; +// - Version 1: Android U release 2023-06. +// - Version 2: Android U 1st mainline release 2023-09. Schema is compatible +// with version 1. +// TODO(b/288969109): bump kVersion to 2 before finalizing the 1st Android U +// mainline release. +// LINT.IfChange(kVersion) +inline static constexpr int32_t kVersion = 2; +// LINT.ThenChange(//depot/google3/icing/schema/schema-store.cc:min_overlay_version_compatibility) inline static constexpr int32_t kVersionOne = 1; +inline static constexpr int32_t kVersionTwo = 2; inline static constexpr int kVersionZeroFlashIndexMagic = 0x6dfba6ae; @@ -89,6 +96,16 @@ libtextclassifier3::Status WriteVersion(const Filesystem& filesystem, StateChange GetVersionStateChange(const VersionInfo& existing_version_info, int32_t curr_version = kVersion); +// Helper method to determine whether Icing should rebuild all derived files. +// Sometimes it is not required to rebuild derived files when +// roll-forward/upgrading. This function "encodes" upgrade paths and checks if +// the roll-forward/upgrading requires derived files to be rebuilt or not. +// +// REQUIRES: curr_version > 0. We implement version checking in version 1, so +// the callers (except unit tests) will always use a version # greater than 0. +bool ShouldRebuildDerivedFiles(const VersionInfo& existing_version_info, + int32_t curr_version = kVersion); + } // namespace version_util } // namespace lib diff --git a/icing/file/version-util_test.cc b/icing/file/version-util_test.cc index 78cdb7d..e94c351 100644 --- a/icing/file/version-util_test.cc +++ b/icing/file/version-util_test.cc @@ -32,6 +32,8 @@ namespace version_util { namespace { using ::testing::Eq; +using ::testing::IsFalse; +using ::testing::IsTrue; struct VersionUtilReadVersionTestParam { std::optional<VersionInfo> existing_version_info; @@ -339,6 +341,14 @@ INSTANTIATE_TEST_SUITE_P( /*curr_version_in=*/2, /*expected_state_change_in=*/StateChange::kRollForward), + // - version 1, max_version 2 + // - Current version = 3 + // - Result: roll forward + VersionUtilStateChangeTestParam( + /*existing_version_info_in=*/VersionInfo(1, 2), + /*curr_version_in=*/3, + /*expected_state_change_in=*/StateChange::kRollForward), + // - version 1, max_version 3 // - Current version = 2 // - Result: roll forward @@ -379,6 +389,84 @@ INSTANTIATE_TEST_SUITE_P( /*curr_version_in=*/2, /*expected_state_change_in=*/StateChange::kRollBack))); +TEST(VersionUtilTest, ShouldRebuildDerivedFilesUndeterminedVersion) { + EXPECT_THAT( + ShouldRebuildDerivedFiles(VersionInfo(-1, -1), /*curr_version=*/1), + IsTrue()); + EXPECT_THAT( + ShouldRebuildDerivedFiles(VersionInfo(-1, -1), /*curr_version=*/2), + IsTrue()); +} + +TEST(VersionUtilTest, ShouldRebuildDerivedFilesVersionZeroUpgrade) { + // 0 -> 1 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 0), /*curr_version=*/1), + IsTrue()); + + // 0 -> 2 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 0), /*curr_version=*/2), + IsTrue()); +} + +TEST(VersionUtilTest, ShouldRebuildDerivedFilesVersionZeroRollForward) { + // (1 -> 0), 0 -> 1 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 1), /*curr_version=*/1), + IsTrue()); + + // (1 -> 0), 0 -> 2 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 1), /*curr_version=*/2), + IsTrue()); + + // (2 -> 0), 0 -> 1 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 2), /*curr_version=*/1), + IsTrue()); +} + +TEST(VersionUtilTest, ShouldRebuildDerivedFilesRollBack) { + // 2 -> 1 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(2, 2), /*curr_version=*/1), + IsTrue()); + + // 3 -> 1 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(3, 3), /*curr_version=*/1), + IsTrue()); + + // (3 -> 2), 2 -> 1 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(2, 3), /*curr_version=*/1), + IsTrue()); +} + +TEST(VersionUtilTest, ShouldRebuildDerivedFilesRollForward) { + // (2 -> 1), 1 -> 2 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(1, 2), /*curr_version=*/2), + IsTrue()); + + // (2 -> 1), 1 -> 3 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(1, 2), /*curr_version=*/3), + IsTrue()); + + // (3 -> 1), 1 -> 2 + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(1, 3), /*curr_version=*/2), + IsTrue()); +} + +TEST(VersionUtilTest, ShouldRebuildDerivedFilesCompatible) { + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(2, 2), /*curr_version=*/2), + IsFalse()); + + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(2, 3), /*curr_version=*/2), + IsFalse()); +} + +TEST(VersionUtilTest, ShouldRebuildDerivedFilesUpgrade) { + // Unlike other state changes, upgrade depends on the actual "encoded path". + + // kVersionOne -> kVersionTwo + EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(kVersionOne, kVersionOne), + /*curr_version=*/kVersionTwo), + IsFalse()); +} + } // namespace } // namespace version_util diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc index 3ffb297..467c943 100644 --- a/icing/icing-search-engine.cc +++ b/icing/icing-search-engine.cc @@ -592,8 +592,10 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( filesystem_.get(), MakeSchemaDirectoryPath(options_.base_dir()), version_state_change, version_util::kVersion)); - // Step 2: discard all derived data - ICING_RETURN_IF_ERROR(DiscardDerivedFiles()); + // Step 2: discard all derived data if needed rebuild. + if (version_util::ShouldRebuildDerivedFiles(version_info)) { + ICING_RETURN_IF_ERROR(DiscardDerivedFiles()); + } // Step 3: update version file version_util::VersionInfo new_version_info( @@ -670,6 +672,7 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( ICING_ASSIGN_OR_RETURN( integer_index_, IntegerIndex::Create(*filesystem_, std::move(integer_index_dir), + options_.integer_index_bucket_split_threshold(), options_.pre_mapping_fbv())); // Discard qualified id join index directory and instantiate a new one. @@ -774,11 +777,12 @@ libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore( } ICING_ASSIGN_OR_RETURN( DocumentStore::CreateResult create_result, - DocumentStore::Create(filesystem_.get(), document_dir, clock_.get(), - schema_store_.get(), - force_recovery_and_revalidate_documents, - options_.document_store_namespace_id_fingerprint(), - options_.compression_level(), initialize_stats)); + DocumentStore::Create( + filesystem_.get(), document_dir, clock_.get(), schema_store_.get(), + force_recovery_and_revalidate_documents, + options_.document_store_namespace_id_fingerprint(), + options_.pre_mapping_fbv(), options_.use_persistent_hash_map(), + options_.compression_level(), initialize_stats)); document_store_ = std::move(create_result.document_store); return libtextclassifier3::Status::OK; @@ -825,8 +829,10 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex( std::string integer_index_dir = MakeIntegerIndexWorkingPath(options_.base_dir()); InitializeStatsProto::RecoveryCause integer_index_recovery_cause; - auto integer_index_or = IntegerIndex::Create(*filesystem_, integer_index_dir, - options_.pre_mapping_fbv()); + auto integer_index_or = + IntegerIndex::Create(*filesystem_, integer_index_dir, + options_.integer_index_bucket_split_threshold(), + options_.pre_mapping_fbv()); if (!integer_index_or.ok()) { ICING_RETURN_IF_ERROR( IntegerIndex::Discard(*filesystem_, integer_index_dir)); @@ -837,6 +843,7 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex( ICING_ASSIGN_OR_RETURN( integer_index_, IntegerIndex::Create(*filesystem_, std::move(integer_index_dir), + options_.integer_index_bucket_split_threshold(), options_.pre_mapping_fbv())); } else { // Integer index was created fine. @@ -2250,8 +2257,7 @@ IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) { // Copies valid document data to tmp directory libtextclassifier3::StatusOr<std::vector<DocumentId>> document_id_old_to_new_or = document_store_->OptimizeInto( - temporary_document_dir, language_segmenter_.get(), - options_.document_store_namespace_id_fingerprint(), optimize_stats); + temporary_document_dir, language_segmenter_.get(), optimize_stats); // Handles error if any if (!document_id_old_to_new_or.ok()) { @@ -2289,6 +2295,7 @@ IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) { filesystem_.get(), current_document_dir, clock_.get(), schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false, options_.document_store_namespace_id_fingerprint(), + options_.pre_mapping_fbv(), options_.use_persistent_hash_map(), options_.compression_level(), /*initialize_stats=*/nullptr); // TODO(b/144458732): Implement a more robust version of // TC_ASSIGN_OR_RETURN that can support error logging. @@ -2316,6 +2323,7 @@ IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) { filesystem_.get(), current_document_dir, clock_.get(), schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false, options_.document_store_namespace_id_fingerprint(), + options_.pre_mapping_fbv(), options_.use_persistent_hash_map(), options_.compression_level(), /*initialize_stats=*/nullptr); if (!create_result_or.ok()) { // Unable to create DocumentStore from the new file. Mark as uninitialized diff --git a/icing/icing-search-engine_benchmark.cc b/icing/icing-search-engine_benchmark.cc index fb44595..354d11c 100644 --- a/icing/icing-search-engine_benchmark.cc +++ b/icing/icing-search-engine_benchmark.cc @@ -37,6 +37,7 @@ #include "icing/join/join-processor.h" #include "icing/proto/document.pb.h" #include "icing/proto/initialize.pb.h" +#include "icing/proto/persist.pb.h" #include "icing/proto/reset.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/scoring.pb.h" @@ -1214,6 +1215,58 @@ void BM_JoinQueryQualifiedId(benchmark::State& state) { } BENCHMARK(BM_JoinQueryQualifiedId); +void BM_PersistToDisk(benchmark::State& state) { + // Initialize the filesystem + std::string test_dir = GetTestTempDir() + "/icing/benchmark"; + Filesystem filesystem; + DestructibleDirectory ddir(filesystem, test_dir); + + // Create the schema. + std::default_random_engine random; + int num_types = kAvgNumNamespaces * kAvgNumTypes; + ExactStringPropertyGenerator property_generator; + SchemaGenerator<ExactStringPropertyGenerator> schema_generator( + /*num_properties=*/state.range(1), &property_generator); + SchemaProto schema = schema_generator.GenerateSchema(num_types); + EvenDistributionTypeSelector type_selector(schema); + + // Generate documents. + int num_docs = state.range(0); + std::vector<std::string> language = CreateLanguages(kLanguageSize, &random); + const std::vector<DocumentProto> random_docs = + GenerateRandomDocuments(&type_selector, num_docs, language); + + for (auto _ : state) { + state.PauseTiming(); + // Create the index. + IcingSearchEngineOptions options; + options.set_base_dir(test_dir); + options.set_index_merge_size(kIcingFullIndexSize); + options.set_use_persistent_hash_map(true); + std::unique_ptr<IcingSearchEngine> icing = + std::make_unique<IcingSearchEngine>(options); + + ASSERT_THAT(icing->Reset().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + + for (const DocumentProto& doc : random_docs) { + ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk()); + } + + state.ResumeTiming(); + + ASSERT_THAT(icing->PersistToDisk(PersistType::FULL).status(), ProtoIsOk()); + + state.PauseTiming(); + icing.reset(); + ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); + state.ResumeTiming(); + } +} +BENCHMARK(BM_PersistToDisk) + // Arguments: num_indexed_documents, num_sections + ->ArgPair(1024, 5); + } // namespace } // namespace lib diff --git a/icing/icing-search-engine_initialization_test.cc b/icing/icing-search-engine_initialization_test.cc index 64bcc98..d6c8de8 100644 --- a/icing/icing-search-engine_initialization_test.cc +++ b/icing/icing-search-engine_initialization_test.cc @@ -88,6 +88,7 @@ using ::testing::Eq; using ::testing::HasSubstr; using ::testing::IsEmpty; using ::testing::Matcher; +using ::testing::Ne; using ::testing::Return; using ::testing::SizeIs; @@ -1060,13 +1061,14 @@ TEST_F(IcingSearchEngineInitializationTest, // Puts message2 into DocumentStore but doesn't index it. ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(filesystem(), GetDocumentDir(), &fake_clock, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + filesystem(), GetDocumentDir(), &fake_clock, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); std::unique_ptr<DocumentStore> document_store = std::move(create_result.document_store); @@ -1493,6 +1495,108 @@ TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIntegerIndex) { } TEST_F(IcingSearchEngineInitializationTest, + RecoverFromIntegerIndexBucketSplitThresholdChange) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddInt64Property("indexableInteger", 123) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + // 1. Create an index with a message document. + { + TestIcingSearchEngine icing( + GetDefaultIcingOptions(), std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), + GetTestJniCache()); + + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + } + + // 2. Create the index again with different + // integer_index_bucket_split_threshold. This should trigger index + // restoration. + { + // Mock filesystem to observe and check the behavior of all indices. + auto mock_filesystem = std::make_unique<MockFilesystem>(); + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) + .WillRepeatedly(DoDefault()); + // Ensure term index directory should never be discarded. + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/index_dir"))) + .Times(0); + // Ensure integer index directory should be discarded once, and Clear() + // should never be called (i.e. storage sub directory + // "*/integer_index_dir/*" should never be discarded) since we start it from + // scratch. + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/integer_index_dir"))) + .Times(1); + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) + .Times(0); + // Ensure qualified id join index directory should never be discarded, and + // Clear() should never be called (i.e. storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded). + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + EXPECT_CALL( + *mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); + + static constexpr int32_t kNewIntegerIndexBucketSplitThreshold = 1000; + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + ASSERT_THAT(kNewIntegerIndexBucketSplitThreshold, + Ne(options.integer_index_bucket_split_threshold())); + options.set_integer_index_bucket_split_threshold( + kNewIntegerIndexBucketSplitThreshold); + + TestIcingSearchEngine icing(options, std::move(mock_filesystem), + std::make_unique<IcingFilesystem>(), + std::make_unique<FakeClock>(), + GetTestJniCache()); + InitializeResultProto initialize_result = icing.Initialize(); + ASSERT_THAT(initialize_result.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT( + initialize_result.initialize_stats().integer_index_restoration_cause(), + Eq(InitializeStatsProto::IO_ERROR)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + + // Verify integer index works normally + SearchSpecProto search_spec; + search_spec.set_query("indexableInteger == 123"); + search_spec.set_search_type( + SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY); + search_spec.add_enabled_features(std::string(kNumericSearchFeature)); + + SearchResultProto results = + icing.Search(search_spec, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + ASSERT_THAT(results.results(), SizeIs(1)); + EXPECT_THAT(results.results(0).document().uri(), Eq("message/1")); + } +} + +TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptQualifiedIdJoinIndex) { // Test the following scenario: qualified id join index is corrupted (e.g. // checksum doesn't match). IcingSearchEngine should be able to recover @@ -3197,6 +3301,7 @@ TEST_F(IcingSearchEngineInitializationTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem, GetIntegerIndexDir(), + /*num_data_threshold_for_bucket_split=*/65536, /*pre_mapping_fbv=*/false)); // Add hits for document 0. ASSERT_THAT(integer_index->last_added_document_id(), kInvalidDocumentId); @@ -3376,6 +3481,7 @@ TEST_F(IcingSearchEngineInitializationTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem, GetIntegerIndexDir(), + /*num_data_threshold_for_bucket_split=*/65536, /*pre_mapping_fbv=*/false)); // Add hits for document 4. DocumentId original_last_added_doc_id = @@ -5122,13 +5228,14 @@ TEST_P(IcingSearchEngineInitializationVersionChangeTest, // Put message into DocumentStore ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(filesystem(), GetDocumentDir(), &fake_clock, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + filesystem(), GetDocumentDir(), &fake_clock, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); std::unique_ptr<DocumentStore> document_store = std::move(create_result.document_store); ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, document_store->Put(message)); @@ -5142,6 +5249,7 @@ TEST_P(IcingSearchEngineInitializationVersionChangeTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(*filesystem(), GetIntegerIndexDir(), + /*num_data_threshold_for_bucket_split=*/65536, /*pre_mapping_fbv=*/false)); ICING_ASSERT_OK_AND_ASSIGN( @@ -5299,12 +5407,8 @@ INSTANTIATE_TEST_SUITE_P( /*version_in=*/version_util::kVersion + 1, /*max_version_in=*/version_util::kVersion + 1), - // Manually change existing data set's version to kVersion - 1 and - // max_version to kVersion - 1. When initializing, it will detect - // "upgrade". - version_util::VersionInfo( - /*version_in=*/version_util::kVersion - 1, - /*max_version_in=*/version_util::kVersion - 1), + // Currently we don't have any "upgrade" that requires rebuild derived + // files, so skip this case until we have a case for it. // Manually change existing data set's version to kVersion - 1 and // max_version to kVersion. When initializing, it will detect "roll diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc index b6d3c29..9fee925 100644 --- a/icing/index/index-processor_benchmark.cc +++ b/icing/index/index-processor_benchmark.cc @@ -227,6 +227,7 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumericIndex<int64_t>> integer_index, IntegerIndex::Create(filesystem, integer_index_dir, + IntegerIndex::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv=*/true)); language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = @@ -302,6 +303,7 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumericIndex<int64_t>> integer_index, IntegerIndex::Create(filesystem, integer_index_dir, + IntegerIndex::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv=*/true)); language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = @@ -378,6 +380,7 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumericIndex<int64_t>> integer_index, IntegerIndex::Create(filesystem, integer_index_dir, + IntegerIndex::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv=*/true)); language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = @@ -454,6 +457,7 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumericIndex<int64_t>> integer_index, IntegerIndex::Create(filesystem, integer_index_dir, + IntegerIndex::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv=*/true)); language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index 9505dbd..c2eaf62 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -172,8 +172,11 @@ class IndexProcessorTest : public Test { index_, Index::Create(options, &filesystem_, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( - integer_index_, IntegerIndex::Create(filesystem_, integer_index_dir_, - /*pre_mapping_fbv=*/false)); + integer_index_, + IntegerIndex::Create( + filesystem_, integer_index_dir_, + IntegerIndex::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv=*/false)); ICING_ASSERT_OK_AND_ASSIGN( qualified_id_join_index_, @@ -277,13 +280,14 @@ class IndexProcessorTest : public Test { ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str())); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, doc_store_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); doc_store_ = std::move(create_result.document_store); ICING_ASSERT_OK_AND_ASSIGN( diff --git a/icing/index/integer-section-indexing-handler_test.cc b/icing/index/integer-section-indexing-handler_test.cc index 96e21ca..91cc06f 100644 --- a/icing/index/integer-section-indexing-handler_test.cc +++ b/icing/index/integer-section-indexing-handler_test.cc @@ -106,6 +106,7 @@ class IntegerSectionIndexingHandlerTest : public ::testing::Test { ICING_ASSERT_OK_AND_ASSIGN( integer_index_, IntegerIndex::Create(filesystem_, integer_index_working_path_, + /*num_data_threshold_for_bucket_split=*/65536, /*pre_mapping_fbv=*/false)); language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); @@ -169,6 +170,8 @@ class IntegerSectionIndexingHandlerTest : public ::testing::Test { schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false, /*namespace_id_fingerprint=*/false, + /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, PortableFileBackedProtoLog< DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr)); diff --git a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc index d8839dc..d93fd02 100644 --- a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc @@ -55,7 +55,8 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore( return DocumentStore::Create( filesystem, base_dir, clock, schema_store, /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr); } diff --git a/icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc b/icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc index df5ddf5..47f5cc5 100644 --- a/icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc @@ -97,13 +97,14 @@ class DocHitInfoIteratorPropertyInSchemaTest : public ::testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, test_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); } diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc index c765e6d..47f55aa 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc @@ -101,13 +101,14 @@ class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, test_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); } diff --git a/icing/index/numeric/dummy-numeric-index.h b/icing/index/numeric/dummy-numeric-index.h index 2c077a2..34b1acb 100644 --- a/icing/index/numeric/dummy-numeric-index.h +++ b/icing/index/numeric/dummy-numeric-index.h @@ -185,19 +185,20 @@ class DummyNumericIndex : public NumericIndex<T> { PersistentStorage::WorkingPathType::kDummy), last_added_document_id_(kInvalidDocumentId) {} - libtextclassifier3::Status PersistStoragesToDisk() override { + libtextclassifier3::Status PersistStoragesToDisk(bool force) override { return libtextclassifier3::Status::OK; } - libtextclassifier3::Status PersistMetadataToDisk() override { + libtextclassifier3::Status PersistMetadataToDisk(bool force) override { return libtextclassifier3::Status::OK; } - libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() override { + libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override { return Crc32(0); } - libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() override { + libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) override { return Crc32(0); } diff --git a/icing/index/numeric/integer-index-bucket-util.h b/icing/index/numeric/integer-index-bucket-util.h index 863bd01..d6fc245 100644 --- a/icing/index/numeric/integer-index-bucket-util.h +++ b/icing/index/numeric/integer-index-bucket-util.h @@ -61,7 +61,7 @@ struct DataRangeAndBucketInfo { // - Data slice (i.e. [start, end)) can be empty. // // REQUIRES: -// - original_key_lower <= original_key_upper +// - original_key_lower < original_key_upper // - num_data_threshold > 0 // - Keys of all data are in range [original_key_lower, original_key_upper] // diff --git a/icing/index/numeric/integer-index-storage.cc b/icing/index/numeric/integer-index-storage.cc index fa62b19..f0212da 100644 --- a/icing/index/numeric/integer-index-storage.cc +++ b/icing/index/numeric/integer-index-storage.cc @@ -45,6 +45,7 @@ #include "icing/index/numeric/posting-list-integer-index-serializer.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/util/crc32.h" #include "icing/util/status-macros.h" namespace icing { @@ -311,6 +312,11 @@ libtextclassifier3::Status IntegerIndexStorageIterator::Advance() { } bool IntegerIndexStorage::Options::IsValid() const { + if (num_data_threshold_for_bucket_split <= + kMinNumDataThresholdForBucketSplit) { + return false; + } + if (!HasCustomInitBuckets()) { return true; } @@ -403,12 +409,20 @@ libtextclassifier3::Status IntegerIndexStorage::AddKeys( return libtextclassifier3::Status::OK; } + SetDirty(); + std::sort(new_keys.begin(), new_keys.end()); // Dedupe auto last = std::unique(new_keys.begin(), new_keys.end()); new_keys.erase(last, new_keys.end()); + if (static_cast<int32_t>(new_keys.size()) > + std::numeric_limits<int32_t>::max() - info().num_data) { + return absl_ports::ResourceExhaustedError( + "# of keys in this integer index storage exceed the limit"); + } + // When adding keys into a bucket, we potentially split it into 2 new buckets // and one of them will be added into the unsorted bucket array. // When handling keys belonging to buckets in the unsorted bucket array, we @@ -649,6 +663,9 @@ libtextclassifier3::Status IntegerIndexStorage::TransferIndex( return lhs.get() < rhs.get(); }); + const int32_t num_data_threshold_for_bucket_merge = + kNumDataThresholdRatioForBucketMerge * + new_storage->options_.num_data_threshold_for_bucket_split; int64_t curr_key_lower = std::numeric_limits<int64_t>::min(); int64_t curr_key_upper = std::numeric_limits<int64_t>::min(); std::vector<IntegerIndexData> accumulated_data; @@ -687,7 +704,7 @@ libtextclassifier3::Status IntegerIndexStorage::TransferIndex( // - Flush accumulated_data and create a new bucket for them. // - OR merge new_data into accumulated_data and go to the next round. if (!accumulated_data.empty() && accumulated_data.size() + new_data.size() > - kNumDataThresholdForBucketMerge) { + num_data_threshold_for_bucket_merge) { // TODO(b/259743562): [Optimization 3] adjust upper bound to fit more data // from new_data to accumulated_data. ICING_RETURN_IF_ERROR(FlushDataIntoNewSortedBucket( @@ -879,9 +896,11 @@ IntegerIndexStorage::InitializeExistingFiles( IntegerIndexStorage::FlushDataIntoNewSortedBucket( int64_t key_lower, int64_t key_upper, std::vector<IntegerIndexData>&& data, IntegerIndexStorage* storage) { + storage->SetDirty(); + if (data.empty()) { - return storage->sorted_buckets_->Append( - Bucket(key_lower, key_upper, PostingListIdentifier::kInvalid)); + return storage->sorted_buckets_->Append(Bucket( + key_lower, key_upper, PostingListIdentifier::kInvalid, /*num_data=*/0)); } ICING_ASSIGN_OR_RETURN( @@ -891,10 +910,16 @@ IntegerIndexStorage::FlushDataIntoNewSortedBucket( data.end())); storage->info().num_data += data.size(); - return storage->sorted_buckets_->Append(Bucket(key_lower, key_upper, pl_id)); + return storage->sorted_buckets_->Append( + Bucket(key_lower, key_upper, pl_id, data.size())); } -libtextclassifier3::Status IntegerIndexStorage::PersistStoragesToDisk() { +libtextclassifier3::Status IntegerIndexStorage::PersistStoragesToDisk( + bool force) { + if (!force && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + ICING_RETURN_IF_ERROR(sorted_buckets_->PersistToDisk()); ICING_RETURN_IF_ERROR(unsorted_buckets_->PersistToDisk()); if (!flash_index_storage_->PersistToDisk()) { @@ -904,19 +929,35 @@ libtextclassifier3::Status IntegerIndexStorage::PersistStoragesToDisk() { return libtextclassifier3::Status::OK; } -libtextclassifier3::Status IntegerIndexStorage::PersistMetadataToDisk() { +libtextclassifier3::Status IntegerIndexStorage::PersistMetadataToDisk( + bool force) { + // We can skip persisting metadata to disk only if both info and storage are + // clean. + if (!force && !is_info_dirty() && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + // Changes should have been applied to the underlying file when using // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an // extra safety step to ensure they are written out. return metadata_mmapped_file_->PersistToDisk(); } -libtextclassifier3::StatusOr<Crc32> IntegerIndexStorage::ComputeInfoChecksum() { +libtextclassifier3::StatusOr<Crc32> IntegerIndexStorage::ComputeInfoChecksum( + bool force) { + if (!force && !is_info_dirty()) { + return Crc32(crcs().component_crcs.info_crc); + } + return info().ComputeChecksum(); } libtextclassifier3::StatusOr<Crc32> -IntegerIndexStorage::ComputeStoragesChecksum() { +IntegerIndexStorage::ComputeStoragesChecksum(bool force) { + if (!force && !is_storage_dirty()) { + return Crc32(crcs().component_crcs.storages_crc); + } + // Compute crcs ICING_ASSIGN_OR_RETURN(Crc32 sorted_buckets_crc, sorted_buckets_->ComputeChecksum()); @@ -933,6 +974,89 @@ IntegerIndexStorage::AddKeysIntoBucketAndSplitIfNecessary( const std::vector<int64_t>::const_iterator& it_start, const std::vector<int64_t>::const_iterator& it_end, FileBackedVector<Bucket>::MutableView& mutable_bucket) { + int32_t num_data_in_bucket = mutable_bucket.Get().num_data(); + int32_t num_new_data = std::distance(it_start, it_end); + if (mutable_bucket.Get().key_lower() < mutable_bucket.Get().key_upper() && + num_new_data + num_data_in_bucket > + options_.num_data_threshold_for_bucket_split) { + // Split bucket. + + // 1. Read all data and free all posting lists. + std::vector<IntegerIndexData> all_data; + if (mutable_bucket.Get().posting_list_identifier().is_valid()) { + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor, + PostingListIntegerIndexAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_serializer_, + mutable_bucket.Get().posting_list_identifier())); + ICING_ASSIGN_OR_RETURN(all_data, pl_accessor->GetAllDataAndFree()); + } + + // 2. Append all new data. + all_data.reserve(all_data.size() + num_new_data); + for (auto it = it_start; it != it_end; ++it) { + all_data.push_back(IntegerIndexData(section_id, document_id, *it)); + } + + // 3. Run bucket splitting algorithm to decide new buckets and dispatch + // data. + // - # of data in a full bucket = + // options_.num_data_threshold_for_bucket_split. + // - Bucket splitting logic will be invoked if adding new data + // (num_new_data >= 1) into a full bucket. + // - In order to achieve good (amortized) time complexity, we want # of + // data in new buckets to be around half_of_threshold (i.e. + // options_.num_data_threshold_for_bucket_split / 2). + // - Using half_of_threshold as the cutoff threshold will cause splitting + // buckets with [half_of_threshold, half_of_threshold, num_new_data] + // data, which is not ideal because num_new_data is usually small. + // - Thus, we pick (half_of_threshold + kNumDataAfterSplitAdjustment) as + // the cutoff threshold to avoid over-splitting. It can tolerate + // num_new_data up to (2 * kNumDataAfterSplitAdjustment) and + // split only 2 buckets (instead of 3) with + // [half_of_threshold + kNumDataAfterSplitAdjustment, + // half_of_threshold + (kNumDataAfterSplitAdjustment - num_new_data)]. + int32_t cutoff_threshold = + options_.num_data_threshold_for_bucket_split / 2 + + kNumDataAfterSplitAdjustment; + std::vector<integer_index_bucket_util::DataRangeAndBucketInfo> + new_bucket_infos = integer_index_bucket_util::Split( + all_data, mutable_bucket.Get().key_lower(), + mutable_bucket.Get().key_upper(), cutoff_threshold); + if (new_bucket_infos.empty()) { + ICING_LOG(WARNING) + << "No buckets after splitting. This should not happen."; + return absl_ports::InternalError("Split error"); + } + + // 4. Flush data and create new buckets. + std::vector<Bucket> new_buckets; + for (int i = 0; i < new_bucket_infos.size(); ++i) { + int32_t num_data_in_new_bucket = + std::distance(new_bucket_infos[i].start, new_bucket_infos[i].end); + ICING_ASSIGN_OR_RETURN( + PostingListIdentifier pl_id, + FlushDataIntoPostingLists( + flash_index_storage_.get(), posting_list_serializer_, + new_bucket_infos[i].start, new_bucket_infos[i].end)); + if (i == 0) { + // Reuse mutable_bucket + mutable_bucket.Get().set_key_lower(new_bucket_infos[i].key_lower); + mutable_bucket.Get().set_key_upper(new_bucket_infos[i].key_upper); + mutable_bucket.Get().set_posting_list_identifier(pl_id); + mutable_bucket.Get().set_num_data(num_data_in_new_bucket); + } else { + new_buckets.push_back(Bucket(new_bucket_infos[i].key_lower, + new_bucket_infos[i].key_upper, pl_id, + num_data_in_new_bucket)); + } + } + + return new_buckets; + } + + // Otherwise, we don't need to split bucket. Just simply add all new data into + // the bucket. std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor; if (mutable_bucket.Get().posting_list_identifier().is_valid()) { ICING_ASSIGN_OR_RETURN( @@ -946,68 +1070,6 @@ IntegerIndexStorage::AddKeysIntoBucketAndSplitIfNecessary( } for (auto it = it_start; it != it_end; ++it) { - if (mutable_bucket.Get().key_lower() < mutable_bucket.Get().key_upper() && - pl_accessor->WantsSplit()) { - // If the bucket needs split (max size and full) and is splittable, then - // we perform bucket splitting. - - // 1. Finalize the current posting list accessor. - PostingListAccessor::FinalizeResult result = - std::move(*pl_accessor).Finalize(); - if (!result.status.ok()) { - return result.status; - } - - // 2. Create another posting list accessor instance. Read all data and - // free all posting lists. - ICING_ASSIGN_OR_RETURN( - pl_accessor, - PostingListIntegerIndexAccessor::CreateFromExisting( - flash_index_storage_.get(), posting_list_serializer_, result.id)); - ICING_ASSIGN_OR_RETURN(std::vector<IntegerIndexData> all_data, - pl_accessor->GetAllDataAndFree()); - - // 3. Append all remaining new data. - all_data.reserve(all_data.size() + std::distance(it, it_end)); - for (; it != it_end; ++it) { - all_data.push_back(IntegerIndexData(section_id, document_id, *it)); - } - - // 4. Run bucket splitting algorithm to decide new buckets and dispatch - // data. - std::vector<integer_index_bucket_util::DataRangeAndBucketInfo> - new_bucket_infos = integer_index_bucket_util::Split( - all_data, mutable_bucket.Get().key_lower(), - mutable_bucket.Get().key_upper(), - kNumDataThresholdForBucketSplit); - if (new_bucket_infos.empty()) { - ICING_LOG(WARNING) - << "No buckets after splitting. This should not happen."; - return absl_ports::InternalError("Split error"); - } - - // 5. Flush data. - std::vector<Bucket> new_buckets; - for (int i = 0; i < new_bucket_infos.size(); ++i) { - ICING_ASSIGN_OR_RETURN( - PostingListIdentifier pl_id, - FlushDataIntoPostingLists( - flash_index_storage_.get(), posting_list_serializer_, - new_bucket_infos[i].start, new_bucket_infos[i].end)); - if (i == 0) { - // Reuse mutable_bucket - mutable_bucket.Get().set_key_lower(new_bucket_infos[i].key_lower); - mutable_bucket.Get().set_key_upper(new_bucket_infos[i].key_upper); - mutable_bucket.Get().set_posting_list_identifier(pl_id); - } else { - new_buckets.push_back(Bucket(new_bucket_infos[i].key_lower, - new_bucket_infos[i].key_upper, pl_id)); - } - } - - return new_buckets; - } - ICING_RETURN_IF_ERROR(pl_accessor->PrependData( IntegerIndexData(section_id, document_id, *it))); } @@ -1022,6 +1084,9 @@ IntegerIndexStorage::AddKeysIntoBucketAndSplitIfNecessary( } mutable_bucket.Get().set_posting_list_identifier(result.id); + // We've already verified num_new_data won't exceed the limit of the entire + // storage, so it is safe to add to the counter of the bucket. + mutable_bucket.Get().set_num_data(num_data_in_bucket + num_new_data); return std::vector<Bucket>(); } diff --git a/icing/index/numeric/integer-index-storage.h b/icing/index/numeric/integer-index-storage.h index 9f2e58c..0c1afbb 100644 --- a/icing/index/numeric/integer-index-storage.h +++ b/icing/index/numeric/integer-index-storage.h @@ -75,7 +75,7 @@ namespace lib { class IntegerIndexStorage : public PersistentStorage { public: struct Info { - static constexpr int32_t kMagic = 0xc4bf0ccc; + static constexpr int32_t kMagic = 0x6470e547; int32_t magic; int32_t num_data; @@ -99,10 +99,12 @@ class IntegerIndexStorage : public PersistentStorage { explicit Bucket(int64_t key_lower, int64_t key_upper, PostingListIdentifier posting_list_identifier = - PostingListIdentifier::kInvalid) + PostingListIdentifier::kInvalid, + int32_t num_data = 0) : key_lower_(key_lower), key_upper_(key_upper), - posting_list_identifier_(posting_list_identifier) {} + posting_list_identifier_(posting_list_identifier), + num_data_(num_data) {} bool operator<(const Bucket& other) const { return key_lower_ < other.key_lower_; @@ -130,12 +132,16 @@ class IntegerIndexStorage : public PersistentStorage { posting_list_identifier_ = posting_list_identifier; } + int32_t num_data() const { return num_data_; } + void set_num_data(int32_t num_data) { num_data_ = num_data; } + private: int64_t key_lower_; int64_t key_upper_; PostingListIdentifier posting_list_identifier_; + int32_t num_data_; } __attribute__((packed)); - static_assert(sizeof(Bucket) == 20, ""); + static_assert(sizeof(Bucket) == 24, ""); static_assert(sizeof(Bucket) == FileBackedVector<Bucket>::kElementTypeSize, "Bucket type size is inconsistent with FileBackedVector " "element type size"); @@ -146,15 +152,31 @@ class IntegerIndexStorage : public PersistentStorage { "Max # of buckets cannot fit into FileBackedVector"); struct Options { - explicit Options(bool pre_mapping_fbv_in) - : pre_mapping_fbv(pre_mapping_fbv_in) {} + // - According to the benchmark result, the more # of buckets, the higher + // latency for range query. Therefore, this number cannot be too small to + // avoid splitting bucket too aggressively. + // - We use `num_data_threshold_for_bucket_split / 2 + 5` as the cutoff + // threshold after splitting. This number cannot be too small (e.g. 10) + // because in this case we will have similar # of data in a single bucket + // before and after splitting, which contradicts the purpose of splitting. + // - For convenience, let's set 64 as the minimum value. + static constexpr int32_t kMinNumDataThresholdForBucketSplit = 64; + + explicit Options(int32_t num_data_threshold_for_bucket_split_in, + bool pre_mapping_fbv_in) + : num_data_threshold_for_bucket_split( + num_data_threshold_for_bucket_split_in), + pre_mapping_fbv(pre_mapping_fbv_in) {} explicit Options(std::vector<Bucket> custom_init_sorted_buckets_in, std::vector<Bucket> custom_init_unsorted_buckets_in, + int32_t num_data_threshold_for_bucket_split_in, bool pre_mapping_fbv_in) : custom_init_sorted_buckets(std::move(custom_init_sorted_buckets_in)), custom_init_unsorted_buckets( std::move(custom_init_unsorted_buckets_in)), + num_data_threshold_for_bucket_split( + num_data_threshold_for_bucket_split_in), pre_mapping_fbv(pre_mapping_fbv_in) {} bool IsValid() const; @@ -172,6 +194,14 @@ class IntegerIndexStorage : public PersistentStorage { std::vector<Bucket> custom_init_sorted_buckets; std::vector<Bucket> custom_init_unsorted_buckets; + // Threshold for invoking bucket splitting. If # of data in a bucket exceeds + // this number after adding new data, then it will invoke bucket splitting + // logic. + // + // Note: num_data_threshold_for_bucket_split should be >= + // kMinNumDataThresholdForBucketSplit. + int32_t num_data_threshold_for_bucket_split; + // Flag indicating whether memory map max possible file size for underlying // FileBackedVector before growing the actual file size. bool pre_mapping_fbv; @@ -188,28 +218,25 @@ class IntegerIndexStorage : public PersistentStorage { WorkingPathType::kDirectory; static constexpr std::string_view kFilePrefix = "integer_index_storage"; - // # of data threshold for bucket merging during optimization (TransferIndex). - // If total # data of adjacent buckets exceed this value, then flush the - // accumulated data. Otherwise merge buckets and their data. - // - // Calculated by: 0.7 * (kMaxPostingListSize / sizeof(IntegerIndexData)), - // where kMaxPostingListSize = (kPageSize - sizeof(IndexBlock::BlockHeader)). - static constexpr int32_t kNumDataThresholdForBucketMerge = 240; - - // # of data threshold for bucket splitting during indexing (AddKeys). - // When the posting list of a bucket is full, we will try to split data into - // multiple buckets according to their keys. In order to achieve good - // (amortized) time complexity, we want # of data in new buckets to be at most - // half # of elements in a full posting list. + // Default # of data threshold for bucket splitting during indexing (AddKeys). + // When # of data in a bucket reaches this number, we will try to split data + // into multiple buckets according to their keys. + static constexpr int32_t kDefaultNumDataThresholdForBucketSplit = 65536; + + // # of data threshold for bucket merging during optimization (TransferIndex) + // = kNumDataThresholdRatioForBucketMerge * + // options.num_data_threshold_for_bucket_split // - // Calculated by: 0.5 * (kMaxPostingListSize / sizeof(IntegerIndexData)), - // where kMaxPostingListSize = (kPageSize - sizeof(IndexBlock::BlockHeader)). - static constexpr int32_t kNumDataThresholdForBucketSplit = 170; + // If total # data of adjacent buckets exceed this threshold, then flush the + // accumulated data. Otherwise merge buckets and their data. + static constexpr double kNumDataThresholdRatioForBucketMerge = 0.7; // Length threshold to sort and merge unsorted buckets into sorted buckets. If // the length of unsorted_buckets exceed the threshold, then call // SortBuckets(). - static constexpr int32_t kUnsortedBucketsLengthThreshold = 50; + // TODO(b/259743562): decide if removing unsorted buckets given that we + // changed bucket splitting threshold and # of buckets are small now. + static constexpr int32_t kUnsortedBucketsLengthThreshold = 5; // Creates a new IntegerIndexStorage instance to index integers (for a single // property). If any of the underlying file is missing, then delete the whole @@ -272,6 +299,8 @@ class IntegerIndexStorage : public PersistentStorage { // // Returns: // - OK on success + // - RESOURCE_EXHAUSTED_ERROR if # of integers in this storage exceed + // INT_MAX after adding new_keys // - Any FileBackedVector or PostingList errors libtextclassifier3::Status AddKeys(DocumentId document_id, SectionId section_id, @@ -314,6 +343,8 @@ class IntegerIndexStorage : public PersistentStorage { int32_t num_data() const { return info().num_data; } private: + static constexpr int8_t kNumDataAfterSplitAdjustment = 5; + explicit IntegerIndexStorage( const Filesystem& filesystem, std::string&& working_path, Options&& options, @@ -329,7 +360,9 @@ class IntegerIndexStorage : public PersistentStorage { metadata_mmapped_file_(std::move(metadata_mmapped_file)), sorted_buckets_(std::move(sorted_buckets)), unsorted_buckets_(std::move(unsorted_buckets)), - flash_index_storage_(std::move(flash_index_storage)) {} + flash_index_storage_(std::move(flash_index_storage)), + is_info_dirty_(false), + is_storage_dirty_(false) {} static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> InitializeNewFiles( @@ -360,20 +393,20 @@ class IntegerIndexStorage : public PersistentStorage { // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistStoragesToDisk() override; + libtextclassifier3::Status PersistStoragesToDisk(bool force) override; // Flushes contents of metadata file. // // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistMetadataToDisk() override; + libtextclassifier3::Status PersistMetadataToDisk(bool force) override; // Computes and returns Info checksum. // // Returns: // - Crc of the Info on success - libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override; // Computes and returns all storages checksum. Checksums of sorted_buckets_, // unsorted_buckets_ will be combined together by XOR. @@ -382,7 +415,8 @@ class IntegerIndexStorage : public PersistentStorage { // Returns: // - Crc of all storages on success // - INTERNAL_ERROR if any data inconsistency - libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) override; // Helper function to add keys in range [it_start, it_end) into the given // bucket. It handles the bucket and its corresponding posting list(s) to make @@ -442,6 +476,17 @@ class IntegerIndexStorage : public PersistentStorage { kInfoMetadataFileOffset); } + void SetInfoDirty() { is_info_dirty_ = true; } + // When storage is dirty, we have to set info dirty as well. So just expose + // SetDirty to set both. + void SetDirty() { + is_info_dirty_ = true; + is_storage_dirty_ = true; + } + + bool is_info_dirty() const { return is_info_dirty_; } + bool is_storage_dirty() const { return is_storage_dirty_; } + Options options_; PostingListIntegerIndexSerializer* posting_list_serializer_; // Does not own. @@ -450,6 +495,9 @@ class IntegerIndexStorage : public PersistentStorage { std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets_; std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets_; std::unique_ptr<FlashIndexStorage> flash_index_storage_; + + bool is_info_dirty_; + bool is_storage_dirty_; }; } // namespace lib diff --git a/icing/index/numeric/integer-index-storage_benchmark.cc b/icing/index/numeric/integer-index-storage_benchmark.cc index bf5f134..85d381d 100644 --- a/icing/index/numeric/integer-index-storage_benchmark.cc +++ b/icing/index/numeric/integer-index-storage_benchmark.cc @@ -68,6 +68,8 @@ using ::testing::Eq; using ::testing::IsEmpty; using ::testing::SizeIs; +static constexpr int32_t kNumDataThresholdForBucketSplit = + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit; static constexpr bool kPreMappingFbv = true; static constexpr SectionId kDefaultSectionId = 12; @@ -150,11 +152,13 @@ void BM_Index(benchmark::State& state) { state.PauseTiming(); benchmark.filesystem.DeleteDirectoryRecursively( benchmark.working_path.c_str()); - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create( - benchmark.filesystem, benchmark.working_path, - IntegerIndexStorage::Options(kPreMappingFbv), - &benchmark.posting_list_serializer)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + benchmark.filesystem, benchmark.working_path, + IntegerIndexStorage::Options(kNumDataThresholdForBucketSplit, + kPreMappingFbv), + &benchmark.posting_list_serializer)); state.ResumeTiming(); for (int i = 0; i < num_keys; ++i) { @@ -210,11 +214,13 @@ void BM_BatchIndex(benchmark::State& state) { state.PauseTiming(); benchmark.filesystem.DeleteDirectoryRecursively( benchmark.working_path.c_str()); - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create( - benchmark.filesystem, benchmark.working_path, - IntegerIndexStorage::Options(kPreMappingFbv), - &benchmark.posting_list_serializer)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + benchmark.filesystem, benchmark.working_path, + IntegerIndexStorage::Options(kNumDataThresholdForBucketSplit, + kPreMappingFbv), + &benchmark.posting_list_serializer)); std::vector<int64_t> keys_copy(keys); state.ResumeTiming(); @@ -263,9 +269,11 @@ void BM_ExactQuery(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(benchmark.filesystem, benchmark.working_path, - IntegerIndexStorage::Options(kPreMappingFbv), - &benchmark.posting_list_serializer)); + IntegerIndexStorage::Create( + benchmark.filesystem, benchmark.working_path, + IntegerIndexStorage::Options(kNumDataThresholdForBucketSplit, + kPreMappingFbv), + &benchmark.posting_list_serializer)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumberGenerator<int64_t>> generator, CreateIntegerGenerator(distribution_type, kDefaultSeed, num_keys)); @@ -340,9 +348,11 @@ void BM_RangeQueryAll(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(benchmark.filesystem, benchmark.working_path, - IntegerIndexStorage::Options(kPreMappingFbv), - &benchmark.posting_list_serializer)); + IntegerIndexStorage::Create( + benchmark.filesystem, benchmark.working_path, + IntegerIndexStorage::Options(kNumDataThresholdForBucketSplit, + kPreMappingFbv), + &benchmark.posting_list_serializer)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumberGenerator<int64_t>> generator, CreateIntegerGenerator(distribution_type, kDefaultSeed, num_keys)); diff --git a/icing/index/numeric/integer-index-storage_test.cc b/icing/index/numeric/integer-index-storage_test.cc index 4d4e665..8675172 100644 --- a/icing/index/numeric/integer-index-storage_test.cc +++ b/icing/index/numeric/integer-index-storage_test.cc @@ -30,8 +30,6 @@ #include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" #include "icing/file/persistent-storage.h" -#include "icing/file/posting_list/flash-index-storage.h" -#include "icing/file/posting_list/index-block.h" #include "icing/file/posting_list/posting-list-identifier.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -106,7 +104,32 @@ libtextclassifier3::StatusOr<std::vector<DocHitInfo>> Query( } TEST_P(IntegerIndexStorageTest, OptionsEmptyCustomInitBucketsShouldBeValid) { - EXPECT_THAT(Options(/*pre_mapping_fbv_in=*/GetParam()).IsValid(), IsTrue()); + EXPECT_THAT( + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsTrue()); +} + +TEST_P(IntegerIndexStorageTest, OptionsInvalidNumDataThresholdForBucketSplit) { + EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{}, + /*num_data_threshold_for_bucket_split=*/-1, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); + EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{}, + /*num_data_threshold_for_bucket_split=*/0, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); + EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{}, + /*num_data_threshold_for_bucket_split=*/63, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); } TEST_P(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsRange) { @@ -116,6 +139,7 @@ TEST_P(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsRange) { {Bucket(std::numeric_limits<int64_t>::min(), 5), Bucket(9, 6)}, /*custom_init_unsorted_buckets_in=*/ {Bucket(10, std::numeric_limits<int64_t>::max())}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()) .IsValid(), IsFalse()); @@ -126,6 +150,7 @@ TEST_P(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsRange) { {Bucket(10, std::numeric_limits<int64_t>::max())}, /*custom_init_unsorted_buckets_in=*/ {Bucket(std::numeric_limits<int64_t>::min(), 5), Bucket(9, 6)}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()) .IsValid(), IsFalse()); @@ -138,91 +163,109 @@ TEST_P(IntegerIndexStorageTest, ASSERT_THAT(valid_posting_list_identifier.is_valid(), IsTrue()); // Invalid custom init sorted bucket - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ - {Bucket(std::numeric_limits<int64_t>::min(), - std::numeric_limits<int64_t>::max(), - valid_posting_list_identifier)}, - /*custom_init_unsorted_buckets_in=*/{}, - /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), + std::numeric_limits<int64_t>::max(), + valid_posting_list_identifier)}, + /*custom_init_unsorted_buckets_in=*/{}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); // Invalid custom init unsorted bucket - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{}, - /*custom_init_unsorted_buckets_in=*/ - {Bucket(std::numeric_limits<int64_t>::min(), - std::numeric_limits<int64_t>::max(), - valid_posting_list_identifier)}, - /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), + std::numeric_limits<int64_t>::max(), + valid_posting_list_identifier)}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); } TEST_P(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsOverlapping) { // sorted buckets overlap - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ - {Bucket(std::numeric_limits<int64_t>::min(), -100), - Bucket(-100, std::numeric_limits<int64_t>::max())}, - /*custom_init_unsorted_buckets_in=*/{}, - /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), -100), + Bucket(-100, std::numeric_limits<int64_t>::max())}, + /*custom_init_unsorted_buckets_in=*/{}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); // unsorted buckets overlap - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{}, - /*custom_init_unsorted_buckets_in=*/ - {Bucket(-100, std::numeric_limits<int64_t>::max()), - Bucket(std::numeric_limits<int64_t>::min(), -100)}, - /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(-100, std::numeric_limits<int64_t>::max()), + Bucket(std::numeric_limits<int64_t>::min(), -100)}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); // Cross buckets overlap - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ - {Bucket(std::numeric_limits<int64_t>::min(), -100), - Bucket(-99, 0)}, - /*custom_init_unsorted_buckets_in=*/ - {Bucket(200, std::numeric_limits<int64_t>::max()), - Bucket(0, 50), Bucket(51, 199)}, - /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), -100), + Bucket(-99, 0)}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(200, std::numeric_limits<int64_t>::max()), Bucket(0, 50), + Bucket(51, 199)}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); } TEST_P(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsUnion) { // Missing INT64_MAX - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ - {Bucket(std::numeric_limits<int64_t>::min(), -100), - Bucket(-99, 0)}, - /*custom_init_unsorted_buckets_in=*/ - {Bucket(1, 1000)}, /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), -100), + Bucket(-99, 0)}, + /*custom_init_unsorted_buckets_in=*/{Bucket(1, 1000)}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); // Missing INT64_MIN - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ - {Bucket(-200, -100), Bucket(-99, 0)}, - /*custom_init_unsorted_buckets_in=*/ - {Bucket(1, std::numeric_limits<int64_t>::max())}, - /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(-200, -100), Bucket(-99, 0)}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(1, std::numeric_limits<int64_t>::max())}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); // Missing some intermediate ranges - EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/ - {Bucket(std::numeric_limits<int64_t>::min(), -100)}, - /*custom_init_unsorted_buckets_in=*/ - {Bucket(1, std::numeric_limits<int64_t>::max())}, - /*pre_mapping_fbv_in=*/GetParam()) - .IsValid(), - IsFalse()); + EXPECT_THAT( + Options(/*custom_init_sorted_buckets_in=*/ + {Bucket(std::numeric_limits<int64_t>::min(), -100)}, + /*custom_init_unsorted_buckets_in=*/ + {Bucket(1, std::numeric_limits<int64_t>::max())}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()) + .IsValid(), + IsFalse()); } TEST_P(IntegerIndexStorageTest, InvalidWorkingPath) { EXPECT_THAT( IntegerIndexStorage::Create( filesystem_, "/dev/null/integer_index_storage_test", - Options(/*pre_mapping_fbv_in=*/GetParam()), serializer_.get()), + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get()), StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } @@ -232,6 +275,7 @@ TEST_P(IntegerIndexStorageTest, CreateWithInvalidOptionsShouldFail) { /*custom_init_unsorted_buckets_in=*/ {Bucket(-100, std::numeric_limits<int64_t>::max()), Bucket(std::numeric_limits<int64_t>::min(), -100)}, + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()); ASSERT_THAT(invalid_options.IsValid(), IsFalse()); @@ -246,9 +290,11 @@ TEST_P(IntegerIndexStorageTest, InitializeNewFiles) { ASSERT_FALSE(filesystem_.DirectoryExists(working_path_.c_str())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); ICING_ASSERT_OK(storage->PersistToDisk()); } @@ -290,9 +336,11 @@ TEST_P(IntegerIndexStorageTest, // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); // Insert some data. ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/0, /*section_id=*/20, @@ -305,9 +353,11 @@ TEST_P(IntegerIndexStorageTest, // Without calling PersistToDisk, checksums will not be recomputed or synced // to disk, so initializing another instance on the same files should fail. EXPECT_THAT( - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get()), + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get()), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } @@ -315,9 +365,11 @@ TEST_P(IntegerIndexStorageTest, InitializationShouldSucceedWithPersistToDisk) { // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage1, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); // Insert some data. ICING_ASSERT_OK(storage1->AddKeys(/*document_id=*/0, /*section_id=*/20, @@ -339,9 +391,11 @@ TEST_P(IntegerIndexStorageTest, InitializationShouldSucceedWithPersistToDisk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage2, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT( Query(storage2.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(), /*key_upper=*/std::numeric_limits<int64_t>::max()), @@ -355,9 +409,11 @@ TEST_P(IntegerIndexStorageTest, InitializationShouldSucceedAfterDestruction) { // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); // Insert some data. ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/0, /*section_id=*/20, @@ -380,9 +436,11 @@ TEST_P(IntegerIndexStorageTest, InitializationShouldSucceedAfterDestruction) { // we should be able to get the same contents. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT( Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(), /*key_upper=*/std::numeric_limits<int64_t>::max()), @@ -397,9 +455,11 @@ TEST_P(IntegerIndexStorageTest, // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId, /*new_keys=*/{0, 100, -100})); @@ -428,7 +488,9 @@ TEST_P(IntegerIndexStorageTest, libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> storage_or = IntegerIndexStorage::Create( filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), serializer_.get()); + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get()); EXPECT_THAT(storage_or, StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(storage_or.status().error_message(), @@ -442,9 +504,11 @@ TEST_P(IntegerIndexStorageTest, // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId, /*new_keys=*/{0, 100, -100})); @@ -474,7 +538,9 @@ TEST_P(IntegerIndexStorageTest, libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> storage_or = IntegerIndexStorage::Create( filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), serializer_.get()); + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get()); EXPECT_THAT(storage_or, StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(storage_or.status().error_message(), @@ -488,9 +554,11 @@ TEST_P(IntegerIndexStorageTest, // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId, /*new_keys=*/{0, 100, -100})); @@ -522,7 +590,9 @@ TEST_P(IntegerIndexStorageTest, libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> storage_or = IntegerIndexStorage::Create( filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), serializer_.get()); + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get()); EXPECT_THAT(storage_or, StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(storage_or.status().error_message(), @@ -536,9 +606,11 @@ TEST_P(IntegerIndexStorageTest, // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId, /*new_keys=*/{0, 100, -100})); @@ -572,7 +644,9 @@ TEST_P(IntegerIndexStorageTest, libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> storage_or = IntegerIndexStorage::Create( filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), serializer_.get()); + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get()); EXPECT_THAT(storage_or, StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(storage_or.status().error_message(), @@ -586,14 +660,119 @@ TEST_P(IntegerIndexStorageTest, InvalidQuery) { // Create new integer index storage ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT( storage->GetIterator(/*query_key_lower=*/0, /*query_key_upper=*/-1), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } +TEST_P(IntegerIndexStorageTest, AddKeysShouldUpdateNumData) { + // We use predefined custom buckets to initialize new integer index storage + // and create some test keys accordingly. + std::vector<Bucket> custom_init_sorted_buckets = { + Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300), + Bucket(301, 999)}; + std::vector<Bucket> custom_init_unsorted_buckets = { + Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1), + Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)}; + { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(std::move(custom_init_sorted_buckets), + std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); + + // Add some keys into buckets [(-1000,-100), (200,300), (-99,-1)]. + EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId, + /*new_keys=*/{-51, -500}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId, + /*new_keys=*/{201, 209, -149}), + IsOk()); + EXPECT_THAT(storage->AddKeys(/*document_id=*/2, kDefaultSectionId, + /*new_keys=*/{208}), + IsOk()); + EXPECT_THAT(storage->num_data(), Eq(6)); + + ICING_ASSERT_OK(storage->PersistToDisk()); + } + + // Check sorted_buckets manually. + const std::string sorted_buckets_file_path = absl_ports::StrCat( + working_path_, "/", IntegerIndexStorage::kFilePrefix, ".s"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem_, sorted_buckets_file_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + EXPECT_THAT(sorted_buckets->num_elements(), Eq(5)); + + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk1, + sorted_buckets->Get(/*idx=*/0)); + EXPECT_THAT(sbk1->key_lower(), Eq(-1000)); + EXPECT_THAT(sbk1->key_upper(), Eq(-100)); + EXPECT_THAT(sbk1->num_data(), Eq(2)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk2, + sorted_buckets->Get(/*idx=*/1)); + EXPECT_THAT(sbk2->key_lower(), Eq(0)); + EXPECT_THAT(sbk2->key_upper(), Eq(100)); + EXPECT_THAT(sbk2->num_data(), Eq(0)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk3, + sorted_buckets->Get(/*idx=*/2)); + EXPECT_THAT(sbk3->key_lower(), Eq(150)); + EXPECT_THAT(sbk3->key_upper(), Eq(199)); + EXPECT_THAT(sbk3->num_data(), Eq(0)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk4, + sorted_buckets->Get(/*idx=*/3)); + EXPECT_THAT(sbk4->key_lower(), Eq(200)); + EXPECT_THAT(sbk4->key_upper(), Eq(300)); + EXPECT_THAT(sbk4->num_data(), Eq(3)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk5, + sorted_buckets->Get(/*idx=*/4)); + EXPECT_THAT(sbk5->key_lower(), Eq(301)); + EXPECT_THAT(sbk5->key_upper(), Eq(999)); + EXPECT_THAT(sbk5->num_data(), Eq(0)); + + // Check unsorted_buckets and unsorted buckets manually. + const std::string unsorted_buckets_file_path = absl_ports::StrCat( + working_path_, "/", IntegerIndexStorage::kFilePrefix, ".u"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem_, unsorted_buckets_file_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + EXPECT_THAT(unsorted_buckets->num_elements(), Eq(4)); + + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* ubk1, + unsorted_buckets->Get(/*idx=*/0)); + EXPECT_THAT(ubk1->key_lower(), Eq(1000)); + EXPECT_THAT(ubk1->key_upper(), Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT(ubk1->num_data(), Eq(0)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* ubk2, + unsorted_buckets->Get(/*idx=*/1)); + EXPECT_THAT(ubk2->key_lower(), Eq(-99)); + EXPECT_THAT(ubk2->key_upper(), Eq(-1)); + EXPECT_THAT(ubk2->num_data(), Eq(1)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* ubk3, + unsorted_buckets->Get(/*idx=*/2)); + EXPECT_THAT(ubk3->key_lower(), Eq(101)); + EXPECT_THAT(ubk3->key_upper(), Eq(149)); + EXPECT_THAT(ubk3->num_data(), Eq(0)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* ubk4, + unsorted_buckets->Get(/*idx=*/3)); + EXPECT_THAT(ubk4->key_lower(), Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(ubk4->key_upper(), Eq(-1001)); + EXPECT_THAT(ubk4->num_data(), Eq(0)); +} + TEST_P(IntegerIndexStorageTest, ExactQuerySortedBuckets) { // We use predefined custom buckets to initialize new integer index storage // and create some test keys accordingly. @@ -609,6 +788,7 @@ TEST_P(IntegerIndexStorageTest, ExactQuerySortedBuckets) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -664,6 +844,7 @@ TEST_P(IntegerIndexStorageTest, ExactQueryUnsortedBuckets) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -725,6 +906,7 @@ TEST_P(IntegerIndexStorageTest, ExactQueryIdenticalKeys) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -768,6 +950,7 @@ TEST_P(IntegerIndexStorageTest, RangeQueryEmptyIntegerIndexStorage) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -792,6 +975,7 @@ TEST_P(IntegerIndexStorageTest, RangeQuerySingleEntireSortedBucket) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -847,6 +1031,7 @@ TEST_P(IntegerIndexStorageTest, RangeQuerySingleEntireUnsortedBucket) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -905,6 +1090,7 @@ TEST_P(IntegerIndexStorageTest, RangeQuerySinglePartialSortedBucket) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -957,6 +1143,7 @@ TEST_P(IntegerIndexStorageTest, RangeQuerySinglePartialUnsortedBucket) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1009,6 +1196,7 @@ TEST_P(IntegerIndexStorageTest, RangeQueryMultipleBuckets) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1098,6 +1286,7 @@ TEST_P(IntegerIndexStorageTest, BatchAdd) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1126,9 +1315,11 @@ TEST_P(IntegerIndexStorageTest, BatchAdd) { TEST_P(IntegerIndexStorageTest, BatchAddShouldDedupeKeys) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); std::vector<int64_t> keys = {2, 3, 1, 2, 4, -1, -1, 100, 3}; EXPECT_THAT( @@ -1152,6 +1343,7 @@ TEST_P(IntegerIndexStorageTest, MultipleKeysShouldMergeAndDedupeDocHitInfo) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1188,6 +1380,7 @@ TEST_P(IntegerIndexStorageTest, filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1236,25 +1429,26 @@ TEST_P(IntegerIndexStorageTest, } TEST_P(IntegerIndexStorageTest, SplitBuckets) { + int32_t custom_num_data_threshold_for_bucket_split = 300; + ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); - - uint32_t block_size = FlashIndexStorage::SelectBlockSize(); - uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( - block_size, serializer_->GetDataTypeBytes()); - uint32_t max_num_data_before_split = - max_posting_list_bytes / serializer_->GetDataTypeBytes(); - - // Add max_num_data_before_split + 1 keys to invoke bucket splitting. - // Keys: max_num_data_before_split to 0 - // Document ids: 0 to max_num_data_before_split + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{}, + custom_num_data_threshold_for_bucket_split, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); + + // Add custom_num_data_threshold_for_bucket_split + 1 keys to invoke bucket + // splitting. + // - Keys: custom_num_data_threshold_for_bucket_split to 0 Document + // - ids: 0 to custom_num_data_threshold_for_bucket_split std::unordered_map<int64_t, DocumentId> data; - int64_t key = max_num_data_before_split; + int64_t key = custom_num_data_threshold_for_bucket_split; DocumentId document_id = 0; - for (int i = 0; i < max_num_data_before_split + 1; ++i) { + for (int i = 0; i < custom_num_data_threshold_for_bucket_split + 1; ++i) { data[key] = document_id; ICING_ASSERT_OK( storage->AddKeys(document_id, kDefaultSectionId, /*new_keys=*/{key})); @@ -1299,7 +1493,8 @@ TEST_P(IntegerIndexStorageTest, SplitBuckets) { // Ensure that search works normally. std::vector<SectionId> expected_sections = {kDefaultSectionId}; - for (int64_t key = max_num_data_before_split; key >= 0; key--) { + for (int64_t key = custom_num_data_threshold_for_bucket_split; key >= 0; + key--) { ASSERT_THAT(data, Contains(Key(key))); DocumentId expected_document_id = data[key]; EXPECT_THAT(Query(storage.get(), /*key_lower=*/key, /*key_upper=*/key), @@ -1309,20 +1504,21 @@ TEST_P(IntegerIndexStorageTest, SplitBuckets) { } TEST_P(IntegerIndexStorageTest, SplitBucketsTriggerSortBuckets) { + int32_t custom_num_data_threshold_for_bucket_split = 300; + ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); - - uint32_t block_size = FlashIndexStorage::SelectBlockSize(); - uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( - block_size, serializer_->GetDataTypeBytes()); - uint32_t max_num_data_before_split = - max_posting_list_bytes / serializer_->GetDataTypeBytes(); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{}, + custom_num_data_threshold_for_bucket_split, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); // Add IntegerIndexStorage::kUnsortedBucketsLengthThreshold keys. For each - // key, add max_num_data_before_split + 1 data. Then we will get: + // key, add custom_num_data_threshold_for_bucket_split + 1 data. Then we will + // get: // - Bucket splitting will create kUnsortedBucketsLengthThreshold + 1 unsorted // buckets [[50, 50], [49, 49], ..., [1, 1], [51, INT64_MAX]]. // - Since there are kUnsortedBucketsLengthThreshold + 1 unsorted buckets, we @@ -1332,7 +1528,7 @@ TEST_P(IntegerIndexStorageTest, SplitBucketsTriggerSortBuckets) { DocumentId document_id = 0; for (int i = 0; i < IntegerIndexStorage::kUnsortedBucketsLengthThreshold; ++i) { - for (int j = 0; j < max_num_data_before_split + 1; ++j) { + for (int j = 0; j < custom_num_data_threshold_for_bucket_split + 1; ++j) { data[key].push_back(document_id); ICING_ASSERT_OK( storage->AddKeys(document_id, kDefaultSectionId, /*new_keys=*/{key})); @@ -1396,6 +1592,7 @@ TEST_P(IntegerIndexStorageTest, TransferIndex) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1433,9 +1630,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndex) { { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, working_path_ + "_temp", - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_ + "_temp", + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT( storage->TransferIndex(document_id_old_to_new, new_storage.get()), IsOk()); @@ -1445,9 +1644,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndex) { // Verify after transferring and reinitializing the instance. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, working_path_ + "_temp", - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_ + "_temp", + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); std::vector<SectionId> expected_sections = {kDefaultSectionId}; EXPECT_THAT(new_storage->num_data(), Eq(7)); @@ -1493,9 +1694,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndex) { TEST_P(IntegerIndexStorageTest, TransferIndexOutOfRangeDocumentId) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create(filesystem_, working_path_, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_, + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/1, kDefaultSectionId, /*new_keys=*/{120})); @@ -1510,9 +1713,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndexOutOfRangeDocumentId) { // Transfer to new storage. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, working_path_ + "_temp", - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_ + "_temp", + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT(storage->TransferIndex(document_id_old_to_new, new_storage.get()), IsOk()); @@ -1542,6 +1747,7 @@ TEST_P(IntegerIndexStorageTest, TransferEmptyIndex) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); ASSERT_THAT(storage->num_data(), Eq(0)); @@ -1552,9 +1758,11 @@ TEST_P(IntegerIndexStorageTest, TransferEmptyIndex) { // Transfer to new storage. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, working_path_ + "_temp", - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_ + "_temp", + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT(storage->TransferIndex(document_id_old_to_new, new_storage.get()), IsOk()); @@ -1581,6 +1789,7 @@ TEST_P(IntegerIndexStorageTest, TransferIndexDeleteAll) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1605,9 +1814,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndexDeleteAll) { { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, working_path_ + "_temp", - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_ + "_temp", + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT( storage->TransferIndex(document_id_old_to_new, new_storage.get()), IsOk()); @@ -1617,9 +1828,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndexDeleteAll) { // Verify after transferring and reinitializing the instance. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, working_path_ + "_temp", - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, working_path_ + "_temp", + Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); std::vector<SectionId> expected_sections = {kDefaultSectionId}; EXPECT_THAT(new_storage->num_data(), Eq(0)); @@ -1630,6 +1843,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndexDeleteAll) { } TEST_P(IntegerIndexStorageTest, TransferIndexShouldInvokeMergeBuckets) { + int32_t custom_num_data_threshold_for_bucket_split = 300; + int32_t custom_num_data_threshold_for_bucket_merge = + IntegerIndexStorage::kNumDataThresholdRatioForBucketMerge * + custom_num_data_threshold_for_bucket_split; + // This test verifies that if TransferIndex invokes bucket merging logic to // ensure sure we're able to avoid having mostly empty buckets after inserting // and deleting data for many rounds. @@ -1648,6 +1866,7 @@ TEST_P(IntegerIndexStorageTest, TransferIndexShouldInvokeMergeBuckets) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + custom_num_data_threshold_for_bucket_split, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); @@ -1671,7 +1890,7 @@ TEST_P(IntegerIndexStorageTest, TransferIndexShouldInvokeMergeBuckets) { /*new_keys=*/{20})); ASSERT_THAT(storage->num_data(), Eq(9)); ASSERT_THAT(storage->num_data(), - Le(IntegerIndexStorage::kNumDataThresholdForBucketMerge)); + Le(custom_num_data_threshold_for_bucket_merge)); // Create document_id_old_to_new that keeps all existing documents. std::vector<DocumentId> document_id_old_to_new(9); @@ -1683,12 +1902,17 @@ TEST_P(IntegerIndexStorageTest, TransferIndexShouldInvokeMergeBuckets) { { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, new_storage_working_path, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, new_storage_working_path, + Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{}, + custom_num_data_threshold_for_bucket_split, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT( storage->TransferIndex(document_id_old_to_new, new_storage.get()), IsOk()); + EXPECT_THAT(new_storage->num_data(), Eq(9)); } // Check new_storage->sorted_bucket_ manually. @@ -1704,9 +1928,15 @@ TEST_P(IntegerIndexStorageTest, TransferIndexShouldInvokeMergeBuckets) { ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bk1, sorted_buckets->Get(/*idx=*/0)); EXPECT_THAT(bk1->key_lower(), Eq(std::numeric_limits<int64_t>::min())); EXPECT_THAT(bk1->key_upper(), Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT(bk1->num_data(), Eq(9)); } TEST_P(IntegerIndexStorageTest, TransferIndexExceedsMergeThreshold) { + int32_t custom_num_data_threshold_for_bucket_split = 300; + int32_t custom_num_data_threshold_for_bucket_merge = + IntegerIndexStorage::kNumDataThresholdRatioForBucketMerge * + custom_num_data_threshold_for_bucket_split; + // This test verifies that if TransferIndex invokes bucket merging logic and // doesn't merge buckets too aggressively to ensure we won't get a bucket with // too many data. @@ -1725,15 +1955,16 @@ TEST_P(IntegerIndexStorageTest, TransferIndexExceedsMergeThreshold) { filesystem_, working_path_, Options(std::move(custom_init_sorted_buckets), std::move(custom_init_unsorted_buckets), + custom_num_data_threshold_for_bucket_split, /*pre_mapping_fbv_in=*/GetParam()), serializer_.get())); // Insert data into 2 buckets so that total # of these 2 buckets exceed - // kNumDataThresholdForBucketMerge. + // custom_num_data_threshold_for_bucket_merge. // - Bucket 1: [-1000, -100] // - Bucket 2: [101, 149] DocumentId document_id = 0; - int num_data_for_bucket1 = 200; + int num_data_for_bucket1 = custom_num_data_threshold_for_bucket_merge - 50; for (int i = 0; i < num_data_for_bucket1; ++i) { ICING_ASSERT_OK(storage->AddKeys(document_id, kDefaultSectionId, /*new_keys=*/{-200})); @@ -1747,8 +1978,10 @@ TEST_P(IntegerIndexStorageTest, TransferIndexExceedsMergeThreshold) { ++document_id; } + ASSERT_THAT(storage->num_data(), + Eq(num_data_for_bucket1 + num_data_for_bucket2)); ASSERT_THAT(num_data_for_bucket1 + num_data_for_bucket2, - Gt(IntegerIndexStorage::kNumDataThresholdForBucketMerge)); + Gt(custom_num_data_threshold_for_bucket_merge)); // Create document_id_old_to_new that keeps all existing documents. std::vector<DocumentId> document_id_old_to_new(document_id); @@ -1760,12 +1993,18 @@ TEST_P(IntegerIndexStorageTest, TransferIndexExceedsMergeThreshold) { { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create(filesystem_, new_storage_working_path, - Options(/*pre_mapping_fbv_in=*/GetParam()), - serializer_.get())); + IntegerIndexStorage::Create( + filesystem_, new_storage_working_path, + Options(/*custom_init_sorted_buckets_in=*/{}, + /*custom_init_unsorted_buckets_in=*/{}, + custom_num_data_threshold_for_bucket_split, + /*pre_mapping_fbv_in=*/GetParam()), + serializer_.get())); EXPECT_THAT( storage->TransferIndex(document_id_old_to_new, new_storage.get()), IsOk()); + EXPECT_THAT(new_storage->num_data(), + Eq(num_data_for_bucket1 + num_data_for_bucket2)); } // Check new_storage->sorted_bucket_ manually. @@ -1781,9 +2020,11 @@ TEST_P(IntegerIndexStorageTest, TransferIndexExceedsMergeThreshold) { ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bk1, sorted_buckets->Get(/*idx=*/0)); EXPECT_THAT(bk1->key_lower(), Eq(std::numeric_limits<int64_t>::min())); EXPECT_THAT(bk1->key_upper(), Eq(100)); + EXPECT_THAT(bk1->num_data(), Eq(num_data_for_bucket1)); ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bk2, sorted_buckets->Get(/*idx=*/1)); EXPECT_THAT(bk2->key_lower(), Eq(101)); EXPECT_THAT(bk2->key_upper(), Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT(bk2->num_data(), Eq(num_data_for_bucket2)); } INSTANTIATE_TEST_SUITE_P(IntegerIndexStorageTest, IntegerIndexStorageTest, diff --git a/icing/index/numeric/integer-index.cc b/icing/index/numeric/integer-index.cc index 5fa82a5..b2fe159 100644 --- a/icing/index/numeric/integer-index.cc +++ b/icing/index/numeric/integer-index.cc @@ -91,7 +91,7 @@ libtextclassifier3::StatusOr<IntegerIndex::PropertyToStorageMapType> GetPropertyIntegerIndexStorageMap( const Filesystem& filesystem, const std::string& working_path, PostingListIntegerIndexSerializer* posting_list_serializer, - bool pre_mapping_fbv) { + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) { ICING_ASSIGN_OR_RETURN(std::vector<std::string> property_paths, GetAllExistingPropertyPaths(filesystem, working_path)); @@ -102,11 +102,13 @@ GetPropertyIntegerIndexStorageMap( } std::string storage_working_path = GetPropertyIndexStoragePath(working_path, property_path); - ICING_ASSIGN_OR_RETURN(std::unique_ptr<IntegerIndexStorage> storage, - IntegerIndexStorage::Create( - filesystem, storage_working_path, - IntegerIndexStorage::Options(pre_mapping_fbv), - posting_list_serializer)); + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create( + filesystem, storage_working_path, + IntegerIndexStorage::Options(num_data_threshold_for_bucket_split, + pre_mapping_fbv), + posting_list_serializer)); property_to_storage_map.insert( std::make_pair(property_path, std::move(storage))); } @@ -141,6 +143,8 @@ libtextclassifier3::StatusOr<std::unordered_set<std::string>> CreatePropertySet( } // namespace libtextclassifier3::Status IntegerIndex::Editor::IndexAllBufferedKeys() && { + integer_index_.SetDirty(); + auto iter = integer_index_.property_to_storage_map_.find(property_path_); IntegerIndexStorage* target_storage = nullptr; // 1. Check if this property already has its own individual index. @@ -161,7 +165,8 @@ libtextclassifier3::Status IntegerIndex::Editor::IndexAllBufferedKeys() && { integer_index_.filesystem_, GetPropertyIndexStoragePath(integer_index_.working_path_, kWildcardPropertyIndexFileName), - IntegerIndexStorage::Options(pre_mapping_fbv_), + IntegerIndexStorage::Options(num_data_threshold_for_bucket_split_, + pre_mapping_fbv_), integer_index_.posting_list_serializer_.get())); } ICING_RETURN_IF_ERROR( @@ -175,7 +180,8 @@ libtextclassifier3::Status IntegerIndex::Editor::IndexAllBufferedKeys() && { integer_index_.filesystem_, GetPropertyIndexStoragePath(integer_index_.working_path_, property_path_), - IntegerIndexStorage::Options(pre_mapping_fbv_), + IntegerIndexStorage::Options(num_data_threshold_for_bucket_split_, + pre_mapping_fbv_), integer_index_.posting_list_serializer_.get())); target_storage = new_storage.get(); integer_index_.property_to_storage_map_.insert( @@ -188,6 +194,7 @@ libtextclassifier3::Status IntegerIndex::Editor::IndexAllBufferedKeys() && { /* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> IntegerIndex::Create(const Filesystem& filesystem, std::string working_path, + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) { if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str())) { // Discard working_path if metadata file is missing, and reinitialize. @@ -195,9 +202,11 @@ IntegerIndex::Create(const Filesystem& filesystem, std::string working_path, ICING_RETURN_IF_ERROR(Discard(filesystem, working_path)); } return InitializeNewFiles(filesystem, std::move(working_path), + num_data_threshold_for_bucket_split, pre_mapping_fbv); } return InitializeExistingFiles(filesystem, std::move(working_path), + num_data_threshold_for_bucket_split, pre_mapping_fbv); } @@ -239,6 +248,8 @@ IntegerIndex::GetIterator(std::string_view property_path, int64_t key_lower, libtextclassifier3::Status IntegerIndex::AddPropertyToWildcardStorage( const std::string& property_path) { + SetDirty(); + WildcardPropertyStorage wildcard_properties; wildcard_properties.mutable_property_entries()->Reserve( wildcard_properties_set_.size()); @@ -272,7 +283,8 @@ libtextclassifier3::Status IntegerIndex::Optimize( // we can safely swap directories later. ICING_ASSIGN_OR_RETURN( std::unique_ptr<IntegerIndex> new_integer_index, - Create(filesystem_, temp_working_path_ddir.dir(), pre_mapping_fbv_)); + Create(filesystem_, temp_working_path_ddir.dir(), + num_data_threshold_for_bucket_split_, pre_mapping_fbv_)); ICING_RETURN_IF_ERROR( TransferIndex(document_id_old_to_new, new_integer_index.get())); new_integer_index->set_last_added_document_id(new_last_added_document_id); @@ -322,20 +334,24 @@ libtextclassifier3::Status IntegerIndex::Optimize( filesystem_, GetPropertyIndexStoragePath(working_path_, kWildcardPropertyIndexFileName), - IntegerIndexStorage::Options(pre_mapping_fbv_), + IntegerIndexStorage::Options(num_data_threshold_for_bucket_split_, + pre_mapping_fbv_), posting_list_serializer_.get())); } // Initialize all existing integer index storages. - ICING_ASSIGN_OR_RETURN(property_to_storage_map_, - GetPropertyIntegerIndexStorageMap( - filesystem_, working_path_, - posting_list_serializer_.get(), pre_mapping_fbv_)); + ICING_ASSIGN_OR_RETURN( + property_to_storage_map_, + GetPropertyIntegerIndexStorageMap( + filesystem_, working_path_, posting_list_serializer_.get(), + num_data_threshold_for_bucket_split_, pre_mapping_fbv_)); return libtextclassifier3::Status::OK; } libtextclassifier3::Status IntegerIndex::Clear() { + SetDirty(); + // Step 1: clear property_to_storage_map_. property_to_storage_map_.clear(); wildcard_index_storage_.reset(); @@ -367,6 +383,7 @@ libtextclassifier3::Status IntegerIndex::Clear() { /* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> IntegerIndex::InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path, + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) { // Create working directory. if (!filesystem.CreateDirectoryRecursively(working_path.c_str())) { @@ -399,12 +416,14 @@ IntegerIndex::InitializeNewFiles(const Filesystem& filesystem, std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)), /*property_to_storage_map=*/{}, std::move(wildcard_property_storage), /*wildcard_properties_set=*/{}, /*wildcard_index_storage=*/nullptr, - pre_mapping_fbv)); + num_data_threshold_for_bucket_split, pre_mapping_fbv)); // Initialize info content by writing mapped memory directly. Info& info_ref = new_integer_index->info(); info_ref.magic = Info::kMagic; info_ref.last_added_document_id = kInvalidDocumentId; + info_ref.num_data_threshold_for_bucket_split = + num_data_threshold_for_bucket_split; // Initialize new PersistentStorage. The initial checksums will be computed // and set via InitializeNewStorage. ICING_RETURN_IF_ERROR(new_integer_index->InitializeNewStorage()); @@ -413,9 +432,9 @@ IntegerIndex::InitializeNewFiles(const Filesystem& filesystem, } /* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> -IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem, - std::string&& working_path, - bool pre_mapping_fbv) { +IntegerIndex::InitializeExistingFiles( + const Filesystem& filesystem, std::string&& working_path, + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) { // Mmap the content of the crcs and info. ICING_ASSIGN_OR_RETURN( MemoryMappedFile metadata_mmapped_file, @@ -432,10 +451,11 @@ IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem, std::make_unique<PostingListIntegerIndexSerializer>(); // Initialize all existing integer index storages. - ICING_ASSIGN_OR_RETURN(PropertyToStorageMapType property_to_storage_map, - GetPropertyIntegerIndexStorageMap( - filesystem, working_path, - posting_list_serializer.get(), pre_mapping_fbv)); + ICING_ASSIGN_OR_RETURN( + PropertyToStorageMapType property_to_storage_map, + GetPropertyIntegerIndexStorageMap( + filesystem, working_path, posting_list_serializer.get(), + num_data_threshold_for_bucket_split, pre_mapping_fbv)); std::string wildcard_property_path = GetWildcardPropertyStorageFilePath(working_path); @@ -455,7 +475,8 @@ IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem, filesystem, GetPropertyIndexStoragePath(working_path, kWildcardPropertyIndexFileName), - IntegerIndexStorage::Options(pre_mapping_fbv), + IntegerIndexStorage::Options(num_data_threshold_for_bucket_split, + pre_mapping_fbv), posting_list_serializer.get())); } @@ -465,7 +486,7 @@ IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem, std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)), std::move(property_to_storage_map), std::move(wildcard_property_storage), std::move(wildcard_properties_set), std::move(wildcard_index_storage), - pre_mapping_fbv)); + num_data_threshold_for_bucket_split, pre_mapping_fbv)); // Initialize existing PersistentStorage. Checksums will be validated. ICING_RETURN_IF_ERROR(integer_index->InitializeExistingStorage()); @@ -474,6 +495,14 @@ IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem, return absl_ports::FailedPreconditionError("Incorrect magic value"); } + // If num_data_threshold_for_bucket_split mismatches, then return error to let + // caller rebuild. + if (integer_index->info().num_data_threshold_for_bucket_split != + num_data_threshold_for_bucket_split) { + return absl_ports::FailedPreconditionError( + "Mismatch num_data_threshold_for_bucket_split"); + } + return integer_index; } @@ -488,7 +517,8 @@ IntegerIndex::TransferIntegerIndexStorage( std::unique_ptr<IntegerIndexStorage> new_storage, IntegerIndexStorage::Create( new_integer_index->filesystem_, new_storage_working_path, - IntegerIndexStorage::Options(pre_mapping_fbv_), + IntegerIndexStorage::Options(num_data_threshold_for_bucket_split_, + pre_mapping_fbv_), new_integer_index->posting_list_serializer_.get())); ICING_RETURN_IF_ERROR( @@ -552,7 +582,11 @@ libtextclassifier3::Status IntegerIndex::TransferIndex( return libtextclassifier3::Status::OK; } -libtextclassifier3::Status IntegerIndex::PersistStoragesToDisk() { +libtextclassifier3::Status IntegerIndex::PersistStoragesToDisk(bool force) { + if (!force && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + for (auto& [_, storage] : property_to_storage_map_) { ICING_RETURN_IF_ERROR(storage->PersistToDisk()); } @@ -564,18 +598,32 @@ libtextclassifier3::Status IntegerIndex::PersistStoragesToDisk() { return libtextclassifier3::Status::OK; } -libtextclassifier3::Status IntegerIndex::PersistMetadataToDisk() { +libtextclassifier3::Status IntegerIndex::PersistMetadataToDisk(bool force) { + if (!force && !is_info_dirty() && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + // Changes should have been applied to the underlying file when using // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an // extra safety step to ensure they are written out. return metadata_mmapped_file_->PersistToDisk(); } -libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeInfoChecksum() { +libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeInfoChecksum( + bool force) { + if (!force && !is_info_dirty()) { + return Crc32(crcs().component_crcs.info_crc); + } + return info().ComputeChecksum(); } -libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeStoragesChecksum() { +libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeStoragesChecksum( + bool force) { + if (!force && !is_storage_dirty()) { + return Crc32(crcs().component_crcs.storages_crc); + } + // XOR all crcs of all storages. Since XOR is commutative and associative, // the order doesn't matter. uint32_t storages_checksum = 0; diff --git a/icing/index/numeric/integer-index.h b/icing/index/numeric/integer-index.h index 30f9852..e7a3127 100644 --- a/icing/index/numeric/integer-index.h +++ b/icing/index/numeric/integer-index.h @@ -55,25 +55,29 @@ class IntegerIndex : public NumericIndex<int64_t> { // 'wildcard' storage. static constexpr int kMaxPropertyStorages = 32; + static constexpr int32_t kDefaultNumDataThresholdForBucketSplit = + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit; + struct Info { - static constexpr int32_t kMagic = 0x238a3dcb; + static constexpr int32_t kMagic = 0x5d8a1e8a; int32_t magic; DocumentId last_added_document_id; + int32_t num_data_threshold_for_bucket_split; Crc32 ComputeChecksum() const { return Crc32( std::string_view(reinterpret_cast<const char*>(this), sizeof(Info))); } } __attribute__((packed)); - static_assert(sizeof(Info) == 8, ""); + static_assert(sizeof(Info) == 12, ""); // Metadata file layout: <Crcs><Info> static constexpr int32_t kCrcsMetadataFileOffset = 0; static constexpr int32_t kInfoMetadataFileOffset = static_cast<int32_t>(sizeof(Crcs)); static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info); - static_assert(kMetadataFileSize == 20, ""); + static_assert(kMetadataFileSize == 24, ""); static constexpr WorkingPathType kWorkingPathType = WorkingPathType::kDirectory; @@ -90,6 +94,8 @@ class IntegerIndex : public NumericIndex<int64_t> { // related files will be stored under this directory. See // PersistentStorage for more details about the concept of // working_path. + // num_data_threshold_for_bucket_split: see IntegerIndexStorage::Options for + // more details. // pre_mapping_fbv: flag indicating whether memory map max possible file size // for underlying FileBackedVector before growing the actual // file size. @@ -101,7 +107,7 @@ class IntegerIndex : public NumericIndex<int64_t> { // - Any FileBackedVector/MemoryMappedFile errors. static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> Create( const Filesystem& filesystem, std::string working_path, - bool pre_mapping_fbv); + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv); // Deletes IntegerIndex under working_path. // @@ -122,7 +128,8 @@ class IntegerIndex : public NumericIndex<int64_t> { std::string_view property_path, DocumentId document_id, SectionId section_id) override { return std::make_unique<Editor>(property_path, document_id, section_id, - *this, pre_mapping_fbv_); + *this, num_data_threshold_for_bucket_split_, + pre_mapping_fbv_); } // Returns a DocHitInfoIterator for iterating through all docs which have the @@ -172,6 +179,8 @@ class IntegerIndex : public NumericIndex<int64_t> { } void set_last_added_document_id(DocumentId document_id) override { + SetInfoDirty(); + Info& info_ref = info(); if (info_ref.last_added_document_id == kInvalidDocumentId || document_id > info_ref.last_added_document_id) { @@ -189,9 +198,12 @@ class IntegerIndex : public NumericIndex<int64_t> { public: explicit Editor(std::string_view property_path, DocumentId document_id, SectionId section_id, IntegerIndex& integer_index, + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) : NumericIndex<int64_t>::Editor(property_path, document_id, section_id), integer_index_(integer_index), + num_data_threshold_for_bucket_split_( + num_data_threshold_for_bucket_split), pre_mapping_fbv_(pre_mapping_fbv) {} ~Editor() override = default; @@ -211,6 +223,8 @@ class IntegerIndex : public NumericIndex<int64_t> { IntegerIndex& integer_index_; // Does not own. + int32_t num_data_threshold_for_bucket_split_; + // Flag indicating whether memory map max possible file size for underlying // FileBackedVector before growing the actual file size. bool pre_mapping_fbv_; @@ -226,7 +240,7 @@ class IntegerIndex : public NumericIndex<int64_t> { wildcard_property_storage, std::unordered_set<std::string> wildcard_properties_set, std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage, - bool pre_mapping_fbv) + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) : NumericIndex<int64_t>(filesystem, std::move(working_path), kWorkingPathType), posting_list_serializer_(std::move(posting_list_serializer)), @@ -235,15 +249,22 @@ class IntegerIndex : public NumericIndex<int64_t> { wildcard_property_storage_(std::move(wildcard_property_storage)), wildcard_properties_set_(std::move(wildcard_properties_set)), wildcard_index_storage_(std::move(wildcard_index_storage)), - pre_mapping_fbv_(pre_mapping_fbv) {} + num_data_threshold_for_bucket_split_( + num_data_threshold_for_bucket_split), + pre_mapping_fbv_(pre_mapping_fbv), + is_info_dirty_(false), + is_storage_dirty_(false) {} static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path, + int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv); static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> InitializeExistingFiles(const Filesystem& filesystem, - std::string&& working_path, bool pre_mapping_fbv); + std::string&& working_path, + int32_t num_data_threshold_for_bucket_split, + bool pre_mapping_fbv); // Adds the property path to the list of properties using wildcard storage. // This will both update the in-memory list (wildcard_properties_set_) and @@ -296,20 +317,20 @@ class IntegerIndex : public NumericIndex<int64_t> { // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistStoragesToDisk() override; + libtextclassifier3::Status PersistStoragesToDisk(bool force) override; // Flushes contents of metadata file. // // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistMetadataToDisk() override; + libtextclassifier3::Status PersistMetadataToDisk(bool force) override; // Computes and returns Info checksum. // // Returns: // - Crc of the Info on success - libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override; // Computes and returns all storages checksum. Checksums of (storage_crc, // property_path) for all existing property paths will be combined together by @@ -318,7 +339,8 @@ class IntegerIndex : public NumericIndex<int64_t> { // Returns: // - Crc of all storages on success // - INTERNAL_ERROR if any data inconsistency - libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) override; Crcs& crcs() override { return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() + @@ -340,6 +362,17 @@ class IntegerIndex : public NumericIndex<int64_t> { kInfoMetadataFileOffset); } + void SetInfoDirty() { is_info_dirty_ = true; } + // When storage is dirty, we have to set info dirty as well. So just expose + // SetDirty to set both. + void SetDirty() { + is_info_dirty_ = true; + is_storage_dirty_ = true; + } + + bool is_info_dirty() const { return is_info_dirty_; } + bool is_storage_dirty() const { return is_storage_dirty_; } + std::unique_ptr<PostingListIntegerIndexSerializer> posting_list_serializer_; std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_; @@ -360,9 +393,14 @@ class IntegerIndex : public NumericIndex<int64_t> { // kMaxPropertyStorages in property_to_storage_map. std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage_; + int32_t num_data_threshold_for_bucket_split_; + // Flag indicating whether memory map max possible file size for underlying // FileBackedVector before growing the actual file size. bool pre_mapping_fbv_; + + bool is_info_dirty_; + bool is_storage_dirty_; }; } // namespace lib diff --git a/icing/index/numeric/integer-index_test.cc b/icing/index/numeric/integer-index_test.cc index 8a7acb9..b2e3fbe 100644 --- a/icing/index/numeric/integer-index_test.cc +++ b/icing/index/numeric/integer-index_test.cc @@ -83,13 +83,14 @@ class NumericIndexIntegerTest : public ::testing::Test { filesystem_.CreateDirectoryRecursively(document_store_dir.c_str())); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult doc_store_create_result, - DocumentStore::Create(&filesystem_, document_store_dir, &clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, document_store_dir, &clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); doc_store_ = std::move(doc_store_create_result.document_store); } @@ -114,8 +115,10 @@ class NumericIndexIntegerTest : public ::testing::Test { template <> libtextclassifier3::StatusOr<std::unique_ptr<NumericIndex<int64_t>>> CreateIntegerIndex<IntegerIndex>() { - return IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/false); + return IntegerIndex::Create( + filesystem_, working_path_, /*num_data_threshold_for_bucket_split=*/ + IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit, + /*pre_mapping_fbv=*/false); } template <typename NotIntegerIndexType> @@ -138,8 +141,7 @@ class NumericIndexIntegerTest : public ::testing::Test { } ICING_ASSIGN_OR_RETURN( std::vector<DocumentId> docid_map, - doc_store_->OptimizeInto(document_store_compact_dir, nullptr, - /*namespace_id_fingerprint=*/false)); + doc_store_->OptimizeInto(document_store_compact_dir, nullptr)); doc_store_.reset(); if (!filesystem_.SwapFiles(document_store_dir.c_str(), @@ -153,13 +155,14 @@ class NumericIndexIntegerTest : public ::testing::Test { ICING_ASSIGN_OR_RETURN( DocumentStore::CreateResult doc_store_create_result, - DocumentStore::Create(&filesystem_, document_store_dir, &clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, document_store_dir, &clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); doc_store_ = std::move(doc_store_create_result.document_store); return docid_map; } @@ -1126,14 +1129,28 @@ TYPED_TEST(NumericIndexIntegerTest, Clear) { /*document_id=*/4, std::vector<SectionId>{kDefaultSectionId})))); } +struct IntegerIndexTestParam { + int32_t num_data_threshold_for_bucket_split; + bool pre_mapping_fbv; + + explicit IntegerIndexTestParam(int32_t num_data_threshold_for_bucket_split_in, + bool pre_mapping_fbv_in) + : num_data_threshold_for_bucket_split( + num_data_threshold_for_bucket_split_in), + pre_mapping_fbv(pre_mapping_fbv_in) {} +}; + // Tests for persistent integer index only -class IntegerIndexTest : public NumericIndexIntegerTest<IntegerIndex>, - public ::testing::WithParamInterface<bool> {}; +class IntegerIndexTest + : public NumericIndexIntegerTest<IntegerIndex>, + public ::testing::WithParamInterface<IntegerIndexTestParam> {}; TEST_P(IntegerIndexTest, InvalidWorkingPath) { - EXPECT_THAT(IntegerIndex::Create(filesystem_, "/dev/null/integer_index_test", - /*pre_mapping_fbv=*/GetParam()), - StatusIs(libtextclassifier3::StatusCode::INTERNAL)); + EXPECT_THAT( + IntegerIndex::Create(filesystem_, "/dev/null/integer_index_test", + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } TEST_P(IntegerIndexTest, InitializeNewFiles) { @@ -1142,7 +1159,8 @@ TEST_P(IntegerIndexTest, InitializeNewFiles) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); ICING_ASSERT_OK(integer_index->PersistToDisk()); } @@ -1160,6 +1178,8 @@ TEST_P(IntegerIndexTest, InitializeNewFiles) { IntegerIndex::kInfoMetadataFileOffset)); EXPECT_THAT(info.magic, Eq(Info::kMagic)); EXPECT_THAT(info.last_added_document_id, Eq(kInvalidDocumentId)); + EXPECT_THAT(info.num_data_threshold_for_bucket_split, + Eq(GetParam().num_data_threshold_for_bucket_split)); // Check crcs section Crcs crcs; @@ -1183,7 +1203,8 @@ TEST_P(IntegerIndexTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Insert some data. Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, @@ -1195,16 +1216,19 @@ TEST_P(IntegerIndexTest, // Without calling PersistToDisk, checksums will not be recomputed or synced // to disk, so initializing another instance on the same files should fail. - EXPECT_THAT(IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam()), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT( + IntegerIndex::Create(filesystem_, working_path_, + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } TEST_P(IntegerIndexTest, InitializationShouldSucceedWithPersistToDisk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index1, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Insert some data. Index(integer_index1.get(), kDefaultTestPropertyPath, /*document_id=*/0, @@ -1228,7 +1252,8 @@ TEST_P(IntegerIndexTest, InitializationShouldSucceedWithPersistToDisk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index2, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); EXPECT_THAT(integer_index2->last_added_document_id(), Eq(2)); EXPECT_THAT(Query(integer_index2.get(), kDefaultTestPropertyPath, /*key_lower=*/std::numeric_limits<int64_t>::min(), @@ -1243,7 +1268,8 @@ TEST_P(IntegerIndexTest, InitializationShouldSucceedAfterDestruction) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Insert some data. Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, @@ -1268,7 +1294,8 @@ TEST_P(IntegerIndexTest, InitializationShouldSucceedAfterDestruction) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); EXPECT_THAT(integer_index->last_added_document_id(), Eq(2)); EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, /*key_lower=*/std::numeric_limits<int64_t>::min(), @@ -1283,7 +1310,8 @@ TEST_P(IntegerIndexTest, InitializeExistingFilesWithWrongAllCrcShouldFail) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Insert some data. Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, /*section_id=*/20, /*keys=*/{0, 100, -100}); @@ -1315,8 +1343,10 @@ TEST_P(IntegerIndexTest, InitializeExistingFilesWithWrongAllCrcShouldFail) { // Attempt to create the integer index with metadata containing corrupted // all_crc. This should fail. libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> - integer_index_or = IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam()); + integer_index_or = + IntegerIndex::Create(filesystem_, working_path_, + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv); EXPECT_THAT(integer_index_or, StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(integer_index_or.status().error_message(), @@ -1329,7 +1359,8 @@ TEST_P(IntegerIndexTest, InitializeExistingFilesWithCorruptedInfoShouldFail) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Insert some data. Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, /*section_id=*/20, /*keys=*/{0, 100, -100}); @@ -1362,8 +1393,10 @@ TEST_P(IntegerIndexTest, InitializeExistingFilesWithCorruptedInfoShouldFail) { // Attempt to create the integer index with info that doesn't match its // checksum and confirm that it fails. libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> - integer_index_or = IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam()); + integer_index_or = + IntegerIndex::Create(filesystem_, working_path_, + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv); EXPECT_THAT(integer_index_or, StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(integer_index_or.status().error_message(), @@ -1377,7 +1410,8 @@ TEST_P(IntegerIndexTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Insert some data. Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, /*section_id=*/20, /*keys=*/{0, 100, -100}); @@ -1400,7 +1434,9 @@ TEST_P(IntegerIndexTest, std::unique_ptr<IntegerIndexStorage> storage, IntegerIndexStorage::Create( filesystem_, std::move(storage_working_path), - IntegerIndexStorage::Options(/*pre_mapping_fbv=*/GetParam()), + IntegerIndexStorage::Options( + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv), &posting_list_integer_index_serializer)); ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/3, /*section_id=*/4, /*new_keys=*/{3, 4, 5})); @@ -1412,8 +1448,10 @@ TEST_P(IntegerIndexTest, // Attempt to create the integer index with corrupted storages. This should // fail. libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> - integer_index_or = IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam()); + integer_index_or = + IntegerIndex::Create(filesystem_, working_path_, + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv); EXPECT_THAT(integer_index_or, StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(integer_index_or.status().error_message(), @@ -1421,6 +1459,41 @@ TEST_P(IntegerIndexTest, } } +TEST_P( + IntegerIndexTest, + InitializeExistingFilesWithMismatchNumDataThresholdForBucketSplitShouldFail) { + { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_, + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); + // Insert some data. + Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, + /*section_id=*/20, /*keys=*/{0, 100, -100}); + Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1, + /*section_id=*/2, /*keys=*/{3, -1000, 500}); + Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2, + /*section_id=*/15, /*keys=*/{-6, 321, 98}); + + ICING_ASSERT_OK(integer_index->PersistToDisk()); + } + + { + // Attempt to create the integer index with different + // num_data_threshold_for_bucket_split. This should fail. + libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> + integer_index_or = IntegerIndex::Create( + filesystem_, working_path_, + GetParam().num_data_threshold_for_bucket_split + 1, + GetParam().pre_mapping_fbv); + EXPECT_THAT(integer_index_or, + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(integer_index_or.status().error_message(), + HasSubstr("Mismatch num_data_threshold_for_bucket_split")); + } +} + TEST_P(IntegerIndexTest, WildcardStoragePersistenceQuery) { // This test sets its schema assuming that max property storages == 32. ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32)); @@ -1586,7 +1659,8 @@ TEST_P(IntegerIndexTest, WildcardStoragePersistenceQuery) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Index numeric content for other properties to force our property into the // wildcard storage. @@ -1651,7 +1725,8 @@ TEST_P(IntegerIndexTest, WildcardStoragePersistenceQuery) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); EXPECT_THAT(integer_index->num_property_indices(), Eq(33)); @@ -1691,7 +1766,8 @@ TEST_P(IntegerIndexTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Doc id = 1: insert 2 data for "prop1", "prop2" Index(integer_index.get(), kPropertyPath2, /*document_id=*/1, kSectionId2, @@ -1742,7 +1818,8 @@ TEST_P(IntegerIndexTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Key = 1 EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/1, @@ -1968,7 +2045,8 @@ TEST_P(IntegerIndexTest, WildcardStorageWorksAfterOptimize) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Index numeric content for other properties to force our property into the // wildcard storage. @@ -2067,7 +2145,8 @@ TEST_P(IntegerIndexTest, WildcardStorageWorksAfterOptimize) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); EXPECT_THAT(integer_index->num_property_indices(), Eq(33)); @@ -2236,7 +2315,8 @@ TEST_P(IntegerIndexTest, WildcardStorageAvailableIndicesAfterOptimize) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); // Index numeric content for other properties to force our property into the // wildcard storage. @@ -2317,7 +2397,8 @@ TEST_P(IntegerIndexTest, WildcardStorageAvailableIndicesAfterOptimize) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem_, working_path_, - /*pre_mapping_fbv=*/GetParam())); + GetParam().num_data_threshold_for_bucket_split, + GetParam().pre_mapping_fbv)); EXPECT_THAT(integer_index->num_property_indices(), Eq(1)); @@ -2363,8 +2444,20 @@ TEST_P(IntegerIndexTest, WildcardStorageAvailableIndicesAfterOptimize) { /*document_id=*/7, expected_sections_typea)))); } -INSTANTIATE_TEST_SUITE_P(IntegerIndexTest, IntegerIndexTest, - testing::Values(true, false)); +INSTANTIATE_TEST_SUITE_P( + IntegerIndexTest, IntegerIndexTest, + testing::Values( + IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/341, + /*pre_mapping_fbv_in=*/false), + IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/341, + /*pre_mapping_fbv_in=*/true), + + IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/16384, + /*pre_mapping_fbv_in=*/false), + IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/32768, + /*pre_mapping_fbv_in=*/false), + IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/65536, + /*pre_mapping_fbv_in=*/false))); } // namespace diff --git a/icing/index/numeric/numeric-index.h b/icing/index/numeric/numeric-index.h index 24b81e7..57911de 100644 --- a/icing/index/numeric/numeric-index.h +++ b/icing/index/numeric/numeric-index.h @@ -177,15 +177,17 @@ class NumericIndex : public PersistentStorage { : PersistentStorage(filesystem, std::move(working_path), working_path_type) {} - virtual libtextclassifier3::Status PersistStoragesToDisk() override = 0; + virtual libtextclassifier3::Status PersistStoragesToDisk( + bool force) override = 0; - virtual libtextclassifier3::Status PersistMetadataToDisk() override = 0; + virtual libtextclassifier3::Status PersistMetadataToDisk( + bool force) override = 0; - virtual libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() - override = 0; + virtual libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum( + bool force) override = 0; - virtual libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() - override = 0; + virtual libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) override = 0; virtual Crcs& crcs() override = 0; virtual const Crcs& crcs() const override = 0; diff --git a/icing/index/numeric/posting-list-integer-index-accessor.h b/icing/index/numeric/posting-list-integer-index-accessor.h index f0d3d25..4f667a0 100644 --- a/icing/index/numeric/posting-list-integer-index-accessor.h +++ b/icing/index/numeric/posting-list-integer-index-accessor.h @@ -100,16 +100,6 @@ class PostingListIntegerIndexAccessor : public PostingListAccessor { // posting list. libtextclassifier3::Status PrependData(const IntegerIndexData& data); - bool WantsSplit() const { - const PostingListUsed* current_pl = - preexisting_posting_list_ != nullptr - ? &preexisting_posting_list_->posting_list - : &in_memory_posting_list_; - // Only max-sized PLs get split. Smaller PLs just get copied to larger PLs. - return current_pl->size_in_bytes() == storage_->max_posting_list_bytes() && - serializer_->IsFull(current_pl); - } - private: explicit PostingListIntegerIndexAccessor( FlashIndexStorage* storage, PostingListUsed in_memory_posting_list, diff --git a/icing/join/join-processor_test.cc b/icing/join/join-processor_test.cc index 3dd7c87..f503442 100644 --- a/icing/join/join-processor_test.cc +++ b/icing/join/join-processor_test.cc @@ -118,13 +118,14 @@ class JoinProcessorTest : public ::testing::Test { IsTrue()); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, doc_store_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); doc_store_ = std::move(create_result.document_store); ICING_ASSERT_OK_AND_ASSIGN( diff --git a/icing/join/qualified-id-join-index.cc b/icing/join/qualified-id-join-index.cc index 86f1921..07b5627 100644 --- a/icing/join/qualified-id-join-index.cc +++ b/icing/join/qualified-id-join-index.cc @@ -103,6 +103,8 @@ QualifiedIdJoinIndex::~QualifiedIdJoinIndex() { libtextclassifier3::Status QualifiedIdJoinIndex::Put( const DocJoinInfo& doc_join_info, std::string_view ref_qualified_id_str) { + SetDirty(); + if (!doc_join_info.is_valid()) { return absl_ports::InvalidArgumentError( "Cannot put data for an invalid DocJoinInfo"); @@ -215,6 +217,8 @@ libtextclassifier3::Status QualifiedIdJoinIndex::Optimize( } libtextclassifier3::Status QualifiedIdJoinIndex::Clear() { + SetDirty(); + doc_join_info_mapper_.reset(); // Discard and reinitialize doc join info mapper. std::string doc_join_info_mapper_path = @@ -400,7 +404,12 @@ libtextclassifier3::Status QualifiedIdJoinIndex::TransferIndex( return libtextclassifier3::Status::OK; } -libtextclassifier3::Status QualifiedIdJoinIndex::PersistMetadataToDisk() { +libtextclassifier3::Status QualifiedIdJoinIndex::PersistMetadataToDisk( + bool force) { + if (!force && !is_info_dirty() && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + std::string metadata_file_path = GetMetadataFilePath(working_path_); ScopedFd sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); @@ -420,19 +429,32 @@ libtextclassifier3::Status QualifiedIdJoinIndex::PersistMetadataToDisk() { return libtextclassifier3::Status::OK; } -libtextclassifier3::Status QualifiedIdJoinIndex::PersistStoragesToDisk() { +libtextclassifier3::Status QualifiedIdJoinIndex::PersistStoragesToDisk( + bool force) { + if (!force && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + ICING_RETURN_IF_ERROR(doc_join_info_mapper_->PersistToDisk()); ICING_RETURN_IF_ERROR(qualified_id_storage_->PersistToDisk()); return libtextclassifier3::Status::OK; } -libtextclassifier3::StatusOr<Crc32> -QualifiedIdJoinIndex::ComputeInfoChecksum() { +libtextclassifier3::StatusOr<Crc32> QualifiedIdJoinIndex::ComputeInfoChecksum( + bool force) { + if (!force && !is_info_dirty()) { + return Crc32(crcs().component_crcs.info_crc); + } + return info().ComputeChecksum(); } libtextclassifier3::StatusOr<Crc32> -QualifiedIdJoinIndex::ComputeStoragesChecksum() { +QualifiedIdJoinIndex::ComputeStoragesChecksum(bool force) { + if (!force && !is_storage_dirty()) { + return Crc32(crcs().component_crcs.storages_crc); + } + ICING_ASSIGN_OR_RETURN(Crc32 doc_join_info_mapper_crc, doc_join_info_mapper_->ComputeChecksum()); ICING_ASSIGN_OR_RETURN(Crc32 qualified_id_storage_crc, diff --git a/icing/join/qualified-id-join-index.h b/icing/join/qualified-id-join-index.h index 6accfaf..86297cd 100644 --- a/icing/join/qualified-id-join-index.h +++ b/icing/join/qualified-id-join-index.h @@ -172,6 +172,8 @@ class QualifiedIdJoinIndex : public PersistentStorage { } void set_last_added_document_id(DocumentId document_id) { + SetInfoDirty(); + Info& info_ref = info(); if (info_ref.last_added_document_id == kInvalidDocumentId || document_id > info_ref.last_added_document_id) { @@ -192,7 +194,9 @@ class QualifiedIdJoinIndex : public PersistentStorage { doc_join_info_mapper_(std::move(doc_join_info_mapper)), qualified_id_storage_(std::move(qualified_id_storage)), pre_mapping_fbv_(pre_mapping_fbv), - use_persistent_hash_map_(use_persistent_hash_map) {} + use_persistent_hash_map_(use_persistent_hash_map), + is_info_dirty_(false), + is_storage_dirty_(false) {} static libtextclassifier3::StatusOr<std::unique_ptr<QualifiedIdJoinIndex>> InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path, @@ -219,27 +223,28 @@ class QualifiedIdJoinIndex : public PersistentStorage { // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistMetadataToDisk() override; + libtextclassifier3::Status PersistMetadataToDisk(bool force) override; // Flushes contents of all storages to underlying files. // // Returns: // - OK on success // - INTERNAL_ERROR on I/O error - libtextclassifier3::Status PersistStoragesToDisk() override; + libtextclassifier3::Status PersistStoragesToDisk(bool force) override; // Computes and returns Info checksum. // // Returns: // - Crc of the Info on success - libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override; // Computes and returns all storages checksum. // // Returns: // - Crc of all storages on success // - INTERNAL_ERROR if any data inconsistency - libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum() override; + libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum( + bool force) override; Crcs& crcs() override { return *reinterpret_cast<Crcs*>(metadata_buffer_.get() + @@ -261,6 +266,17 @@ class QualifiedIdJoinIndex : public PersistentStorage { kInfoMetadataBufferOffset); } + void SetInfoDirty() { is_info_dirty_ = true; } + // When storage is dirty, we have to set info dirty as well. So just expose + // SetDirty to set both. + void SetDirty() { + is_info_dirty_ = true; + is_storage_dirty_ = true; + } + + bool is_info_dirty() const { return is_info_dirty_; } + bool is_storage_dirty() const { return is_storage_dirty_; } + // Metadata buffer std::unique_ptr<uint8_t[]> metadata_buffer_; @@ -281,6 +297,9 @@ class QualifiedIdJoinIndex : public PersistentStorage { // Flag indicating whether use persistent hash map as the key mapper (if // false, then fall back to dynamic trie key mapper). bool use_persistent_hash_map_; + + bool is_info_dirty_; + bool is_storage_dirty_; }; } // namespace lib diff --git a/icing/query/advanced_query_parser/query-visitor_test.cc b/icing/query/advanced_query_parser/query-visitor_test.cc index 0d7ba6d..d118691 100644 --- a/icing/query/advanced_query_parser/query-visitor_test.cc +++ b/icing/query/advanced_query_parser/query-visitor_test.cc @@ -114,13 +114,14 @@ class QueryVisitorTest : public ::testing::TestWithParam<QueryType> { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, store_dir_, &clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, store_dir_, &clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); Index::Options options(index_dir_.c_str(), diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc index 89f3b54..626f352 100644 --- a/icing/query/query-processor_benchmark.cc +++ b/icing/query/query-processor_benchmark.cc @@ -98,7 +98,8 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore( return DocumentStore::Create( filesystem, base_dir, clock, schema_store, /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr); } diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc index 3d3cf48..5bbf9ba 100644 --- a/icing/query/query-processor_test.cc +++ b/icing/query/query-processor_test.cc @@ -69,7 +69,8 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore( return DocumentStore::Create( filesystem, base_dir, clock, schema_store, /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr); } diff --git a/icing/query/suggestion-processor_test.cc b/icing/query/suggestion-processor_test.cc index b1336b3..49f8d43 100644 --- a/icing/query/suggestion-processor_test.cc +++ b/icing/query/suggestion-processor_test.cc @@ -85,13 +85,14 @@ class SuggestionProcessorTest : public Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, store_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); Index::Options options(index_dir_, diff --git a/icing/result/result-retriever-v2_group-result-limiter_test.cc b/icing/result/result-retriever-v2_group-result-limiter_test.cc index 5d8b589..2914a8d 100644 --- a/icing/result/result-retriever-v2_group-result-limiter_test.cc +++ b/icing/result/result-retriever-v2_group-result-limiter_test.cc @@ -89,13 +89,14 @@ class ResultRetrieverV2GroupResultLimiterTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, test_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); } diff --git a/icing/result/result-retriever-v2_projection_test.cc b/icing/result/result-retriever-v2_projection_test.cc index 6b868a5..1a75631 100644 --- a/icing/result/result-retriever-v2_projection_test.cc +++ b/icing/result/result-retriever-v2_projection_test.cc @@ -184,13 +184,14 @@ class ResultRetrieverV2ProjectionTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, test_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); } diff --git a/icing/result/result-retriever-v2_snippet_test.cc b/icing/result/result-retriever-v2_snippet_test.cc index 27f16a0..440d31c 100644 --- a/icing/result/result-retriever-v2_snippet_test.cc +++ b/icing/result/result-retriever-v2_snippet_test.cc @@ -109,13 +109,14 @@ class ResultRetrieverV2SnippetTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, test_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); } diff --git a/icing/result/result-retriever-v2_test.cc b/icing/result/result-retriever-v2_test.cc index 889dc60..0bd40cc 100644 --- a/icing/result/result-retriever-v2_test.cc +++ b/icing/result/result-retriever-v2_test.cc @@ -220,7 +220,8 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore( return DocumentStore::Create( filesystem, base_dir, clock, schema_store, /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr); } diff --git a/icing/result/result-state-manager_test.cc b/icing/result/result-state-manager_test.cc index 38d67e8..75d1d93 100644 --- a/icing/result/result-state-manager_test.cc +++ b/icing/result/result-state-manager_test.cc @@ -107,13 +107,14 @@ class ResultStateManagerTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult result, - DocumentStore::Create(&filesystem_, test_dir_, clock_.get(), - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, test_dir_, clock_.get(), schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(result.document_store); ICING_ASSERT_OK_AND_ASSIGN( diff --git a/icing/result/result-state-manager_thread-safety_test.cc b/icing/result/result-state-manager_thread-safety_test.cc index 53745e6..7e7e13c 100644 --- a/icing/result/result-state-manager_thread-safety_test.cc +++ b/icing/result/result-state-manager_thread-safety_test.cc @@ -100,13 +100,14 @@ class ResultStateManagerThreadSafetyTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult result, - DocumentStore::Create(&filesystem_, test_dir_, clock_.get(), - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, test_dir_, clock_.get(), schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(result.document_store); ICING_ASSERT_OK_AND_ASSIGN( diff --git a/icing/result/result-state-v2_test.cc b/icing/result/result-state-v2_test.cc index ab29d6e..0f88023 100644 --- a/icing/result/result-state-v2_test.cc +++ b/icing/result/result-state-v2_test.cc @@ -76,13 +76,14 @@ class ResultStateV2Test : public ::testing::Test { filesystem_.CreateDirectoryRecursively(doc_store_base_dir_.c_str()); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult result, - DocumentStore::Create(&filesystem_, doc_store_base_dir_, &clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, doc_store_base_dir_, &clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(result.document_store); num_total_hits_ = 0; diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc index bcc7c2c..bcacdce 100644 --- a/icing/schema/schema-store.cc +++ b/icing/schema/schema-store.cc @@ -485,9 +485,13 @@ libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles( std::make_unique<SchemaProto>(std::move(base_schema)); ICING_RETURN_IF_ERROR(schema_file_->Write(std::move(base_schema_ptr))); + // LINT.IfChange(min_overlay_version_compatibility) + // Although the current version is 2, the schema is compatible with + // version 1, so min_overlay_version_compatibility should be 1. + int32_t min_overlay_version_compatibility = version_util::kVersionOne; + // LINT.ThenChange(//depot/google3/icing/file/version-util.h:kVersion) header_->SetOverlayInfo( - /*overlay_created=*/true, - /*min_overlay_version_compatibility=*/version_util::kVersionOne); + /*overlay_created=*/true, min_overlay_version_compatibility); // Rebuild in memory data - references to the old schema will be invalid // now. BuildInMemoryCache(); diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc index 3298b75..8fc51e7 100644 --- a/icing/schema/schema-store_test.cc +++ b/icing/schema/schema-store_test.cc @@ -2722,7 +2722,8 @@ TEST_F(SchemaStoreTest, MigrateSchemaVersionZeroUpgradeNoChange) { } } -TEST_F(SchemaStoreTest, MigrateSchemaRollbackDiscardsOverlaySchema) { +TEST_F(SchemaStoreTest, + MigrateSchemaRollbackDiscardsIncompatibleOverlaySchema) { // Because we are upgrading from version zero, the schema must be compatible // with version zero. SchemaTypeConfigProto type_a = @@ -2749,12 +2750,12 @@ TEST_F(SchemaStoreTest, MigrateSchemaRollbackDiscardsOverlaySchema) { IsOkAndHolds(Pointee(EqualsProto(schema)))); } - // Rollback to a version before kVersion. The schema header will declare that - // the overlay is compatible with any version starting with kVersion. So - // kVersion - 1 is incompatible and will throw out the schema. + // Rollback to a version before kVersionOne. The schema header will declare + // that the overlay is compatible with any version starting with kVersionOne. + // So kVersionOne - 1 is incompatible and will throw out the schema. ICING_EXPECT_OK(SchemaStore::MigrateSchema( &filesystem_, schema_store_dir_, version_util::StateChange::kRollBack, - version_util::kVersion - 1)); + version_util::kVersionOne - 1)); { // Create a new of the schema store and check that we fell back to the @@ -2777,7 +2778,7 @@ TEST_F(SchemaStoreTest, MigrateSchemaRollbackDiscardsOverlaySchema) { } } -TEST_F(SchemaStoreTest, MigrateSchemaCompatibleRollbackKeepsOverlaySchema) { +TEST_F(SchemaStoreTest, MigrateSchemaRollbackKeepsCompatibleOverlaySchema) { // Because we are upgrading from version zero, the schema must be compatible // with version zero. SchemaTypeConfigProto type_a = @@ -2846,12 +2847,12 @@ TEST_F(SchemaStoreTest, MigrateSchemaRollforwardRetainsBaseSchema) { IsOkAndHolds(Pointee(EqualsProto(schema)))); } - // Rollback to a version before kVersion. The schema header will declare that - // the overlay is compatible with any version starting with kVersion. So - // kVersion - 1 is incompatible and will throw out the schema. + // Rollback to a version before kVersionOne. The schema header will declare + // that the overlay is compatible with any version starting with kVersionOne. + // So kVersionOne - 1 is incompatible and will throw out the schema. ICING_EXPECT_OK(SchemaStore::MigrateSchema( &filesystem_, schema_store_dir_, version_util::StateChange::kRollBack, - version_util::kVersion - 1)); + version_util::kVersionOne - 1)); SchemaTypeConfigProto other_type_a = SchemaTypeConfigBuilder() diff --git a/icing/scoring/advanced_scoring/advanced-scorer_fuzz_test.cc b/icing/scoring/advanced_scoring/advanced-scorer_fuzz_test.cc index bdafa28..3612359 100644 --- a/icing/scoring/advanced_scoring/advanced-scorer_fuzz_test.cc +++ b/icing/scoring/advanced_scoring/advanced-scorer_fuzz_test.cc @@ -37,13 +37,13 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { SchemaStore::Create(&filesystem, schema_store_dir, &fake_clock) .ValueOrDie(); std::unique_ptr<DocumentStore> document_store = - DocumentStore::Create(&filesystem, doc_store_dir, &fake_clock, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr) + DocumentStore::Create( + &filesystem, doc_store_dir, &fake_clock, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr) .ValueOrDie() .document_store; diff --git a/icing/scoring/advanced_scoring/advanced-scorer_test.cc b/icing/scoring/advanced_scoring/advanced-scorer_test.cc index 0ecc21d..cc1d413 100644 --- a/icing/scoring/advanced_scoring/advanced-scorer_test.cc +++ b/icing/scoring/advanced_scoring/advanced-scorer_test.cc @@ -64,13 +64,14 @@ class AdvancedScorerTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, doc_store_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); // Creates a simple email schema diff --git a/icing/scoring/score-and-rank_benchmark.cc b/icing/scoring/score-and-rank_benchmark.cc index abb019f..7cb5a95 100644 --- a/icing/scoring/score-and-rank_benchmark.cc +++ b/icing/scoring/score-and-rank_benchmark.cc @@ -95,7 +95,8 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore( return DocumentStore::Create( filesystem, base_dir, clock, schema_store, /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr); } diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc index 4a97a87..5194c7f 100644 --- a/icing/scoring/scorer_test.cc +++ b/icing/scoring/scorer_test.cc @@ -64,13 +64,14 @@ class ScorerTest : public ::testing::TestWithParam<ScorerTestingMode> { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock1_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, doc_store_dir_, &fake_clock1_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); // Creates a simple email schema diff --git a/icing/scoring/scoring-processor_test.cc b/icing/scoring/scoring-processor_test.cc index 644e013..deddff8 100644 --- a/icing/scoring/scoring-processor_test.cc +++ b/icing/scoring/scoring-processor_test.cc @@ -62,13 +62,14 @@ class ScoringProcessorTest ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, doc_store_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); document_store_ = std::move(create_result.document_store); // Creates a simple email schema diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index e99bacf..30de410 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -54,6 +54,7 @@ #include "icing/store/document-log-creator.h" #include "icing/store/dynamic-trie-key-mapper.h" #include "icing/store/namespace-id.h" +#include "icing/store/persistent-hash-map-key-mapper.h" #include "icing/store/usage-store.h" #include "icing/tokenization/language-segmenter.h" #include "icing/util/clock.h" @@ -73,6 +74,7 @@ namespace { // Used in DocumentId mapper to mark a document as deleted constexpr int64_t kDocDeletedFlag = -1; constexpr char kDocumentIdMapperFilename[] = "document_id_mapper"; +constexpr char kUriHashMapperWorkingPath[] = "uri_mapper"; constexpr char kDocumentStoreHeaderFilename[] = "document_store_header"; constexpr char kScoreCacheFilename[] = "score_cache"; constexpr char kCorpusScoreCache[] = "corpus_score_cache"; @@ -81,9 +83,17 @@ constexpr char kNamespaceMapperFilename[] = "namespace_mapper"; constexpr char kUsageStoreDirectoryName[] = "usage_store"; constexpr char kCorpusIdMapperFilename[] = "corpus_mapper"; -// Determined through manual testing to allow for 1 million uris. 1 million -// because we allow up to 1 million DocumentIds. -constexpr int32_t kUriMapperMaxSize = 36 * 1024 * 1024; // 36 MiB +// Determined through manual testing to allow for 4 million uris. 4 million +// because we allow up to 4 million DocumentIds. +constexpr int32_t kUriDynamicTrieKeyMapperMaxSize = + 144 * 1024 * 1024; // 144 MiB + +constexpr int32_t kUriHashKeyMapperMaxNumEntries = + kMaxDocumentId + 1; // 1 << 22, 4M +// - Key: namespace_id_str (3 bytes) + fingerprinted_uri (10 bytes) + '\0' (1 +// byte) +// - Value: DocumentId (4 bytes) +constexpr int32_t kUriHashKeyMapperKVByteSize = 13 + 1 + sizeof(DocumentId); // 384 KiB for a DynamicTrieKeyMapper would allow each internal array to have a // max of 128 KiB for storage. @@ -100,6 +110,10 @@ std::string MakeHeaderFilename(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kDocumentStoreHeaderFilename); } +std::string MakeUriHashMapperWorkingPath(const std::string& base_dir) { + return absl_ports::StrCat(base_dir, "/", kUriHashMapperWorkingPath); +} + std::string MakeDocumentIdMapperFilename(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename); } @@ -207,6 +221,41 @@ std::unordered_map<NamespaceId, std::string> GetNamespaceIdsToNamespaces( return namespace_ids_to_namespaces; } +libtextclassifier3::StatusOr<std::unique_ptr< + KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>>> +CreateUriMapper(const Filesystem& filesystem, const std::string& base_dir, + bool pre_mapping_fbv, bool use_persistent_hash_map) { + std::string uri_hash_mapper_working_path = + MakeUriHashMapperWorkingPath(base_dir); + // Due to historic issue, we use document store's base_dir directly as + // DynamicTrieKeyMapper's working directory for uri mapper. + // DynamicTrieKeyMapper also creates a subdirectory "key_mapper_dir", so the + // actual files will be put under "<base_dir>/key_mapper_dir/". + bool dynamic_trie_key_mapper_dir_exists = filesystem.DirectoryExists( + absl_ports::StrCat(base_dir, "/key_mapper_dir").c_str()); + bool persistent_hash_map_dir_exists = + filesystem.DirectoryExists(uri_hash_mapper_working_path.c_str()); + if ((use_persistent_hash_map && dynamic_trie_key_mapper_dir_exists) || + (!use_persistent_hash_map && persistent_hash_map_dir_exists)) { + // Return a failure here so that the caller can properly delete and rebuild + // this component. + return absl_ports::FailedPreconditionError("Key mapper type mismatch"); + } + + if (use_persistent_hash_map) { + return PersistentHashMapKeyMapper< + DocumentId, fingerprint_util::FingerprintStringFormatter>:: + Create(filesystem, std::move(uri_hash_mapper_working_path), + pre_mapping_fbv, + /*max_num_entries=*/kUriHashKeyMapperMaxNumEntries, + /*average_kv_byte_size=*/kUriHashKeyMapperKVByteSize); + } else { + return DynamicTrieKeyMapper<DocumentId, + fingerprint_util::FingerprintStringFormatter>:: + Create(filesystem, base_dir, kUriDynamicTrieKeyMapperMaxSize); + } +} + } // namespace std::string DocumentStore::MakeFingerprint( @@ -231,6 +280,7 @@ DocumentStore::DocumentStore(const Filesystem* filesystem, const Clock* clock, const SchemaStore* schema_store, bool namespace_id_fingerprint, + bool pre_mapping_fbv, bool use_persistent_hash_map, int32_t compression_level) : filesystem_(filesystem), base_dir_(base_dir), @@ -238,6 +288,8 @@ DocumentStore::DocumentStore(const Filesystem* filesystem, schema_store_(schema_store), document_validator_(schema_store), namespace_id_fingerprint_(namespace_id_fingerprint), + pre_mapping_fbv_(pre_mapping_fbv), + use_persistent_hash_map_(use_persistent_hash_map), compression_level_(compression_level) {} libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( @@ -266,6 +318,7 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create( const Filesystem* filesystem, const std::string& base_dir, const Clock* clock, const SchemaStore* schema_store, bool force_recovery_and_revalidate_documents, bool namespace_id_fingerprint, + bool pre_mapping_fbv, bool use_persistent_hash_map, int32_t compression_level, InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(filesystem); ICING_RETURN_ERROR_IF_NULL(clock); @@ -273,7 +326,7 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create( auto document_store = std::unique_ptr<DocumentStore>(new DocumentStore( filesystem, base_dir, clock, schema_store, namespace_id_fingerprint, - compression_level)); + pre_mapping_fbv, use_persistent_hash_map, compression_level)); ICING_ASSIGN_OR_RETURN( DataLoss data_loss, document_store->Initialize(force_recovery_and_revalidate_documents, @@ -293,9 +346,12 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create( return absl_ports::InternalError("Couldn't delete header file"); } - // Document key mapper + // Document key mapper. Doesn't hurt to delete both dynamic trie and + // persistent hash map without checking. ICING_RETURN_IF_ERROR( DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem, base_dir)); + ICING_RETURN_IF_ERROR(PersistentHashMapKeyMapper<DocumentId>::Delete( + *filesystem, MakeUriHashMapperWorkingPath(base_dir))); // Document id mapper ICING_RETURN_IF_ERROR(FileBackedVector<int64_t>::Delete( @@ -429,11 +485,8 @@ libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() { // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. - auto document_key_mapper_or = DynamicTrieKeyMapper< - DocumentId, - fingerprint_util::FingerprintStringFormatter>::Create(*filesystem_, - base_dir_, - kUriMapperMaxSize); + auto document_key_mapper_or = CreateUriMapper( + *filesystem_, base_dir_, pre_mapping_fbv_, use_persistent_hash_map_); if (!document_key_mapper_or.ok()) { ICING_LOG(ERROR) << document_key_mapper_or.status().error_message() << "Failed to initialize KeyMapper"; @@ -646,6 +699,10 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles( } libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { + // Only one type of KeyMapper (either DynamicTrieKeyMapper or + // PersistentHashMapKeyMapper) will actually exist at any moment, but it is ok + // to call Delete() for both since Delete() returns OK if any of them doesn't + // exist. // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). document_key_mapper_.reset(); // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR @@ -654,17 +711,21 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem_, base_dir_); if (!status.ok()) { ICING_LOG(ERROR) << status.error_message() - << "Failed to delete old key mapper"; + << "Failed to delete old dynamic trie key mapper"; + return status; + } + status = PersistentHashMapKeyMapper<DocumentId>::Delete( + *filesystem_, MakeUriHashMapperWorkingPath(base_dir_)); + if (!status.ok()) { + ICING_LOG(ERROR) << status.error_message() + << "Failed to delete old persistent hash map key mapper"; return status; } // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. - auto document_key_mapper_or = DynamicTrieKeyMapper< - DocumentId, - fingerprint_util::FingerprintStringFormatter>::Create(*filesystem_, - base_dir_, - kUriMapperMaxSize); + auto document_key_mapper_or = CreateUriMapper( + *filesystem_, base_dir_, pre_mapping_fbv_, use_persistent_hash_map_); if (!document_key_mapper_or.ok()) { ICING_LOG(ERROR) << document_key_mapper_or.status().error_message() << "Failed to re-init key mapper"; @@ -1771,7 +1832,6 @@ libtextclassifier3::Status DocumentStore::Optimize() { libtextclassifier3::StatusOr<std::vector<DocumentId>> DocumentStore::OptimizeInto(const std::string& new_directory, const LanguageSegmenter* lang_segmenter, - bool namespace_id_fingerprint, OptimizeStatsProto* stats) { // Validates directory if (new_directory == base_dir_) { @@ -1783,7 +1843,8 @@ DocumentStore::OptimizeInto(const std::string& new_directory, auto doc_store_create_result, DocumentStore::Create(filesystem_, new_directory, &clock_, schema_store_, /*force_recovery_and_revalidate_documents=*/false, - namespace_id_fingerprint, compression_level_, + namespace_id_fingerprint_, pre_mapping_fbv_, + use_persistent_hash_map_, compression_level_, /*initialize_stats=*/nullptr)); std::unique_ptr<DocumentStore> new_doc_store = std::move(doc_store_create_result.document_store); diff --git a/icing/store/document-store.h b/icing/store/document-store.h index 3941f6d..92d4286 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -142,8 +142,8 @@ class DocumentStore { const Filesystem* filesystem, const std::string& base_dir, const Clock* clock, const SchemaStore* schema_store, bool force_recovery_and_revalidate_documents, - bool namespace_id_fingerprint, - int32_t compression_level, + bool namespace_id_fingerprint, bool pre_mapping_fbv, + bool use_persistent_hash_map, int32_t compression_level, InitializeStatsProto* initialize_stats); // Discards all derived data in the document store. @@ -456,7 +456,7 @@ class DocumentStore { // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<std::vector<DocumentId>> OptimizeInto( const std::string& new_directory, const LanguageSegmenter* lang_segmenter, - bool namespace_id_fingerprint, OptimizeStatsProto* stats = nullptr); + OptimizeStatsProto* stats = nullptr); // Calculates status for a potential Optimize call. Includes how many docs // there are vs how many would be optimized away. And also includes an @@ -488,9 +488,12 @@ class DocumentStore { private: // Use DocumentStore::Create() to instantiate. - DocumentStore(const Filesystem* filesystem, std::string_view base_dir, - const Clock* clock, const SchemaStore* schema_store, - bool namespace_id_fingerprint, int32_t compression_level); + explicit DocumentStore(const Filesystem* filesystem, + std::string_view base_dir, const Clock* clock, + const SchemaStore* schema_store, + bool namespace_id_fingerprint, bool pre_mapping_fbv, + bool use_persistent_hash_map, + int32_t compression_level); const Filesystem* const filesystem_; const std::string base_dir_; @@ -507,6 +510,15 @@ class DocumentStore { // document_key_mapper_ and corpus_mapper_. bool namespace_id_fingerprint_; + // Flag indicating whether memory map max possible file size for underlying + // FileBackedVector before growing the actual file size. + bool pre_mapping_fbv_; + + // Flag indicating whether use persistent hash map as the key mapper (if + // false, then fall back to dynamic trie key mapper). Note: we only use + // persistent hash map for uri mapper if it is true. + bool use_persistent_hash_map_; + const int32_t compression_level_; // A log used to store all documents, it serves as a ground truth of doc diff --git a/icing/store/document-store_benchmark.cc b/icing/store/document-store_benchmark.cc index 75995e9..5b9c568 100644 --- a/icing/store/document-store_benchmark.cc +++ b/icing/store/document-store_benchmark.cc @@ -132,7 +132,8 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore( return DocumentStore::Create( filesystem, base_dir, clock, schema_store, /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr); } diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index 9a1f4a6..a9c47f0 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -71,6 +71,7 @@ using ::testing::Ge; using ::testing::Gt; using ::testing::HasSubstr; using ::testing::IsEmpty; +using ::testing::IsFalse; using ::testing::IsTrue; using ::testing::Not; using ::testing::Return; @@ -120,7 +121,21 @@ void WriteDocumentLogHeader( sizeof(PortableFileBackedProtoLog<DocumentWrapper>::Header)); } -class DocumentStoreTest : public ::testing::Test { +struct DocumentStoreTestParam { + bool namespace_id_fingerprint; + bool pre_mapping_fbv; + bool use_persistent_hash_map; + + explicit DocumentStoreTestParam(bool namespace_id_fingerprint_in, + bool pre_mapping_fbv_in, + bool use_persistent_hash_map_in) + : namespace_id_fingerprint(namespace_id_fingerprint_in), + pre_mapping_fbv(pre_mapping_fbv_in), + use_persistent_hash_map(use_persistent_hash_map_in) {} +}; + +class DocumentStoreTest + : public ::testing::TestWithParam<DocumentStoreTestParam> { protected: DocumentStoreTest() : test_dir_(GetTestTempDir() + "/icing"), @@ -213,7 +228,7 @@ class DocumentStoreTest : public ::testing::Test { absl_ports::StrCat(document_store_dir_, "/document_store_header"); DocumentStore::Header header; header.magic = DocumentStore::Header::GetCurrentMagic( - /*namespace_id_fingerprint=*/false); + GetParam().namespace_id_fingerprint); header.checksum = 10; // Arbitrary garbage checksum filesystem_.DeleteFile(header_file.c_str()); filesystem_.Write(header_file.c_str(), &header, sizeof(header)); @@ -225,7 +240,8 @@ class DocumentStoreTest : public ::testing::Test { return DocumentStore::Create( filesystem, base_dir, clock, schema_store, /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, /*initialize_stats=*/nullptr); } @@ -254,7 +270,7 @@ class DocumentStoreTest : public ::testing::Test { const int64_t document2_expiration_timestamp_ = 3; // creation + ttl }; -TEST_F(DocumentStoreTest, CreationWithNullPointerShouldFail) { +TEST_P(DocumentStoreTest, CreationWithNullPointerShouldFail) { EXPECT_THAT(CreateDocumentStore(/*filesystem=*/nullptr, document_store_dir_, &fake_clock_, schema_store_.get()), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); @@ -268,7 +284,7 @@ TEST_F(DocumentStoreTest, CreationWithNullPointerShouldFail) { StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } -TEST_F(DocumentStoreTest, CreationWithBadFilesystemShouldFail) { +TEST_P(DocumentStoreTest, CreationWithBadFilesystemShouldFail) { MockFilesystem mock_filesystem; ON_CALL(mock_filesystem, OpenForWrite(_)).WillByDefault(Return(false)); @@ -277,7 +293,7 @@ TEST_F(DocumentStoreTest, CreationWithBadFilesystemShouldFail) { StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } -TEST_F(DocumentStoreTest, PutAndGetInSameNamespaceOk) { +TEST_P(DocumentStoreTest, PutAndGetInSameNamespaceOk) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -297,7 +313,7 @@ TEST_F(DocumentStoreTest, PutAndGetInSameNamespaceOk) { IsOkAndHolds(EqualsProto(test_document2_))); } -TEST_F(DocumentStoreTest, PutAndGetAcrossNamespacesOk) { +TEST_P(DocumentStoreTest, PutAndGetAcrossNamespacesOk) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -330,7 +346,7 @@ TEST_F(DocumentStoreTest, PutAndGetAcrossNamespacesOk) { // Validates that putting an document with the same key will overwrite previous // document and old doc ids are not getting reused. -TEST_F(DocumentStoreTest, PutSameKey) { +TEST_P(DocumentStoreTest, PutSameKey) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -359,7 +375,7 @@ TEST_F(DocumentStoreTest, PutSameKey) { EXPECT_THAT(doc_store->Put(document3), IsOkAndHolds(Not(document_id1))); } -TEST_F(DocumentStoreTest, IsDocumentExistingWithoutStatus) { +TEST_P(DocumentStoreTest, IsDocumentExistingWithoutStatus) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -395,7 +411,7 @@ TEST_F(DocumentStoreTest, IsDocumentExistingWithoutStatus) { fake_clock_.GetSystemTimeMilliseconds())); } -TEST_F(DocumentStoreTest, GetDeletedDocumentNotFound) { +TEST_P(DocumentStoreTest, GetDeletedDocumentNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -416,7 +432,7 @@ TEST_F(DocumentStoreTest, GetDeletedDocumentNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, GetExpiredDocumentNotFound) { +TEST_P(DocumentStoreTest, GetExpiredDocumentNotFound) { DocumentProto document = DocumentBuilder() .SetKey("namespace", "uri") .SetSchema("email") @@ -451,7 +467,7 @@ TEST_F(DocumentStoreTest, GetExpiredDocumentNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, GetInvalidDocumentId) { +TEST_P(DocumentStoreTest, GetInvalidDocumentId) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -478,7 +494,7 @@ TEST_F(DocumentStoreTest, GetInvalidDocumentId) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteNonexistentDocumentNotFound) { +TEST_P(DocumentStoreTest, DeleteNonexistentDocumentNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -504,7 +520,7 @@ TEST_F(DocumentStoreTest, DeleteNonexistentDocumentNotFound) { EXPECT_THAT(document_log_size_before, Eq(document_log_size_after)); } -TEST_F(DocumentStoreTest, DeleteNonexistentDocumentPrintableErrorMessage) { +TEST_P(DocumentStoreTest, DeleteNonexistentDocumentPrintableErrorMessage) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -533,7 +549,7 @@ TEST_F(DocumentStoreTest, DeleteNonexistentDocumentPrintableErrorMessage) { EXPECT_THAT(document_log_size_before, Eq(document_log_size_after)); } -TEST_F(DocumentStoreTest, DeleteAlreadyDeletedDocumentNotFound) { +TEST_P(DocumentStoreTest, DeleteAlreadyDeletedDocumentNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -555,7 +571,7 @@ TEST_F(DocumentStoreTest, DeleteAlreadyDeletedDocumentNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteByNamespaceOk) { +TEST_P(DocumentStoreTest, DeleteByNamespaceOk) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -599,7 +615,7 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceOk) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) { +TEST_P(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -624,7 +640,7 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) { EXPECT_THAT(document_log_size_before, Eq(document_log_size_after)); } -TEST_F(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) { +TEST_P(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -645,7 +661,7 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) { +TEST_P(DocumentStoreTest, DeleteByNamespaceRecoversOk) { DocumentProto document1 = test_document1_; document1.set_namespace_("namespace.1"); document1.set_uri("uri1"); @@ -715,7 +731,7 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) { +TEST_P(DocumentStoreTest, DeleteBySchemaTypeOk) { SchemaProto schema = SchemaBuilder() .AddType(SchemaTypeConfigBuilder().SetType("email")) @@ -802,7 +818,7 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) { IsOkAndHolds(EqualsProto(person_document))); } -TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) { +TEST_P(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -828,7 +844,7 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) { EXPECT_THAT(document_log_size_before, Eq(document_log_size_after)); } -TEST_F(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsNotFound) { +TEST_P(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -846,7 +862,7 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) { +TEST_P(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) { SchemaProto schema = SchemaBuilder() .AddType(SchemaTypeConfigBuilder().SetType("email")) @@ -926,7 +942,7 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) { IsOkAndHolds(EqualsProto(message_document))); } -TEST_F(DocumentStoreTest, PutDeleteThenPut) { +TEST_P(DocumentStoreTest, PutDeleteThenPut) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -940,7 +956,7 @@ TEST_F(DocumentStoreTest, PutDeleteThenPut) { ICING_EXPECT_OK(doc_store->Put(test_document1_)); } -TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) { +TEST_P(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) { SchemaProto schema = SchemaBuilder() .AddType(SchemaTypeConfigBuilder().SetType("email")) @@ -1034,7 +1050,7 @@ TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) { IsOkAndHolds(EqualsProto(message_document))); } -TEST_F(DocumentStoreTest, OptimizeInto) { +TEST_P(DocumentStoreTest, OptimizeInto) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1078,8 +1094,7 @@ TEST_F(DocumentStoreTest, OptimizeInto) { // Optimizing into the same directory is not allowed EXPECT_THAT( - doc_store->OptimizeInto(document_store_dir_, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false), + doc_store->OptimizeInto(document_store_dir_, lang_segmenter_.get()), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, HasSubstr("directory is the same"))); @@ -1091,8 +1106,7 @@ TEST_F(DocumentStoreTest, OptimizeInto) { // deleted ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); - EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false), + EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()), IsOkAndHolds(ElementsAre(0, 1, 2))); int64_t optimized_size1 = filesystem_.GetFileSize(optimized_document_log.c_str()); @@ -1105,8 +1119,7 @@ TEST_F(DocumentStoreTest, OptimizeInto) { ICING_ASSERT_OK(doc_store->Delete("namespace", "uri1", fake_clock_.GetSystemTimeMilliseconds())); // DocumentId 0 is removed. - EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false), + EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()), IsOkAndHolds(ElementsAre(kInvalidDocumentId, 0, 1))); int64_t optimized_size2 = filesystem_.GetFileSize(optimized_document_log.c_str()); @@ -1122,8 +1135,7 @@ TEST_F(DocumentStoreTest, OptimizeInto) { ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); // DocumentId 0 is removed, and DocumentId 2 is expired. EXPECT_THAT( - doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false), + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()), IsOkAndHolds(ElementsAre(kInvalidDocumentId, 0, kInvalidDocumentId))); int64_t optimized_size3 = filesystem_.GetFileSize(optimized_document_log.c_str()); @@ -1135,8 +1147,7 @@ TEST_F(DocumentStoreTest, OptimizeInto) { ICING_ASSERT_OK(doc_store->Delete("namespace", "uri2", fake_clock_.GetSystemTimeMilliseconds())); // DocumentId 0 and 1 is removed, and DocumentId 2 is expired. - EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false), + EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()), IsOkAndHolds(ElementsAre(kInvalidDocumentId, kInvalidDocumentId, kInvalidDocumentId))); int64_t optimized_size4 = @@ -1144,7 +1155,7 @@ TEST_F(DocumentStoreTest, OptimizeInto) { EXPECT_THAT(optimized_size3, Gt(optimized_size4)); } -TEST_F(DocumentStoreTest, OptimizeIntoForEmptyDocumentStore) { +TEST_P(DocumentStoreTest, OptimizeIntoForEmptyDocumentStore) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1154,12 +1165,11 @@ TEST_F(DocumentStoreTest, OptimizeIntoForEmptyDocumentStore) { std::string optimized_dir = document_store_dir_ + "_optimize"; ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); - EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false), + EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()), IsOkAndHolds(IsEmpty())); } -TEST_F(DocumentStoreTest, ShouldRecoverFromDataLoss) { +TEST_P(DocumentStoreTest, ShouldRecoverFromDataLoss) { DocumentId document_id1, document_id2; { // Can put and delete fine. @@ -1250,7 +1260,7 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromDataLoss) { /*num_docs=*/1, /*sum_length_in_tokens=*/4))); } -TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) { +TEST_P(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) { DocumentId document_id1, document_id2; { // Can put and delete fine. @@ -1361,7 +1371,7 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, ShouldRecoverFromDiscardDerivedFiles) { +TEST_P(DocumentStoreTest, ShouldRecoverFromDiscardDerivedFiles) { DocumentId document_id1, document_id2; { // Can put and delete fine. @@ -1459,7 +1469,7 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromDiscardDerivedFiles) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, ShouldRecoverFromBadChecksum) { +TEST_P(DocumentStoreTest, ShouldRecoverFromBadChecksum) { DocumentId document_id1, document_id2; { // Can put and delete fine. @@ -1537,7 +1547,7 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromBadChecksum) { /*num_docs=*/1, /*sum_length_in_tokens=*/4))); } -TEST_F(DocumentStoreTest, GetStorageInfo) { +TEST_P(DocumentStoreTest, GetStorageInfo) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1580,7 +1590,7 @@ TEST_F(DocumentStoreTest, GetStorageInfo) { EXPECT_THAT(doc_store_storage_info.document_store_size(), Eq(-1)); } -TEST_F(DocumentStoreTest, MaxDocumentId) { +TEST_P(DocumentStoreTest, MaxDocumentId) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1605,7 +1615,7 @@ TEST_F(DocumentStoreTest, MaxDocumentId) { EXPECT_THAT(doc_store->last_added_document_id(), Eq(document_id2)); } -TEST_F(DocumentStoreTest, GetNamespaceId) { +TEST_P(DocumentStoreTest, GetNamespaceId) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1638,7 +1648,7 @@ TEST_F(DocumentStoreTest, GetNamespaceId) { EXPECT_THAT(doc_store->GetNamespaceId("namespace1"), IsOkAndHolds(Eq(0))); } -TEST_F(DocumentStoreTest, GetDuplicateNamespaceId) { +TEST_P(DocumentStoreTest, GetDuplicateNamespaceId) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1658,7 +1668,7 @@ TEST_F(DocumentStoreTest, GetDuplicateNamespaceId) { EXPECT_THAT(doc_store->GetNamespaceId("namespace"), IsOkAndHolds(Eq(0))); } -TEST_F(DocumentStoreTest, NonexistentNamespaceNotFound) { +TEST_P(DocumentStoreTest, NonexistentNamespaceNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1670,7 +1680,7 @@ TEST_F(DocumentStoreTest, NonexistentNamespaceNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, GetCorpusDuplicateCorpusId) { +TEST_P(DocumentStoreTest, GetCorpusDuplicateCorpusId) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1691,7 +1701,7 @@ TEST_F(DocumentStoreTest, GetCorpusDuplicateCorpusId) { IsOkAndHolds(Eq(0))); } -TEST_F(DocumentStoreTest, GetCorpusId) { +TEST_P(DocumentStoreTest, GetCorpusId) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1726,7 +1736,7 @@ TEST_F(DocumentStoreTest, GetCorpusId) { EXPECT_THAT(doc_store->GetNamespaceId("namespace1"), IsOkAndHolds(Eq(0))); } -TEST_F(DocumentStoreTest, NonexistentCorpusNotFound) { +TEST_P(DocumentStoreTest, NonexistentCorpusNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1750,7 +1760,7 @@ TEST_F(DocumentStoreTest, NonexistentCorpusNotFound) { StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); } -TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreDataSameCorpus) { +TEST_P(DocumentStoreTest, GetCorpusAssociatedScoreDataSameCorpus) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1775,7 +1785,7 @@ TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreDataSameCorpus) { StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); } -TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreData) { +TEST_P(DocumentStoreTest, GetCorpusAssociatedScoreData) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1813,7 +1823,7 @@ TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreData) { /*num_docs=*/1, /*sum_length_in_tokens=*/5))); } -TEST_F(DocumentStoreTest, NonexistentCorpusAssociatedScoreDataOutOfRange) { +TEST_P(DocumentStoreTest, NonexistentCorpusAssociatedScoreDataOutOfRange) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1825,7 +1835,7 @@ TEST_F(DocumentStoreTest, NonexistentCorpusAssociatedScoreDataOutOfRange) { StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); } -TEST_F(DocumentStoreTest, GetDocumentAssociatedScoreDataSameCorpus) { +TEST_P(DocumentStoreTest, GetDocumentAssociatedScoreDataSameCorpus) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1869,7 +1879,7 @@ TEST_F(DocumentStoreTest, GetDocumentAssociatedScoreDataSameCorpus) { /*length_in_tokens=*/7))); } -TEST_F(DocumentStoreTest, GetDocumentAssociatedScoreDataDifferentCorpus) { +TEST_P(DocumentStoreTest, GetDocumentAssociatedScoreDataDifferentCorpus) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1913,7 +1923,7 @@ TEST_F(DocumentStoreTest, GetDocumentAssociatedScoreDataDifferentCorpus) { /*length_in_tokens=*/7))); } -TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataNotFound) { +TEST_P(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1925,7 +1935,7 @@ TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, NonexistentDocumentFilterDataNotFound) { +TEST_P(DocumentStoreTest, NonexistentDocumentFilterDataNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1937,7 +1947,7 @@ TEST_F(DocumentStoreTest, NonexistentDocumentFilterDataNotFound) { /*document_id=*/0, fake_clock_.GetSystemTimeMilliseconds())); } -TEST_F(DocumentStoreTest, DeleteClearsFilterCache) { +TEST_P(DocumentStoreTest, DeleteClearsFilterCache) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1964,7 +1974,7 @@ TEST_F(DocumentStoreTest, DeleteClearsFilterCache) { document_id, fake_clock_.GetSystemTimeMilliseconds())); } -TEST_F(DocumentStoreTest, DeleteClearsScoreCache) { +TEST_P(DocumentStoreTest, DeleteClearsScoreCache) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -1993,7 +2003,7 @@ TEST_F(DocumentStoreTest, DeleteClearsScoreCache) { /*length_in_tokens=*/0))); } -TEST_F(DocumentStoreTest, DeleteShouldPreventUsageScores) { +TEST_P(DocumentStoreTest, DeleteShouldPreventUsageScores) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -2032,7 +2042,7 @@ TEST_F(DocumentStoreTest, DeleteShouldPreventUsageScores) { document_id, fake_clock_.GetSystemTimeMilliseconds())); } -TEST_F(DocumentStoreTest, ExpirationShouldPreventUsageScores) { +TEST_P(DocumentStoreTest, ExpirationShouldPreventUsageScores) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -2082,7 +2092,7 @@ TEST_F(DocumentStoreTest, ExpirationShouldPreventUsageScores) { document_id, fake_clock_.GetSystemTimeMilliseconds())); } -TEST_F(DocumentStoreTest, +TEST_P(DocumentStoreTest, ExpirationTimestampIsSumOfNonZeroTtlAndCreationTimestamp) { DocumentProto document = DocumentBuilder() .SetKey("namespace1", "1") @@ -2109,7 +2119,7 @@ TEST_F(DocumentStoreTest, /*expiration_timestamp_ms=*/1100))); } -TEST_F(DocumentStoreTest, ExpirationTimestampIsInt64MaxIfTtlIsZero) { +TEST_P(DocumentStoreTest, ExpirationTimestampIsInt64MaxIfTtlIsZero) { DocumentProto document = DocumentBuilder() .SetKey("namespace1", "1") .SetSchema("email") @@ -2139,7 +2149,7 @@ TEST_F(DocumentStoreTest, ExpirationTimestampIsInt64MaxIfTtlIsZero) { /*expiration_timestamp_ms=*/std::numeric_limits<int64_t>::max()))); } -TEST_F(DocumentStoreTest, ExpirationTimestampIsInt64MaxOnOverflow) { +TEST_P(DocumentStoreTest, ExpirationTimestampIsInt64MaxOnOverflow) { DocumentProto document = DocumentBuilder() .SetKey("namespace1", "1") @@ -2170,7 +2180,7 @@ TEST_F(DocumentStoreTest, ExpirationTimestampIsInt64MaxOnOverflow) { /*expiration_timestamp_ms=*/std::numeric_limits<int64_t>::max()))); } -TEST_F(DocumentStoreTest, CreationTimestampShouldBePopulated) { +TEST_P(DocumentStoreTest, CreationTimestampShouldBePopulated) { // Creates a document without a given creation timestamp DocumentProto document_without_creation_timestamp = DocumentBuilder() @@ -2201,7 +2211,7 @@ TEST_F(DocumentStoreTest, CreationTimestampShouldBePopulated) { Eq(fake_real_time)); } -TEST_F(DocumentStoreTest, ShouldWriteAndReadScoresCorrectly) { +TEST_P(DocumentStoreTest, ShouldWriteAndReadScoresCorrectly) { DocumentProto document1 = DocumentBuilder() .SetKey("icing", "email/1") .SetSchema("email") @@ -2240,7 +2250,7 @@ TEST_F(DocumentStoreTest, ShouldWriteAndReadScoresCorrectly) { /*length_in_tokens=*/0))); } -TEST_F(DocumentStoreTest, ComputeChecksumSameBetweenCalls) { +TEST_P(DocumentStoreTest, ComputeChecksumSameBetweenCalls) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -2255,7 +2265,7 @@ TEST_F(DocumentStoreTest, ComputeChecksumSameBetweenCalls) { EXPECT_THAT(document_store->ComputeChecksum(), IsOkAndHolds(checksum)); } -TEST_F(DocumentStoreTest, ComputeChecksumSameAcrossInstances) { +TEST_P(DocumentStoreTest, ComputeChecksumSameAcrossInstances) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -2276,7 +2286,7 @@ TEST_F(DocumentStoreTest, ComputeChecksumSameAcrossInstances) { EXPECT_THAT(document_store->ComputeChecksum(), IsOkAndHolds(checksum)); } -TEST_F(DocumentStoreTest, ComputeChecksumChangesOnNewDocument) { +TEST_P(DocumentStoreTest, ComputeChecksumChangesOnNewDocument) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -2292,7 +2302,7 @@ TEST_F(DocumentStoreTest, ComputeChecksumChangesOnNewDocument) { IsOkAndHolds(Not(Eq(checksum)))); } -TEST_F(DocumentStoreTest, ComputeChecksumDoesntChangeOnNewUsage) { +TEST_P(DocumentStoreTest, ComputeChecksumDoesntChangeOnNewUsage) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -2310,7 +2320,7 @@ TEST_F(DocumentStoreTest, ComputeChecksumDoesntChangeOnNewUsage) { EXPECT_THAT(document_store->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); } -TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) { +TEST_P(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) { const std::string schema_store_dir = schema_store_dir_ + "_custom"; DocumentId email_document_id; @@ -2445,7 +2455,7 @@ TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) { Eq(message_expiration_timestamp)); } -TEST_F(DocumentStoreTest, UpdateSchemaStoreUpdatesSchemaTypeIds) { +TEST_P(DocumentStoreTest, UpdateSchemaStoreUpdatesSchemaTypeIds) { const std::string schema_store_dir = test_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); @@ -2541,7 +2551,7 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreUpdatesSchemaTypeIds) { EXPECT_THAT(message_data.schema_type_id(), Eq(new_message_schema_type_id)); } -TEST_F(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) { +TEST_P(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) { const std::string schema_store_dir = test_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); @@ -2617,7 +2627,7 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) { IsOkAndHolds(EqualsProto(email_with_subject))); } -TEST_F(DocumentStoreTest, +TEST_P(DocumentStoreTest, UpdateSchemaStoreDeletesDocumentsByDeletedSchemaType) { const std::string schema_store_dir = test_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); @@ -2691,7 +2701,7 @@ TEST_F(DocumentStoreTest, IsOkAndHolds(EqualsProto(message_document))); } -TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreUpdatesSchemaTypeIds) { +TEST_P(DocumentStoreTest, OptimizedUpdateSchemaStoreUpdatesSchemaTypeIds) { const std::string schema_store_dir = test_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); @@ -2790,7 +2800,7 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreUpdatesSchemaTypeIds) { EXPECT_THAT(message_data.schema_type_id(), Eq(new_message_schema_type_id)); } -TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) { +TEST_P(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) { const std::string schema_store_dir = test_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); @@ -2869,7 +2879,7 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) { IsOkAndHolds(EqualsProto(email_with_subject))); } -TEST_F(DocumentStoreTest, +TEST_P(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesDocumentsByDeletedSchemaType) { const std::string schema_store_dir = test_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); @@ -2945,7 +2955,7 @@ TEST_F(DocumentStoreTest, IsOkAndHolds(EqualsProto(message_document))); } -TEST_F(DocumentStoreTest, GetOptimizeInfo) { +TEST_P(DocumentStoreTest, GetOptimizeInfo) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -2983,8 +2993,7 @@ TEST_F(DocumentStoreTest, GetOptimizeInfo) { EXPECT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); EXPECT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); ICING_ASSERT_OK( - document_store->OptimizeInto(optimized_dir, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false)); + document_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); document_store.reset(); ICING_ASSERT_OK_AND_ASSIGN( create_result, CreateDocumentStore(&filesystem_, optimized_dir, @@ -2999,7 +3008,7 @@ TEST_F(DocumentStoreTest, GetOptimizeInfo) { EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0)); } -TEST_F(DocumentStoreTest, GetAllNamespaces) { +TEST_P(DocumentStoreTest, GetAllNamespaces) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -3071,7 +3080,7 @@ TEST_F(DocumentStoreTest, GetAllNamespaces) { UnorderedElementsAre("namespace1")); } -TEST_F(DocumentStoreTest, ReportUsageWithDifferentTimestampsAndGetUsageScores) { +TEST_P(DocumentStoreTest, ReportUsageWithDifferentTimestampsAndGetUsageScores) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -3163,7 +3172,7 @@ TEST_F(DocumentStoreTest, ReportUsageWithDifferentTimestampsAndGetUsageScores) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, ReportUsageWithDifferentTypesAndGetUsageScores) { +TEST_P(DocumentStoreTest, ReportUsageWithDifferentTypesAndGetUsageScores) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -3213,7 +3222,7 @@ TEST_F(DocumentStoreTest, ReportUsageWithDifferentTypesAndGetUsageScores) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, UsageScoresShouldNotBeClearedOnChecksumMismatch) { +TEST_P(DocumentStoreTest, UsageScoresShouldNotBeClearedOnChecksumMismatch) { UsageStore::UsageScores expected_scores; DocumentId document_id; { @@ -3258,7 +3267,7 @@ TEST_F(DocumentStoreTest, UsageScoresShouldNotBeClearedOnChecksumMismatch) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, UsageScoresShouldBeAvailableAfterDataLoss) { +TEST_P(DocumentStoreTest, UsageScoresShouldBeAvailableAfterDataLoss) { UsageStore::UsageScores expected_scores; DocumentId document_id; { @@ -3314,7 +3323,7 @@ TEST_F(DocumentStoreTest, UsageScoresShouldBeAvailableAfterDataLoss) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, UsageScoresShouldBeCopiedOverToUpdatedDocument) { +TEST_P(DocumentStoreTest, UsageScoresShouldBeCopiedOverToUpdatedDocument) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -3355,7 +3364,7 @@ TEST_F(DocumentStoreTest, UsageScoresShouldBeCopiedOverToUpdatedDocument) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, UsageScoresShouldPersistOnOptimize) { +TEST_P(DocumentStoreTest, UsageScoresShouldPersistOnOptimize) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -3390,8 +3399,7 @@ TEST_F(DocumentStoreTest, UsageScoresShouldPersistOnOptimize) { std::string optimized_dir = document_store_dir_ + "/optimize_test"; filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()); ICING_ASSERT_OK( - document_store->OptimizeInto(optimized_dir, lang_segmenter_.get(), - /*namespace_id_fingerprint=*/false)); + document_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); // Get optimized document store ICING_ASSERT_OK_AND_ASSIGN( @@ -3409,7 +3417,7 @@ TEST_F(DocumentStoreTest, UsageScoresShouldPersistOnOptimize) { EXPECT_THAT(actual_scores, Eq(expected_scores)); } -TEST_F(DocumentStoreTest, DetectPartialDataLoss) { +TEST_P(DocumentStoreTest, DetectPartialDataLoss) { { // Can put and delete fine. ICING_ASSERT_OK_AND_ASSIGN( @@ -3450,7 +3458,7 @@ TEST_F(DocumentStoreTest, DetectPartialDataLoss) { ASSERT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL)); } -TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { +TEST_P(DocumentStoreTest, DetectCompleteDataLoss) { int64_t corruptible_offset; const std::string document_log_file = absl_ports::StrCat( document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename()); @@ -3515,7 +3523,7 @@ TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE)); } -TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { +TEST_P(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { // The directory testdata/score_cache_without_length_in_tokens/document_store // contains only the scoring_cache and the document_store_header (holding the // crc for the scoring_cache). If the current code is compatible with the @@ -3557,7 +3565,8 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { DocumentStore::Create( &filesystem_, document_store_dir_, &fake_clock_, schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = @@ -3568,7 +3577,7 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT); } -TEST_F(DocumentStoreTest, DocumentStoreStorageInfo) { +TEST_P(DocumentStoreTest, DocumentStoreStorageInfo) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -3678,7 +3687,7 @@ TEST_F(DocumentStoreTest, DocumentStoreStorageInfo) { Eq(0)); } -TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { +TEST_P(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { // Start fresh and set the schema with one type. filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); @@ -3768,13 +3777,14 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/true, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - &initialize_stats)); + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/true, + GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); @@ -3789,7 +3799,7 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { } } -TEST_F(DocumentStoreTest, InitializeDontForceRecoveryDoesntUpdateTypeIds) { +TEST_P(DocumentStoreTest, InitializeDontForceRecoveryDoesntUpdateTypeIds) { // Start fresh and set the schema with one type. filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); @@ -3892,7 +3902,7 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryDoesntUpdateTypeIds) { } } -TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) { +TEST_P(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) { // Start fresh and set the schema with one type. filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); @@ -3987,13 +3997,14 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) { CorruptDocStoreHeaderChecksumFile(); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/true, - /*namespace_id_fingerprint=*/false, - PortableFileBackedProtoLog< - DocumentWrapper>::kDeflateCompressionLevel, - /*initialize_stats=*/nullptr)); + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/true, + GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); @@ -4005,7 +4016,7 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) { } } -TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) { +TEST_P(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) { // Start fresh and set the schema with one type. filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); @@ -4114,7 +4125,7 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) { } } -TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { +TEST_P(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { // Set up schema. SchemaProto schema = SchemaBuilder() @@ -4182,7 +4193,8 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { DocumentStore::Create( &filesystem_, document_store_dir, &fake_clock_, schema_store.get(), /*force_recovery_and_revalidate_documents=*/false, - /*namespace_id_fingerprint=*/false, + GetParam().pre_mapping_fbv, GetParam().use_persistent_hash_map, + GetParam().namespace_id_fingerprint, PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel, &initialize_stats)); std::unique_ptr<DocumentStore> document_store = @@ -4240,7 +4252,7 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { IsOkAndHolds(EqualsProto(document3))); } -TEST_F(DocumentStoreTest, GetDebugInfo) { +TEST_P(DocumentStoreTest, GetDebugInfo) { SchemaProto schema = SchemaBuilder() .AddType(SchemaTypeConfigBuilder() @@ -4365,7 +4377,7 @@ TEST_F(DocumentStoreTest, GetDebugInfo) { EXPECT_THAT(out3.corpus_info(), IsEmpty()); } -TEST_F(DocumentStoreTest, GetDebugInfoWithoutSchema) { +TEST_P(DocumentStoreTest, GetDebugInfoWithoutSchema) { std::string schema_store_dir = schema_store_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); @@ -4389,7 +4401,7 @@ TEST_F(DocumentStoreTest, GetDebugInfoWithoutSchema) { EXPECT_THAT(out.corpus_info(), IsEmpty()); } -TEST_F(DocumentStoreTest, GetDebugInfoForEmptyDocumentStore) { +TEST_P(DocumentStoreTest, GetDebugInfoForEmptyDocumentStore) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_, @@ -4406,6 +4418,198 @@ TEST_F(DocumentStoreTest, GetDebugInfoForEmptyDocumentStore) { EXPECT_THAT(out.corpus_info(), IsEmpty()); } +TEST_P(DocumentStoreTest, SwitchKeyMapperTypeShouldRegenerateDerivedFiles) { + std::string dynamic_trie_uri_mapper_dir = + document_store_dir_ + "/key_mapper_dir"; + std::string persistent_hash_map_uri_mapper_dir = + document_store_dir_ + "/uri_mapper"; + DocumentId document_id1; + { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + GetParam().namespace_id_fingerprint, + GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); + + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + ICING_ASSERT_OK_AND_ASSIGN(document_id1, doc_store->Put(test_document1_)); + + if (GetParam().use_persistent_hash_map) { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsTrue()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsFalse()); + } else { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsFalse()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsTrue()); + } + } + + // Switch key mapper. We should get I/O error and derived files should be + // regenerated. + { + bool switch_key_mapper_flag = !GetParam().use_persistent_hash_map; + InitializeStatsProto initialize_stats; + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv, + /*use_persistent_hash_map=*/switch_key_mapper_flag, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + &initialize_stats)); + EXPECT_THAT(initialize_stats.document_store_recovery_cause(), + Eq(InitializeStatsProto::IO_ERROR)); + + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + EXPECT_THAT(doc_store->GetDocumentId(test_document1_.namespace_(), + test_document1_.uri()), + IsOkAndHolds(document_id1)); + + if (switch_key_mapper_flag) { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsTrue()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsFalse()); + } else { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsFalse()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsTrue()); + } + } +} + +TEST_P(DocumentStoreTest, SameKeyMapperTypeShouldNotRegenerateDerivedFiles) { + std::string dynamic_trie_uri_mapper_dir = + document_store_dir_ + "/key_mapper_dir"; + std::string persistent_hash_map_uri_mapper_dir = + document_store_dir_ + "/uri_mapper"; + DocumentId document_id1; + { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + GetParam().namespace_id_fingerprint, + GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); + + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + ICING_ASSERT_OK_AND_ASSIGN(document_id1, doc_store->Put(test_document1_)); + + if (GetParam().use_persistent_hash_map) { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsTrue()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsFalse()); + } else { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsFalse()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsTrue()); + } + } + + // Use the same key mapper type. Derived files should not be regenerated. + { + InitializeStatsProto initialize_stats; + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + GetParam().namespace_id_fingerprint, + GetParam().pre_mapping_fbv, + GetParam().use_persistent_hash_map, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + &initialize_stats)); + EXPECT_THAT(initialize_stats.document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + EXPECT_THAT(doc_store->GetDocumentId(test_document1_.namespace_(), + test_document1_.uri()), + IsOkAndHolds(document_id1)); + + if (GetParam().use_persistent_hash_map) { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsTrue()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsFalse()); + } else { + EXPECT_THAT(filesystem_.DirectoryExists( + persistent_hash_map_uri_mapper_dir.c_str()), + IsFalse()); + EXPECT_THAT( + filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()), + IsTrue()); + } + } +} + +INSTANTIATE_TEST_SUITE_P( + DocumentStoreTest, DocumentStoreTest, + testing::Values( + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/false, + /*pre_mapping_fbv_in=*/false, + /*use_persistent_hash_map_in=*/false), + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/true, + /*pre_mapping_fbv_in=*/false, + /*use_persistent_hash_map_in=*/false), + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/false, + /*pre_mapping_fbv_in=*/true, + /*use_persistent_hash_map_in=*/false), + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/true, + /*pre_mapping_fbv_in=*/true, + /*use_persistent_hash_map_in=*/false), + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/false, + /*pre_mapping_fbv_in=*/false, + /*use_persistent_hash_map_in=*/true), + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/true, + /*pre_mapping_fbv_in=*/false, + /*use_persistent_hash_map_in=*/true), + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/false, + /*pre_mapping_fbv_in=*/true, + /*use_persistent_hash_map_in=*/true), + DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/true, + /*pre_mapping_fbv_in=*/true, + /*use_persistent_hash_map_in=*/true))); + } // namespace } // namespace lib diff --git a/proto/icing/proto/initialize.proto b/proto/icing/proto/initialize.proto index d4b1aee..507e745 100644 --- a/proto/icing/proto/initialize.proto +++ b/proto/icing/proto/initialize.proto @@ -23,7 +23,7 @@ option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; option objc_class_prefix = "ICNG"; -// Next tag: 11 +// Next tag: 12 message IcingSearchEngineOptions { // Directory to persist files for Icing. Required. // If Icing was previously initialized with this directory, it will reload @@ -104,6 +104,9 @@ message IcingSearchEngineOptions { // to dynamic trie key mapper). optional bool use_persistent_hash_map = 10; + // Integer index bucket split threshold. + optional int32 integer_index_bucket_split_threshold = 11 [default = 65536]; + reserved 2; } diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index 20e7738..9a7bd49 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=544124118) +set(synced_AOSP_CL_number=546080947) |