diff options
author | Xin Li <delphij@google.com> | 2024-01-17 22:13:52 -0800 |
---|---|---|
committer | Xin Li <delphij@google.com> | 2024-01-17 22:13:52 -0800 |
commit | b6d2d80d87a6b4096bb32833cc6ac67295713f55 (patch) | |
tree | 9892cbbd0fb247ce252a38f258e33741ca025a45 /icing/join/qualified-id-join-index-impl-v2.cc | |
parent | 95f4ed2f5a7365814321253aea37ce9cd9572dc6 (diff) | |
parent | ad9e08ff59020f5b52410dc093061d8ef715a781 (diff) | |
download | icing-b6d2d80d87a6b4096bb32833cc6ac67295713f55.tar.gz |
Merge Android 24Q1 Release (ab/11220357)temp_319669529
Bug: 319669529
Merged-In: I9f448463ecaae2b44ce31833a90d9672c33907b1
Change-Id: I986936f038c20a9b2579d658eea5176ede545011
Diffstat (limited to 'icing/join/qualified-id-join-index-impl-v2.cc')
-rw-r--r-- | icing/join/qualified-id-join-index-impl-v2.cc | 681 |
1 files changed, 681 insertions, 0 deletions
diff --git a/icing/join/qualified-id-join-index-impl-v2.cc b/icing/join/qualified-id-join-index-impl-v2.cc new file mode 100644 index 0000000..70fd13c --- /dev/null +++ b/icing/join/qualified-id-join-index-impl-v2.cc @@ -0,0 +1,681 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/join/qualified-id-join-index-impl-v2.h" + +#include <algorithm> +#include <cstdint> +#include <memory> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/file/destructible-directory.h" +#include "icing/file/filesystem.h" +#include "icing/file/posting_list/flash-index-storage.h" +#include "icing/file/posting_list/posting-list-accessor.h" +#include "icing/file/posting_list/posting-list-identifier.h" +#include "icing/join/document-id-to-join-info.h" +#include "icing/join/posting-list-join-data-accessor.h" +#include "icing/join/posting-list-join-data-serializer.h" +#include "icing/join/qualified-id-join-index.h" +#include "icing/schema/joinable-property.h" +#include "icing/store/document-filter-data.h" +#include "icing/store/document-id.h" +#include "icing/store/key-mapper.h" +#include "icing/store/namespace-fingerprint-identifier.h" +#include "icing/store/namespace-id.h" +#include "icing/store/persistent-hash-map-key-mapper.h" +#include "icing/util/crc32.h" +#include "icing/util/encode-util.h" +#include "icing/util/logging.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +// Set 1M for max # of qualified id entries and 10 bytes for key-value bytes. +// This will take at most 23 MiB disk space and mmap for persistent hash map. +static constexpr int32_t kSchemaJoinableIdToPostingListMapperMaxNumEntries = + 1 << 20; +static constexpr int32_t kSchemaJoinableIdToPostingListMapperAverageKVByteSize = + 10; + +inline DocumentId GetNewDocumentId( + const std::vector<DocumentId>& document_id_old_to_new, + DocumentId old_document_id) { + if (old_document_id >= document_id_old_to_new.size()) { + return kInvalidDocumentId; + } + return document_id_old_to_new[old_document_id]; +} + +inline NamespaceId GetNewNamespaceId( + const std::vector<NamespaceId>& namespace_id_old_to_new, + NamespaceId namespace_id) { + if (namespace_id >= namespace_id_old_to_new.size()) { + return kInvalidNamespaceId; + } + return namespace_id_old_to_new[namespace_id]; +} + +libtextclassifier3::StatusOr<PostingListIdentifier> GetPostingListIdentifier( + const KeyMapper<PostingListIdentifier>& + schema_joinable_id_to_posting_list_mapper, + const std::string& encoded_schema_type_joinable_property_id_str) { + auto posting_list_identifier_or = + schema_joinable_id_to_posting_list_mapper.Get( + encoded_schema_type_joinable_property_id_str); + if (!posting_list_identifier_or.ok()) { + if (absl_ports::IsNotFound(posting_list_identifier_or.status())) { + // Not found. Return invalid posting list id. + return PostingListIdentifier::kInvalid; + } + // Real error. + return posting_list_identifier_or; + } + return std::move(posting_list_identifier_or).ValueOrDie(); +} + +libtextclassifier3::StatusOr<std::string> EncodeSchemaTypeJoinablePropertyId( + SchemaTypeId schema_type_id, JoinablePropertyId joinable_property_id) { + if (schema_type_id < 0) { + return absl_ports::InvalidArgumentError("Invalid schema type id"); + } + + if (!IsJoinablePropertyIdValid(joinable_property_id)) { + return absl_ports::InvalidArgumentError("Invalid joinable property id"); + } + + static constexpr int kEncodedSchemaTypeIdLength = 3; + + // encoded_schema_type_id_str should be 1 to 3 bytes based on the value of + // schema_type_id. + std::string encoded_schema_type_id_str = + encode_util::EncodeIntToCString(schema_type_id); + // Make encoded_schema_type_id_str to fixed kEncodedSchemaTypeIdLength bytes. + while (encoded_schema_type_id_str.size() < kEncodedSchemaTypeIdLength) { + // C string cannot contain 0 bytes, so we append it using 1, just like what + // we do in encode_util::EncodeIntToCString. + // + // The reason that this works is because DecodeIntToString decodes a byte + // value of 0x01 as 0x00. When EncodeIntToCString returns an encoded + // schema type id that is less than 3 bytes, it means that the id contains + // unencoded leading 0x00. So here we're explicitly encoding those bytes as + // 0x01. + encoded_schema_type_id_str.push_back(1); + } + + return absl_ports::StrCat( + encoded_schema_type_id_str, + encode_util::EncodeIntToCString(joinable_property_id)); +} + +std::string GetMetadataFilePath(std::string_view working_path) { + return absl_ports::StrCat(working_path, "/metadata"); +} + +std::string GetSchemaJoinableIdToPostingListMapperPath( + std::string_view working_path) { + return absl_ports::StrCat(working_path, + "/schema_joinable_id_to_posting_list_mapper"); +} + +std::string GetFlashIndexStorageFilePath(std::string_view working_path) { + return absl_ports::StrCat(working_path, "/flash_index_storage"); +} + +} // namespace + +libtextclassifier3::Status +QualifiedIdJoinIndexImplV2::JoinDataIterator::Advance() { + if (pl_accessor_ == nullptr) { + return absl_ports::ResourceExhaustedError("End of iterator"); + } + + if (!should_retrieve_next_batch_) { + // In this case, cached_batch_join_data_ is not empty (contains some data + // fetched in the previous round), so move curr_ to the next position and + // check if we have to fetch the next batch. + // + // Note: in the 1st round, should_retrieve_next_batch_ is true, so this part + // will never be executed. + ++curr_; + should_retrieve_next_batch_ = curr_ >= cached_batch_join_data_.cend(); + } + + if (should_retrieve_next_batch_) { + // Fetch next batch if needed. + ICING_RETURN_IF_ERROR(GetNextDataBatch()); + should_retrieve_next_batch_ = false; + } + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status +QualifiedIdJoinIndexImplV2::JoinDataIterator::GetNextDataBatch() { + auto cached_batch_join_data_or = pl_accessor_->GetNextDataBatch(); + if (!cached_batch_join_data_or.ok()) { + ICING_LOG(WARNING) + << "Fail to get next batch data from posting list due to: " + << cached_batch_join_data_or.status().error_message(); + return std::move(cached_batch_join_data_or).status(); + } + + cached_batch_join_data_ = std::move(cached_batch_join_data_or).ValueOrDie(); + curr_ = cached_batch_join_data_.cbegin(); + + if (cached_batch_join_data_.empty()) { + return absl_ports::ResourceExhaustedError("End of iterator"); + } + + return libtextclassifier3::Status::OK; +} + +/* static */ libtextclassifier3::StatusOr< + std::unique_ptr<QualifiedIdJoinIndexImplV2>> +QualifiedIdJoinIndexImplV2::Create(const Filesystem& filesystem, + std::string working_path, + bool pre_mapping_fbv) { + if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str()) || + !filesystem.DirectoryExists( + GetSchemaJoinableIdToPostingListMapperPath(working_path).c_str()) || + !filesystem.FileExists( + GetFlashIndexStorageFilePath(working_path).c_str())) { + // Discard working_path if any file/directory is missing, and reinitialize. + if (filesystem.DirectoryExists(working_path.c_str())) { + ICING_RETURN_IF_ERROR( + QualifiedIdJoinIndex::Discard(filesystem, working_path)); + } + return InitializeNewFiles(filesystem, std::move(working_path), + pre_mapping_fbv); + } + return InitializeExistingFiles(filesystem, std::move(working_path), + pre_mapping_fbv); +} + +QualifiedIdJoinIndexImplV2::~QualifiedIdJoinIndexImplV2() { + if (!PersistToDisk().ok()) { + ICING_LOG(WARNING) << "Failed to persist qualified id join index (v2) to " + "disk while destructing " + << working_path_; + } +} + +libtextclassifier3::Status QualifiedIdJoinIndexImplV2::Put( + SchemaTypeId schema_type_id, JoinablePropertyId joinable_property_id, + DocumentId document_id, + std::vector<NamespaceFingerprintIdentifier>&& + ref_namespace_fingerprint_ids) { + std::sort(ref_namespace_fingerprint_ids.begin(), + ref_namespace_fingerprint_ids.end()); + + // Dedupe. + auto last = std::unique(ref_namespace_fingerprint_ids.begin(), + ref_namespace_fingerprint_ids.end()); + ref_namespace_fingerprint_ids.erase(last, + ref_namespace_fingerprint_ids.end()); + if (ref_namespace_fingerprint_ids.empty()) { + return libtextclassifier3::Status::OK; + } + + SetDirty(); + ICING_ASSIGN_OR_RETURN( + std::string encoded_schema_type_joinable_property_id_str, + EncodeSchemaTypeJoinablePropertyId(schema_type_id, joinable_property_id)); + + ICING_ASSIGN_OR_RETURN( + PostingListIdentifier posting_list_identifier, + GetPostingListIdentifier(*schema_joinable_id_to_posting_list_mapper_, + encoded_schema_type_joinable_property_id_str)); + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor; + if (posting_list_identifier.is_valid()) { + ICING_ASSIGN_OR_RETURN( + pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting( + flash_index_storage_.get(), posting_list_serializer_.get(), + posting_list_identifier)); + } else { + ICING_ASSIGN_OR_RETURN( + pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::Create( + flash_index_storage_.get(), posting_list_serializer_.get())); + } + + // Prepend join data into posting list. + for (const NamespaceFingerprintIdentifier& ref_namespace_fingerprint_id : + ref_namespace_fingerprint_ids) { + ICING_RETURN_IF_ERROR(pl_accessor->PrependData( + DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>( + document_id, ref_namespace_fingerprint_id))); + } + + // Finalize the posting list and update mapper. + PostingListAccessor::FinalizeResult result = + std::move(*pl_accessor).Finalize(); + if (!result.status.ok()) { + return result.status; + } + if (!result.id.is_valid()) { + return absl_ports::InternalError("Fail to flush data into posting list(s)"); + } + ICING_RETURN_IF_ERROR(schema_joinable_id_to_posting_list_mapper_->Put( + encoded_schema_type_joinable_property_id_str, result.id)); + + // Update info. + info().num_data += ref_namespace_fingerprint_ids.size(); + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::StatusOr< + std::unique_ptr<QualifiedIdJoinIndex::JoinDataIteratorBase>> +QualifiedIdJoinIndexImplV2::GetIterator( + SchemaTypeId schema_type_id, + JoinablePropertyId joinable_property_id) const { + ICING_ASSIGN_OR_RETURN( + std::string encoded_schema_type_joinable_property_id_str, + EncodeSchemaTypeJoinablePropertyId(schema_type_id, joinable_property_id)); + + ICING_ASSIGN_OR_RETURN( + PostingListIdentifier posting_list_identifier, + GetPostingListIdentifier(*schema_joinable_id_to_posting_list_mapper_, + encoded_schema_type_joinable_property_id_str)); + + if (!posting_list_identifier.is_valid()) { + return std::make_unique<JoinDataIterator>(nullptr); + } + + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting( + flash_index_storage_.get(), posting_list_serializer_.get(), + posting_list_identifier)); + + return std::make_unique<JoinDataIterator>(std::move(pl_accessor)); +} + +libtextclassifier3::Status QualifiedIdJoinIndexImplV2::Optimize( + const std::vector<DocumentId>& document_id_old_to_new, + const std::vector<NamespaceId>& namespace_id_old_to_new, + DocumentId new_last_added_document_id) { + std::string temp_working_path = working_path_ + "_temp"; + ICING_RETURN_IF_ERROR( + QualifiedIdJoinIndex::Discard(filesystem_, temp_working_path)); + + DestructibleDirectory temp_working_path_ddir(&filesystem_, + std::move(temp_working_path)); + if (!temp_working_path_ddir.is_valid()) { + return absl_ports::InternalError( + "Unable to create temp directory to build new qualified id join index " + "(v2)"); + } + + { + // Transfer all data from the current to new qualified id join index. Also + // PersistToDisk and destruct the instance after finishing, so we can safely + // swap directories later. + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<QualifiedIdJoinIndexImplV2> new_index, + Create(filesystem_, temp_working_path_ddir.dir(), pre_mapping_fbv_)); + ICING_RETURN_IF_ERROR(TransferIndex( + document_id_old_to_new, namespace_id_old_to_new, new_index.get())); + new_index->set_last_added_document_id(new_last_added_document_id); + ICING_RETURN_IF_ERROR(new_index->PersistToDisk()); + } + + // Destruct current index's storage instances to safely swap directories. + // TODO(b/268521214): handle delete propagation storage + schema_joinable_id_to_posting_list_mapper_.reset(); + flash_index_storage_.reset(); + + if (!filesystem_.SwapFiles(temp_working_path_ddir.dir().c_str(), + working_path_.c_str())) { + return absl_ports::InternalError( + "Unable to apply new qualified id join index (v2) due to failed swap"); + } + + // Reinitialize qualified id join index. + if (!filesystem_.PRead(GetMetadataFilePath(working_path_).c_str(), + metadata_buffer_.get(), kMetadataFileSize, + /*offset=*/0)) { + return absl_ports::InternalError("Fail to read metadata file"); + } + ICING_ASSIGN_OR_RETURN( + schema_joinable_id_to_posting_list_mapper_, + PersistentHashMapKeyMapper<PostingListIdentifier>::Create( + filesystem_, + GetSchemaJoinableIdToPostingListMapperPath(working_path_), + pre_mapping_fbv_, + /*max_num_entries=*/ + kSchemaJoinableIdToPostingListMapperMaxNumEntries, + /*average_kv_byte_size=*/ + kSchemaJoinableIdToPostingListMapperAverageKVByteSize)); + ICING_ASSIGN_OR_RETURN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_path_), + &filesystem_, posting_list_serializer_.get())); + flash_index_storage_ = + std::make_unique<FlashIndexStorage>(std::move(flash_index_storage)); + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status QualifiedIdJoinIndexImplV2::Clear() { + SetDirty(); + + schema_joinable_id_to_posting_list_mapper_.reset(); + // Discard and reinitialize schema_joinable_id_to_posting_list_mapper. + std::string schema_joinable_id_to_posting_list_mapper_path = + GetSchemaJoinableIdToPostingListMapperPath(working_path_); + ICING_RETURN_IF_ERROR( + PersistentHashMapKeyMapper<PostingListIdentifier>::Delete( + filesystem_, schema_joinable_id_to_posting_list_mapper_path)); + ICING_ASSIGN_OR_RETURN( + schema_joinable_id_to_posting_list_mapper_, + PersistentHashMapKeyMapper<PostingListIdentifier>::Create( + filesystem_, + std::move(schema_joinable_id_to_posting_list_mapper_path), + pre_mapping_fbv_, + /*max_num_entries=*/ + kSchemaJoinableIdToPostingListMapperMaxNumEntries, + /*average_kv_byte_size=*/ + kSchemaJoinableIdToPostingListMapperAverageKVByteSize)); + + // Discard and reinitialize flash_index_storage. + flash_index_storage_.reset(); + if (!filesystem_.DeleteFile( + GetFlashIndexStorageFilePath(working_path_).c_str())) { + return absl_ports::InternalError("Fail to delete flash index storage file"); + } + ICING_ASSIGN_OR_RETURN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_path_), + &filesystem_, posting_list_serializer_.get())); + flash_index_storage_ = + std::make_unique<FlashIndexStorage>(std::move(flash_index_storage)); + + // TODO(b/268521214): clear delete propagation storage + + info().num_data = 0; + info().last_added_document_id = kInvalidDocumentId; + return libtextclassifier3::Status::OK; +} + +/* static */ libtextclassifier3::StatusOr< + std::unique_ptr<QualifiedIdJoinIndexImplV2>> +QualifiedIdJoinIndexImplV2::InitializeNewFiles(const Filesystem& filesystem, + std::string&& working_path, + bool pre_mapping_fbv) { + // Create working directory. + if (!filesystem.CreateDirectoryRecursively(working_path.c_str())) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to create directory: ", working_path)); + } + + // Initialize schema_joinable_id_to_posting_list_mapper + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<KeyMapper<PostingListIdentifier>> + schema_joinable_id_to_posting_list_mapper, + PersistentHashMapKeyMapper<PostingListIdentifier>::Create( + filesystem, GetSchemaJoinableIdToPostingListMapperPath(working_path), + pre_mapping_fbv, + /*max_num_entries=*/ + kSchemaJoinableIdToPostingListMapperMaxNumEntries, + /*average_kv_byte_size=*/ + kSchemaJoinableIdToPostingListMapperAverageKVByteSize)); + + // Initialize flash_index_storage + auto posting_list_serializer = + std::make_unique<PostingListJoinDataSerializer<JoinDataType>>(); + ICING_ASSIGN_OR_RETURN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_path), + &filesystem, posting_list_serializer.get())); + + // Create instance. + auto new_join_index = std::unique_ptr<QualifiedIdJoinIndexImplV2>( + new QualifiedIdJoinIndexImplV2( + filesystem, std::move(working_path), + /*metadata_buffer=*/std::make_unique<uint8_t[]>(kMetadataFileSize), + std::move(schema_joinable_id_to_posting_list_mapper), + std::move(posting_list_serializer), + std::make_unique<FlashIndexStorage>(std::move(flash_index_storage)), + pre_mapping_fbv)); + // Initialize info content. + new_join_index->info().magic = Info::kMagic; + new_join_index->info().num_data = 0; + new_join_index->info().last_added_document_id = kInvalidDocumentId; + // Initialize new PersistentStorage. The initial checksums will be computed + // and set via InitializeNewStorage. + ICING_RETURN_IF_ERROR(new_join_index->InitializeNewStorage()); + + return new_join_index; +} + +/* static */ libtextclassifier3::StatusOr< + std::unique_ptr<QualifiedIdJoinIndexImplV2>> +QualifiedIdJoinIndexImplV2::InitializeExistingFiles( + const Filesystem& filesystem, std::string&& working_path, + bool pre_mapping_fbv) { + // PRead metadata file. + auto metadata_buffer = std::make_unique<uint8_t[]>(kMetadataFileSize); + if (!filesystem.PRead(GetMetadataFilePath(working_path).c_str(), + metadata_buffer.get(), kMetadataFileSize, + /*offset=*/0)) { + return absl_ports::InternalError("Fail to read metadata file"); + } + + // Initialize schema_joinable_id_to_posting_list_mapper + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<KeyMapper<PostingListIdentifier>> + schema_joinable_id_to_posting_list_mapper, + PersistentHashMapKeyMapper<PostingListIdentifier>::Create( + filesystem, GetSchemaJoinableIdToPostingListMapperPath(working_path), + pre_mapping_fbv, + /*max_num_entries=*/ + kSchemaJoinableIdToPostingListMapperMaxNumEntries, + /*average_kv_byte_size=*/ + kSchemaJoinableIdToPostingListMapperAverageKVByteSize)); + + // Initialize flash_index_storage + auto posting_list_serializer = + std::make_unique<PostingListJoinDataSerializer<JoinDataType>>(); + ICING_ASSIGN_OR_RETURN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_path), + &filesystem, posting_list_serializer.get())); + + // Create instance. + auto join_index = std::unique_ptr<QualifiedIdJoinIndexImplV2>( + new QualifiedIdJoinIndexImplV2( + filesystem, std::move(working_path), std::move(metadata_buffer), + std::move(schema_joinable_id_to_posting_list_mapper), + std::move(posting_list_serializer), + std::make_unique<FlashIndexStorage>(std::move(flash_index_storage)), + pre_mapping_fbv)); + // Initialize existing PersistentStorage. Checksums will be validated. + ICING_RETURN_IF_ERROR(join_index->InitializeExistingStorage()); + + // Validate magic. + if (join_index->info().magic != Info::kMagic) { + return absl_ports::FailedPreconditionError("Incorrect magic value"); + } + + return join_index; +} + +libtextclassifier3::Status QualifiedIdJoinIndexImplV2::TransferIndex( + const std::vector<DocumentId>& document_id_old_to_new, + const std::vector<NamespaceId>& namespace_id_old_to_new, + QualifiedIdJoinIndexImplV2* new_index) const { + std::unique_ptr<KeyMapper<PostingListIdentifier>::Iterator> iter = + schema_joinable_id_to_posting_list_mapper_->GetIterator(); + + // Iterate through all (schema_type_id, joinable_property_id). + while (iter->Advance()) { + PostingListIdentifier old_pl_id = iter->GetValue(); + if (!old_pl_id.is_valid()) { + // Skip invalid posting list id. + continue; + } + + // Read all join data from old posting lists and convert to new join data + // with new document id, namespace id. + std::vector<JoinDataType> new_join_data_vec; + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> + old_pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting( + flash_index_storage_.get(), posting_list_serializer_.get(), + old_pl_id)); + ICING_ASSIGN_OR_RETURN(std::vector<JoinDataType> batch_old_join_data, + old_pl_accessor->GetNextDataBatch()); + while (!batch_old_join_data.empty()) { + for (const JoinDataType& old_join_data : batch_old_join_data) { + DocumentId new_document_id = GetNewDocumentId( + document_id_old_to_new, old_join_data.document_id()); + NamespaceId new_ref_namespace_id = GetNewNamespaceId( + namespace_id_old_to_new, old_join_data.join_info().namespace_id()); + + // Transfer if the document and namespace are not deleted or outdated. + if (new_document_id != kInvalidDocumentId && + new_ref_namespace_id != kInvalidNamespaceId) { + // We can reuse the fingerprint from old_join_data, since document uri + // (and its fingerprint) will never change. + new_join_data_vec.push_back(JoinDataType( + new_document_id, NamespaceFingerprintIdentifier( + new_ref_namespace_id, + old_join_data.join_info().fingerprint()))); + } + } + ICING_ASSIGN_OR_RETURN(batch_old_join_data, + old_pl_accessor->GetNextDataBatch()); + } + + if (new_join_data_vec.empty()) { + continue; + } + + // NamespaceId order may change, so we have to sort the vector. + std::sort(new_join_data_vec.begin(), new_join_data_vec.end()); + + // Create new posting list in new_index and prepend all new join data into + // it. + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> + new_pl_accessor, + PostingListJoinDataAccessor<JoinDataType>::Create( + new_index->flash_index_storage_.get(), + new_index->posting_list_serializer_.get())); + for (const JoinDataType& new_join_data : new_join_data_vec) { + ICING_RETURN_IF_ERROR(new_pl_accessor->PrependData(new_join_data)); + } + + // Finalize the posting list and update mapper of new_index. + PostingListAccessor::FinalizeResult result = + std::move(*new_pl_accessor).Finalize(); + if (!result.status.ok()) { + return result.status; + } + if (!result.id.is_valid()) { + return absl_ports::InternalError( + "Fail to flush data into posting list(s)"); + } + ICING_RETURN_IF_ERROR( + new_index->schema_joinable_id_to_posting_list_mapper_->Put( + iter->GetKey(), result.id)); + + // Update info. + new_index->info().num_data += new_join_data_vec.size(); + } + + // TODO(b/268521214): transfer delete propagation storage + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status QualifiedIdJoinIndexImplV2::PersistMetadataToDisk( + bool force) { + if (!force && !is_info_dirty() && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + + std::string metadata_file_path = GetMetadataFilePath(working_path_); + + ScopedFd sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + if (!sfd.is_valid()) { + return absl_ports::InternalError("Fail to open metadata file for write"); + } + + if (!filesystem_.PWrite(sfd.get(), /*offset=*/0, metadata_buffer_.get(), + kMetadataFileSize)) { + return absl_ports::InternalError("Fail to write metadata file"); + } + + if (!filesystem_.DataSync(sfd.get())) { + return absl_ports::InternalError("Fail to sync metadata to disk"); + } + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status QualifiedIdJoinIndexImplV2::PersistStoragesToDisk( + bool force) { + if (!force && !is_storage_dirty()) { + return libtextclassifier3::Status::OK; + } + + ICING_RETURN_IF_ERROR( + schema_joinable_id_to_posting_list_mapper_->PersistToDisk()); + if (!flash_index_storage_->PersistToDisk()) { + return absl_ports::InternalError( + "Fail to persist FlashIndexStorage to disk"); + } + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::StatusOr<Crc32> +QualifiedIdJoinIndexImplV2::ComputeInfoChecksum(bool force) { + if (!force && !is_info_dirty()) { + return Crc32(crcs().component_crcs.info_crc); + } + + return info().ComputeChecksum(); +} + +libtextclassifier3::StatusOr<Crc32> +QualifiedIdJoinIndexImplV2::ComputeStoragesChecksum(bool force) { + if (!force && !is_storage_dirty()) { + return Crc32(crcs().component_crcs.storages_crc); + } + + ICING_ASSIGN_OR_RETURN( + Crc32 schema_joinable_id_to_posting_list_mapper_crc, + schema_joinable_id_to_posting_list_mapper_->ComputeChecksum()); + + return Crc32(schema_joinable_id_to_posting_list_mapper_crc.Get()); +} + +} // namespace lib +} // namespace icing |