diff options
author | Tim Barron <tjbarron@google.com> | 2022-08-11 17:05:22 -0700 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2022-08-11 17:05:22 -0700 |
commit | 87267cbc5531600072a283ba0c9500c3fcac87af (patch) | |
tree | 2ec5c19afb1f4d5ee229d0619c25b2f2819ccea0 /icing/file | |
parent | 7c93c404e1fb4ed5e35326245ebc820ed774c6b2 (diff) | |
download | icing-87267cbc5531600072a283ba0c9500c3fcac87af.tar.gz |
Sync from upstream.
Descriptions:
======================================================================
Implement new version of ResultState and ResultStateManager to 1)
enforce a page byte size limit and 2) improve handling of pagination
when we encounter deleted documents.
======================================================================
Fix bugs in IcingDynamicTrie::Delete.
======================================================================
Implement IcingDynamicTrie::IsBranchingTerm.
======================================================================
Change Icing default logging level to INFO
======================================================================
Refactor KeyMapper class to be an interface.
======================================================================
Improve NamespaceChecker logic to improve Suggest latency.
======================================================================
Change icing native log tag to "AppSearchIcing"
======================================================================
Implement Index Compaction rather than rebuilding index during
Compaction.
======================================================================
Implement reverse iterator for IcingDynamicTrie
======================================================================
Avoid adding unnecessary branch points during index compaction
======================================================================
Invalidate expired result states when adding to/retrieving from
ResultStateManager.
======================================================================
Add new methods (MutableView, MutableArrayView, Append, Allocate) to
FileBackedVector
======================================================================
Create and implement PersistentHashMap class.
======================================================================
Implement RFC822 Tokenizer
======================================================================
Remove uses of StringPrintf in ICING_LOG statements
======================================================================
Properly set query latency when an error is encountered or results are
empty.
======================================================================
Bug: 146903474
Bug: 152934343
Bug: 193919210
Bug: 193453081
Bug: 231368517
Bug: 235395538
Bug: 236412165
Change-Id: I8aa278cebb12b25b39deb0ef584c0f198952659d
Diffstat (limited to 'icing/file')
-rw-r--r-- | icing/file/file-backed-bitmap.cc | 3 | ||||
-rw-r--r-- | icing/file/file-backed-vector.h | 496 | ||||
-rw-r--r-- | icing/file/file-backed-vector_test.cc | 579 | ||||
-rw-r--r-- | icing/file/filesystem.cc | 97 | ||||
-rw-r--r-- | icing/file/persistent-hash-map.cc | 534 | ||||
-rw-r--r-- | icing/file/persistent-hash-map.h | 383 | ||||
-rw-r--r-- | icing/file/persistent-hash-map_test.cc | 662 |
7 files changed, 2558 insertions, 196 deletions
diff --git a/icing/file/file-backed-bitmap.cc b/icing/file/file-backed-bitmap.cc index eec7668..a8231e3 100644 --- a/icing/file/file-backed-bitmap.cc +++ b/icing/file/file-backed-bitmap.cc @@ -269,8 +269,7 @@ libtextclassifier3::Status FileBackedBitmap::GrowTo(int new_num_bits) { return status; } - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "Grew file %s to new size %zd", file_path_.c_str(), new_file_size); + ICING_VLOG(1) << "Grew file " << file_path_ << " to new size " << new_file_size; mutable_header()->state = Header::ChecksumState::kStale; return libtextclassifier3::Status::OK; } diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h index 7e42e32..bcfbbdd 100644 --- a/icing/file/file-backed-vector.h +++ b/icing/file/file-backed-vector.h @@ -58,8 +58,12 @@ #include <sys/mman.h> +#include <algorithm> #include <cinttypes> #include <cstdint> +#include <cstring> +#include <functional> +#include <limits> #include <memory> #include <string> #include <utility> @@ -83,6 +87,9 @@ namespace lib { template <typename T> class FileBackedVector { public: + class MutableArrayView; + class MutableView; + // Header stored at the beginning of the file before the rest of the vector // elements. Stores metadata on the vector. struct Header { @@ -133,15 +140,24 @@ class FileBackedVector { kHeaderChecksumOffset, ""); - Crc32 crc; - std::string_view header_str( - reinterpret_cast<const char*>(this), - offsetof(FileBackedVector::Header, header_checksum)); - crc.Append(header_str); - return crc.Get(); + return Crc32(std::string_view( + reinterpret_cast<const char*>(this), + offsetof(FileBackedVector::Header, header_checksum))) + .Get(); } }; + // Absolute max file size for FileBackedVector. Note that Android has a + // (2^31-1)-byte single file size limit, so kMaxFileSize is 2^31-1. + static constexpr int32_t kMaxFileSize = + std::numeric_limits<int32_t>::max(); // 2^31-1 Bytes, ~2.1 GB; + + // Size of element type T. The value is same as sizeof(T), while we should + // avoid using sizeof(T) in our codebase to prevent unexpected unsigned + // integer casting. + static constexpr int32_t kElementTypeSize = static_cast<int32_t>(sizeof(T)); + static_assert(sizeof(T) <= (1 << 10)); + // Creates a new FileBackedVector to read/write content to. // // filesystem: Object to make system level calls @@ -149,15 +165,20 @@ class FileBackedVector { // within a directory that already exists. // mmap_strategy : Strategy/optimizations to access the content in the vector, // see MemoryMappedFile::Strategy for more details + // max_file_size: Maximum file size for FileBackedVector, default + // kMaxFileSize. See max_file_size_ and kMaxFileSize for more + // details. // // Return: // FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored // checksum. // INTERNAL_ERROR on I/O errors. + // INVALID_ARGUMENT_ERROR if max_file_size is incorrect. // UNIMPLEMENTED_ERROR if created with strategy READ_WRITE_MANUAL_SYNC. static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>> Create(const Filesystem& filesystem, const std::string& file_path, - MemoryMappedFile::Strategy mmap_strategy); + MemoryMappedFile::Strategy mmap_strategy, + int32_t max_file_size = kMaxFileSize); // Deletes the FileBackedVector // @@ -184,13 +205,13 @@ class FileBackedVector { // referencing the now-invalidated region. // // Returns: - // OUT_OF_RANGE_ERROR if idx < 0 or > num_elements() + // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements() libtextclassifier3::StatusOr<T> GetCopy(int32_t idx) const; - // Gets a pointer to the element at idx. + // Gets an immutable pointer to the element at idx. // - // WARNING: Subsequent calls to Set may invalidate the pointer returned by - // Get. + // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the pointer + // returned by Get. // // This is useful if you do not think the FileBackedVector will grow before // you need to reference this value, and you want to avoid a copy. When the @@ -198,27 +219,102 @@ class FileBackedVector { // which will invalidate this pointer to the previously mapped region. // // Returns: - // OUT_OF_RANGE_ERROR if idx < 0 or > num_elements() + // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements() libtextclassifier3::StatusOr<const T*> Get(int32_t idx) const; + // Gets a MutableView to the element at idx. + // + // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the + // reference returned by MutableView::Get(). + // + // This is useful if you do not think the FileBackedVector will grow before + // you need to reference this value, and you want to mutate the underlying + // data directly. When the FileBackedVector grows, the underlying mmap will be + // unmapped and remapped, which will invalidate this MutableView to the + // previously mapped region. + // + // Returns: + // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements() + libtextclassifier3::StatusOr<MutableView> GetMutable(int32_t idx); + + // Gets a MutableArrayView to the elements at range [idx, idx + len). + // + // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the + // reference/pointer returned by MutableArrayView::operator[]/data(). + // + // This is useful if you do not think the FileBackedVector will grow before + // you need to reference this value, and you want to mutate the underlying + // data directly. When the FileBackedVector grows, the underlying mmap will be + // unmapped and remapped, which will invalidate this MutableArrayView to the + // previously mapped region. + // + // Returns: + // OUT_OF_RANGE_ERROR if idx < 0 or idx + len > num_elements() + libtextclassifier3::StatusOr<MutableArrayView> GetMutable(int32_t idx, + int32_t len); + // Writes the value at idx. // // May grow the underlying file and mmapped region as needed to fit the new - // value. If it does grow, then any pointers to previous values returned - // from Get() may be invalidated. + // value. If it does grow, then any pointers/references to previous values + // returned from Get/GetMutable/Allocate may be invalidated. // // Returns: - // OUT_OF_RANGE_ERROR if idx < 0 or file cannot be grown idx size + // OUT_OF_RANGE_ERROR if idx < 0 or idx > kMaxIndex or file cannot be grown + // idx size libtextclassifier3::Status Set(int32_t idx, const T& value); + // Appends the value to the end of the vector. + // + // May grow the underlying file and mmapped region as needed to fit the new + // value. If it does grow, then any pointers/references to previous values + // returned from Get/GetMutable/Allocate may be invalidated. + // + // Returns: + // OUT_OF_RANGE_ERROR if file cannot be grown (i.e. reach max_file_size_) + libtextclassifier3::Status Append(const T& value) { + return Set(header_->num_elements, value); + } + + // Allocates spaces with given length in the end of the vector and returns a + // MutableArrayView to the space. + // + // May grow the underlying file and mmapped region as needed to fit the new + // value. If it does grow, then any pointers/references to previous values + // returned from Get/GetMutable/Allocate may be invalidated. + // + // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the + // reference/pointer returned by MutableArrayView::operator[]/data(). + // + // This is useful if you do not think the FileBackedVector will grow before + // you need to reference this value, and you want to allocate adjacent spaces + // for multiple elements and mutate the underlying data directly. When the + // FileBackedVector grows, the underlying mmap will be unmapped and remapped, + // which will invalidate this MutableArrayView to the previously mapped + // region. + // + // Returns: + // OUT_OF_RANGE_ERROR if len <= 0 or file cannot be grown (i.e. reach + // max_file_size_) + libtextclassifier3::StatusOr<MutableArrayView> Allocate(int32_t len); + // Resizes to first len elements. The crc is cleared on truncation and will be // updated on destruction, or once the client calls ComputeChecksum() or // PersistToDisk(). // // Returns: - // OUT_OF_RANGE_ERROR if len < 0 or >= num_elements() + // OUT_OF_RANGE_ERROR if len < 0 or len >= num_elements() libtextclassifier3::Status TruncateTo(int32_t new_num_elements); + // Mark idx as changed iff idx < changes_end_, so later ComputeChecksum() can + // update checksum by the cached changes without going over [0, changes_end_). + // + // If the buffer size exceeds kPartialCrcLimitDiv, then clear all change + // buffers and set changes_end_ as 0, indicating that the checksum should be + // recomputed from idx 0 (starting from the beginning). Otherwise cache the + // change. + void SetDirty(int32_t idx); + // Flushes content to underlying file. // // Returns: @@ -248,10 +344,6 @@ class FileBackedVector { return reinterpret_cast<const T*>(mmapped_file_->region()); } - T* mutable_array() const { - return reinterpret_cast<T*>(mmapped_file_->mutable_region()); - } - int32_t num_elements() const { return header_->num_elements; } // Updates checksum of the vector contents and returns it. @@ -260,6 +352,66 @@ class FileBackedVector { // INTERNAL_ERROR if the vector's internal state is inconsistent libtextclassifier3::StatusOr<Crc32> ComputeChecksum(); + public: + class MutableArrayView { + public: + const T& operator[](int32_t idx) const { return data_[idx]; } + T& operator[](int32_t idx) { + SetDirty(idx); + return data_[idx]; + } + + const T* data() const { return data_; } + + int32_t size() const { return len_; } + + // Set the mutable array slice (starting at idx) by the given element array. + // It handles SetDirty properly for the file-backed-vector when modifying + // elements. + // + // REQUIRES: arr is valid && arr_len >= 0 && idx + arr_len <= size(), + // otherwise the behavior is undefined. + void SetArray(int32_t idx, const T* arr, int32_t arr_len) { + for (int32_t i = 0; i < arr_len; ++i) { + SetDirty(idx + i); + data_[idx + i] = arr[i]; + } + } + + private: + MutableArrayView(FileBackedVector<T>* vector, T* data, int32_t len) + : vector_(vector), + data_(data), + original_idx_(data - vector->array()), + len_(len) {} + + void SetDirty(int32_t idx) { vector_->SetDirty(original_idx_ + idx); } + + // Does not own. For SetDirty only. + FileBackedVector<T>* vector_; + + // data_ points at vector_->mutable_array()[original_idx_] + T* data_; + int32_t original_idx_; + int32_t len_; + + friend class FileBackedVector; + }; + + class MutableView { + public: + const T& Get() const { return mutable_array_view_[0]; } + T& Get() { return mutable_array_view_[0]; } + + private: + MutableView(FileBackedVector<T>* vector, T* data) + : mutable_array_view_(vector, data, 1) {} + + MutableArrayView mutable_array_view_; + + friend class FileBackedVector; + }; + private: // We track partial updates to the array for crc updating. This // requires extra memory to keep track of original buffers but @@ -271,24 +423,33 @@ class FileBackedVector { // Grow file by at least this many elements if array is growable. static constexpr int64_t kGrowElements = 1u << 14; // 16K - // Max number of elements that can be held by the vector. - static constexpr int64_t kMaxNumElements = 1u << 20; // 1M + // Absolute max # of elements allowed. Since we are using int32_t to store + // num_elements, max value is 2^31-1. Still the actual max # of elements are + // determined by max_file_size, kElementTypeSize, and Header::kHeaderSize. + static constexpr int32_t kMaxNumElements = + std::numeric_limits<int32_t>::max(); + + // Absolute max index allowed. + static constexpr int32_t kMaxIndex = kMaxNumElements - 1; // Can only be created through the factory ::Create function FileBackedVector(const Filesystem& filesystem, const std::string& file_path, std::unique_ptr<Header> header, - std::unique_ptr<MemoryMappedFile> mmapped_file); + std::unique_ptr<MemoryMappedFile> mmapped_file, + int32_t max_file_size); // Initialize a new FileBackedVector, and create the file. static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>> InitializeNewFile(const Filesystem& filesystem, const std::string& file_path, - ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy); + ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy, + int32_t max_file_size); // Initialize a FileBackedVector from an existing file. static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>> InitializeExistingFile(const Filesystem& filesystem, const std::string& file_path, ScopedFd fd, - MemoryMappedFile::Strategy mmap_strategy); + MemoryMappedFile::Strategy mmap_strategy, + int32_t max_file_size); // Grows the underlying file to hold at least num_elements // @@ -296,6 +457,10 @@ class FileBackedVector { // OUT_OF_RANGE_ERROR if we can't grow to the specified size libtextclassifier3::Status GrowIfNecessary(int32_t num_elements); + T* mutable_array() const { + return reinterpret_cast<T*>(mmapped_file_->mutable_region()); + } + // Cached constructor params. const Filesystem* const filesystem_; const std::string file_path_; @@ -314,25 +479,42 @@ class FileBackedVector { // update. Will be cleared if the size grows too big. std::string saved_original_buffer_; - // Keep track of all pages we touched so we can write them back to - // disk. - std::vector<bool> dirty_pages_; + // Max file size for FileBackedVector, default kMaxFileSize. Note that this + // value won't be written into the header, so maximum file size will always be + // specified in runtime and the caller should make sure its value is correct + // and reasonable. Note that file size includes size of header + elements. + // + // The range should be in + // [Header::kHeaderSize + kElementTypeSize, kMaxFileSize], and + // (max_file_size_ - Header::kHeaderSize) / kElementTypeSize is max # of + // elements that can be stored. + int32_t max_file_size_; }; template <typename T> +constexpr int32_t FileBackedVector<T>::kMaxFileSize; + +template <typename T> +constexpr int32_t FileBackedVector<T>::kElementTypeSize; + +template <typename T> constexpr int32_t FileBackedVector<T>::kPartialCrcLimitDiv; template <typename T> constexpr int64_t FileBackedVector<T>::kGrowElements; template <typename T> -constexpr int64_t FileBackedVector<T>::kMaxNumElements; +constexpr int32_t FileBackedVector<T>::kMaxNumElements; + +template <typename T> +constexpr int32_t FileBackedVector<T>::kMaxIndex; template <typename T> libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>> FileBackedVector<T>::Create(const Filesystem& filesystem, const std::string& file_path, - MemoryMappedFile::Strategy mmap_strategy) { + MemoryMappedFile::Strategy mmap_strategy, + int32_t max_file_size) { if (mmap_strategy == MemoryMappedFile::Strategy::READ_WRITE_MANUAL_SYNC) { // FileBackedVector's behavior of growing the file underneath the mmap is // inherently broken with MAP_PRIVATE. Growing the vector requires extending @@ -345,6 +527,14 @@ FileBackedVector<T>::Create(const Filesystem& filesystem, "mmap strategy."); } + if (max_file_size < Header::kHeaderSize + kElementTypeSize || + max_file_size > kMaxFileSize) { + // FileBackedVector should be able to store at least 1 element, so + // max_file_size should be at least Header::kHeaderSize + kElementTypeSize. + return absl_ports::InvalidArgumentError( + "Invalid max file size for FileBackedVector"); + } + ScopedFd fd(filesystem.OpenForWrite(file_path.c_str())); if (!fd.is_valid()) { return absl_ports::InternalError( @@ -357,31 +547,38 @@ FileBackedVector<T>::Create(const Filesystem& filesystem, absl_ports::StrCat("Bad file size for file ", file_path)); } + if (max_file_size < file_size) { + return absl_ports::InvalidArgumentError( + "Max file size should not be smaller than the existing file size"); + } + const bool new_file = file_size == 0; if (new_file) { return InitializeNewFile(filesystem, file_path, std::move(fd), - mmap_strategy); + mmap_strategy, max_file_size); } return InitializeExistingFile(filesystem, file_path, std::move(fd), - mmap_strategy); + mmap_strategy, max_file_size); } template <typename T> libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>> -FileBackedVector<T>::InitializeNewFile( - const Filesystem& filesystem, const std::string& file_path, ScopedFd fd, - MemoryMappedFile::Strategy mmap_strategy) { +FileBackedVector<T>::InitializeNewFile(const Filesystem& filesystem, + const std::string& file_path, + ScopedFd fd, + MemoryMappedFile::Strategy mmap_strategy, + int32_t max_file_size) { // Create header. auto header = std::make_unique<Header>(); header->magic = FileBackedVector<T>::Header::kMagic; - header->element_size = sizeof(T); + header->element_size = kElementTypeSize; header->header_checksum = header->CalculateHeaderChecksum(); // We use Write() here, instead of writing through the mmapped region // created below, so we can gracefully handle errors that occur when the // disk is full. See b/77309668 for details. if (!filesystem.PWrite(fd.get(), /*offset=*/0, header.get(), - sizeof(Header))) { + Header::kHeaderSize)) { return absl_ports::InternalError("Failed to write header"); } @@ -393,23 +590,30 @@ FileBackedVector<T>::InitializeNewFile( auto mmapped_file = std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy); - return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>( - filesystem, file_path, std::move(header), std::move(mmapped_file))); + return std::unique_ptr<FileBackedVector<T>>( + new FileBackedVector<T>(filesystem, file_path, std::move(header), + std::move(mmapped_file), max_file_size)); } template <typename T> libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>> FileBackedVector<T>::InitializeExistingFile( const Filesystem& filesystem, const std::string& file_path, - const ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy) { + const ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy, + int32_t max_file_size) { int64_t file_size = filesystem.GetFileSize(file_path.c_str()); - if (file_size < sizeof(FileBackedVector<T>::Header)) { + if (file_size == Filesystem::kBadFileSize) { + return absl_ports::InternalError( + absl_ports::StrCat("Bad file size for file ", file_path)); + } + + if (file_size < Header::kHeaderSize) { return absl_ports::InternalError( absl_ports::StrCat("File header too short for ", file_path)); } auto header = std::make_unique<Header>(); - if (!filesystem.PRead(fd.get(), header.get(), sizeof(Header), + if (!filesystem.PRead(fd.get(), header.get(), Header::kHeaderSize, /*offset=*/0)) { return absl_ports::InternalError( absl_ports::StrCat("Failed to read header of ", file_path)); @@ -429,13 +633,15 @@ FileBackedVector<T>::InitializeExistingFile( absl_ports::StrCat("Invalid header crc for ", file_path)); } - if (header->element_size != sizeof(T)) { + if (header->element_size != kElementTypeSize) { return absl_ports::InternalError(IcingStringUtil::StringPrintf( - "Inconsistent element size, expected %zd, actual %d", sizeof(T), + "Inconsistent element size, expected %d, actual %d", kElementTypeSize, header->element_size)); } - int64_t min_file_size = header->num_elements * sizeof(T) + sizeof(Header); + int64_t min_file_size = + static_cast<int64_t>(header->num_elements) * kElementTypeSize + + Header::kHeaderSize; if (min_file_size > file_size) { return absl_ports::InternalError(IcingStringUtil::StringPrintf( "Inconsistent file size, expected %" PRId64 ", actual %" PRId64, @@ -446,23 +652,22 @@ FileBackedVector<T>::InitializeExistingFile( // access elements from the mmapped region auto mmapped_file = std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy); - ICING_RETURN_IF_ERROR( - mmapped_file->Remap(sizeof(Header), file_size - sizeof(Header))); + ICING_RETURN_IF_ERROR(mmapped_file->Remap(Header::kHeaderSize, + file_size - Header::kHeaderSize)); // Check vector contents - Crc32 vector_checksum; - std::string_view vector_contents( - reinterpret_cast<const char*>(mmapped_file->region()), - header->num_elements * sizeof(T)); - vector_checksum.Append(vector_contents); + Crc32 vector_checksum( + std::string_view(reinterpret_cast<const char*>(mmapped_file->region()), + header->num_elements * kElementTypeSize)); if (vector_checksum.Get() != header->vector_checksum) { return absl_ports::FailedPreconditionError( absl_ports::StrCat("Invalid vector contents for ", file_path)); } - return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>( - filesystem, file_path, std::move(header), std::move(mmapped_file))); + return std::unique_ptr<FileBackedVector<T>>( + new FileBackedVector<T>(filesystem, file_path, std::move(header), + std::move(mmapped_file), max_file_size)); } template <typename T> @@ -479,12 +684,13 @@ template <typename T> FileBackedVector<T>::FileBackedVector( const Filesystem& filesystem, const std::string& file_path, std::unique_ptr<Header> header, - std::unique_ptr<MemoryMappedFile> mmapped_file) + std::unique_ptr<MemoryMappedFile> mmapped_file, int32_t max_file_size) : filesystem_(&filesystem), file_path_(file_path), header_(std::move(header)), mmapped_file_(std::move(mmapped_file)), - changes_end_(header_->num_elements) {} + changes_end_(header_->num_elements), + max_file_size_(max_file_size) {} template <typename T> FileBackedVector<T>::~FileBackedVector() { @@ -523,6 +729,40 @@ libtextclassifier3::StatusOr<const T*> FileBackedVector<T>::Get( } template <typename T> +libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableView> +FileBackedVector<T>::GetMutable(int32_t idx) { + if (idx < 0) { + return absl_ports::OutOfRangeError( + IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx)); + } + + if (idx >= header_->num_elements) { + return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "Index, %d, was greater than vector size, %d", idx, + header_->num_elements)); + } + + return MutableView(this, &mutable_array()[idx]); +} + +template <typename T> +libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableArrayView> +FileBackedVector<T>::GetMutable(int32_t idx, int32_t len) { + if (idx < 0) { + return absl_ports::OutOfRangeError( + IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx)); + } + + if (idx > header_->num_elements - len) { + return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "Index with len, %d %d, was greater than vector size, %d", idx, len, + header_->num_elements)); + } + + return MutableArrayView(this, &mutable_array()[idx], len); +} + +template <typename T> libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx, const T& value) { if (idx < 0) { @@ -530,6 +770,11 @@ libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx, IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx)); } + if (idx > kMaxIndex) { + return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "Index, %d, was greater than max index allowed, %d", idx, kMaxIndex)); + } + ICING_RETURN_IF_ERROR(GrowIfNecessary(idx + 1)); if (idx + 1 > header_->num_elements) { @@ -541,36 +786,39 @@ libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx, return libtextclassifier3::Status::OK; } - // Cache original value to update crcs. - if (idx < changes_end_) { - // If we exceed kPartialCrcLimitDiv, clear changes_end_ to - // revert to full CRC. - if ((saved_original_buffer_.size() + sizeof(T)) * - FileBackedVector<T>::kPartialCrcLimitDiv > - changes_end_ * sizeof(T)) { - ICING_VLOG(2) << "FileBackedVector change tracking limit exceeded"; - changes_.clear(); - saved_original_buffer_.clear(); - changes_end_ = 0; - header_->vector_checksum = 0; - } else { - int32_t start_byte = idx * sizeof(T); - - changes_.push_back(idx); - saved_original_buffer_.append( - reinterpret_cast<char*>(const_cast<T*>(array())) + start_byte, - sizeof(T)); - } - } + SetDirty(idx); mutable_array()[idx] = value; return libtextclassifier3::Status::OK; } template <typename T> +libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableArrayView> +FileBackedVector<T>::Allocate(int32_t len) { + if (len <= 0) { + return absl_ports::OutOfRangeError("Invalid allocate length"); + } + + if (len > kMaxNumElements - header_->num_elements) { + return absl_ports::OutOfRangeError( + IcingStringUtil::StringPrintf("Cannot allocate %d elements", len)); + } + + // Although header_->num_elements + len doesn't exceed kMaxNumElements, the + // actual max # of elements are determined by max_file_size, kElementTypeSize, + // and kHeaderSize. Thus, it is still possible to fail to grow the file. + ICING_RETURN_IF_ERROR(GrowIfNecessary(header_->num_elements + len)); + + int32_t start_idx = header_->num_elements; + header_->num_elements += len; + + return MutableArrayView(this, &mutable_array()[start_idx], len); +} + +template <typename T> libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary( int32_t num_elements) { - if (sizeof(T) == 0) { + if (kElementTypeSize == 0) { // Growing is a no-op return libtextclassifier3::Status::OK; } @@ -579,10 +827,12 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary( return libtextclassifier3::Status::OK; } - if (num_elements > FileBackedVector<T>::kMaxNumElements) { + if (num_elements > + (max_file_size_ - Header::kHeaderSize) / kElementTypeSize) { return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( - "%d exceeds maximum number of elements allowed, %lld", num_elements, - static_cast<long long>(FileBackedVector<T>::kMaxNumElements))); + "%d elements total size exceed maximum bytes of elements allowed, " + "%d bytes", + num_elements, max_file_size_ - Header::kHeaderSize)); } int64_t current_file_size = filesystem_->GetFileSize(file_path_.c_str()); @@ -590,7 +840,8 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary( return absl_ports::InternalError("Unable to retrieve file size."); } - int64_t least_file_size_needed = sizeof(Header) + num_elements * sizeof(T); + int32_t least_file_size_needed = + Header::kHeaderSize + num_elements * kElementTypeSize; // Won't overflow if (least_file_size_needed <= current_file_size) { // Our underlying file can hold the target num_elements cause we've grown // before @@ -598,9 +849,13 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary( } // Otherwise, we need to grow. Grow to kGrowElements boundary. - least_file_size_needed = math_util::RoundUpTo( - least_file_size_needed, - int64_t{FileBackedVector<T>::kGrowElements * sizeof(T)}); + // Note that we need to use int64_t here, since int32_t might overflow after + // round up. + int64_t round_up_file_size_needed = math_util::RoundUpTo( + int64_t{least_file_size_needed}, + int64_t{FileBackedVector<T>::kGrowElements} * kElementTypeSize); + least_file_size_needed = + std::min(round_up_file_size_needed, int64_t{max_file_size_}); // We use PWrite here rather than Grow because Grow doesn't actually allocate // an underlying disk block. This can lead to problems with mmap because mmap @@ -609,20 +864,22 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary( // these blocks, which will ensure that any failure to grow will surface here. int64_t page_size = getpagesize(); auto buf = std::make_unique<uint8_t[]>(page_size); - int64_t size_to_write = page_size - (current_file_size % page_size); + int64_t size_to_write = std::min(page_size - (current_file_size % page_size), + max_file_size_ - current_file_size); ScopedFd sfd(filesystem_->OpenForWrite(file_path_.c_str())); - while (current_file_size < least_file_size_needed) { + while (size_to_write > 0 && current_file_size < least_file_size_needed) { if (!filesystem_->PWrite(sfd.get(), current_file_size, buf.get(), size_to_write)) { return absl_ports::InternalError( absl_ports::StrCat("Couldn't grow file ", file_path_)); } current_file_size += size_to_write; - size_to_write = page_size - (current_file_size % page_size); + size_to_write = std::min(page_size - (current_file_size % page_size), + max_file_size_ - current_file_size); } ICING_RETURN_IF_ERROR(mmapped_file_->Remap( - sizeof(Header), least_file_size_needed - sizeof(Header))); + Header::kHeaderSize, least_file_size_needed - Header::kHeaderSize)); return libtextclassifier3::Status::OK; } @@ -653,6 +910,31 @@ libtextclassifier3::Status FileBackedVector<T>::TruncateTo( } template <typename T> +void FileBackedVector<T>::SetDirty(int32_t idx) { + // Cache original value to update crcs. + if (idx >= 0 && idx < changes_end_) { + // If we exceed kPartialCrcLimitDiv, clear changes_end_ to + // revert to full CRC. + if ((saved_original_buffer_.size() + kElementTypeSize) * + FileBackedVector<T>::kPartialCrcLimitDiv > + changes_end_ * kElementTypeSize) { + ICING_VLOG(2) << "FileBackedVector change tracking limit exceeded"; + changes_.clear(); + saved_original_buffer_.clear(); + changes_end_ = 0; + header_->vector_checksum = 0; + } else { + int32_t start_byte = idx * kElementTypeSize; + + changes_.push_back(idx); + saved_original_buffer_.append( + reinterpret_cast<char*>(const_cast<T*>(array())) + start_byte, + kElementTypeSize); + } + } +} + +template <typename T> libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() { // First apply the modified area. Keep a bitmap of already updated // regions so we don't double-update. @@ -663,8 +945,7 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() { int num_truncated = 0; int num_overlapped = 0; int num_duplicate = 0; - for (size_t i = 0; i < changes_.size(); i++) { - const int32_t change_offset = changes_[i]; + for (const int32_t change_offset : changes_) { if (change_offset > changes_end_) { return absl_ports::InternalError(IcingStringUtil::StringPrintf( "Failed to update crc, change offset %d, changes_end_ %d", @@ -678,9 +959,10 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() { } // Turn change buffer into change^original. - const char* buffer_end = &saved_original_buffer_[cur_offset + sizeof(T)]; - const char* cur_array = - reinterpret_cast<const char*>(array()) + change_offset * sizeof(T); + const char* buffer_end = + &saved_original_buffer_[cur_offset + kElementTypeSize]; + const char* cur_array = reinterpret_cast<const char*>(array()) + + change_offset * kElementTypeSize; // Now xor in. SSE acceleration please? for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end; cur++, cur_array++) { @@ -692,9 +974,9 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() { bool overlap = false; uint32_t cur_element = change_offset; for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end; - cur_element++, cur += sizeof(T)) { + cur_element++, cur += kElementTypeSize) { if (updated[cur_element]) { - memset(cur, 0, sizeof(T)); + memset(cur, 0, kElementTypeSize); overlap = true; } else { updated[cur_element] = true; @@ -705,10 +987,11 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() { // Apply update to crc. if (new_update) { // Explicitly create the string_view with length - std::string_view xored_str(buffer_end - sizeof(T), sizeof(T)); + std::string_view xored_str(buffer_end - kElementTypeSize, + kElementTypeSize); if (!cur_crc - .UpdateWithXor(xored_str, changes_end_ * sizeof(T), - change_offset * sizeof(T)) + .UpdateWithXor(xored_str, changes_end_ * kElementTypeSize, + change_offset * kElementTypeSize) .ok()) { return absl_ports::InternalError(IcingStringUtil::StringPrintf( "Failed to update crc, change offset %d, change " @@ -722,7 +1005,7 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() { } else { num_duplicate++; } - cur_offset += sizeof(T); + cur_offset += kElementTypeSize; } if (!changes_.empty()) { @@ -735,8 +1018,9 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() { if (changes_end_ < header_->num_elements) { // Explicitly create the string_view with length std::string_view update_str( - reinterpret_cast<const char*>(array()) + changes_end_ * sizeof(T), - (header_->num_elements - changes_end_) * sizeof(T)); + reinterpret_cast<const char*>(array()) + + changes_end_ * kElementTypeSize, + (header_->num_elements - changes_end_) * kElementTypeSize); cur_crc.Append(update_str); ICING_VLOG(2) << IcingStringUtil::StringPrintf( "Array update tail crc offset %d -> %d", changes_end_, @@ -761,7 +1045,7 @@ libtextclassifier3::Status FileBackedVector<T>::PersistToDisk() { header_->header_checksum = header_->CalculateHeaderChecksum(); if (!filesystem_->PWrite(file_path_.c_str(), /*offset=*/0, header_.get(), - sizeof(Header))) { + Header::kHeaderSize)) { return absl_ports::InternalError("Failed to sync header"); } @@ -795,7 +1079,11 @@ libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetElementsFileSize() return absl_ports::InternalError( "Failed to get file size of elements in the file-backed vector"); } - return total_file_size - sizeof(Header); + if (total_file_size < Header::kHeaderSize) { + return absl_ports::InternalError( + "File size should not be smaller than header size"); + } + return total_file_size - Header::kHeaderSize; } } // namespace lib diff --git a/icing/file/file-backed-vector_test.cc b/icing/file/file-backed-vector_test.cc index 2f60c6b..60ed887 100644 --- a/icing/file/file-backed-vector_test.cc +++ b/icing/file/file-backed-vector_test.cc @@ -19,7 +19,9 @@ #include <algorithm> #include <cerrno> #include <cstdint> +#include <limits> #include <memory> +#include <string> #include <string_view> #include <vector> @@ -34,10 +36,14 @@ #include "icing/util/crc32.h" #include "icing/util/logging.h" +using ::testing::ElementsAre; using ::testing::Eq; using ::testing::IsTrue; +using ::testing::Lt; +using ::testing::Not; using ::testing::Pointee; using ::testing::Return; +using ::testing::SizeIs; namespace icing { namespace lib { @@ -60,20 +66,30 @@ class FileBackedVectorTest : public testing::Test { // Helper method to loop over some data and insert into the vector at some idx template <typename T> - void Insert(FileBackedVector<T>* vector, int32_t idx, std::string data) { - for (int i = 0; i < data.length(); ++i) { + void Insert(FileBackedVector<T>* vector, int32_t idx, + const std::vector<T>& data) { + for (int i = 0; i < data.size(); ++i) { ICING_ASSERT_OK(vector->Set(idx + i, data.at(i))); } } + void Insert(FileBackedVector<char>* vector, int32_t idx, std::string data) { + Insert(vector, idx, std::vector<char>(data.begin(), data.end())); + } + // Helper method to retrieve data from the beginning of the vector template <typename T> - std::string_view Get(FileBackedVector<T>* vector, int32_t expected_len) { + std::vector<T> Get(FileBackedVector<T>* vector, int32_t idx, + int32_t expected_len) { + return std::vector<T>(vector->array() + idx, + vector->array() + idx + expected_len); + } + + std::string_view Get(FileBackedVector<char>* vector, int32_t expected_len) { return Get(vector, 0, expected_len); } - template <typename T> - std::string_view Get(FileBackedVector<T>* vector, int32_t idx, + std::string_view Get(FileBackedVector<char>* vector, int32_t idx, int32_t expected_len) { return std::string_view(vector->array() + idx, expected_len); } @@ -103,6 +119,79 @@ TEST_F(FileBackedVectorTest, Create) { } } +TEST_F(FileBackedVectorTest, CreateWithInvalidStrategy) { + // Create a vector with unimplemented strategy + EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_MANUAL_SYNC), + StatusIs(libtextclassifier3::StatusCode::UNIMPLEMENTED)); +} + +TEST_F(FileBackedVectorTest, CreateWithCustomMaxFileSize) { + int32_t header_size = FileBackedVector<char>::Header::kHeaderSize; + + // Create a vector with invalid max_file_size + EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + /*max_file_size=*/-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + /*max_file_size=*/header_size - 1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + /*max_file_size=*/header_size + sizeof(char) - 1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + { + // Create a vector with max_file_size that allows only 1 element. + ICING_ASSERT_OK_AND_ASSIGN( + auto vector, FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + /*max_file_size=*/header_size + sizeof(char) * 1)); + ICING_ASSERT_OK(vector->Set(0, 'a')); + } + + { + // We can create it again with larger max_file_size, as long as it is not + // greater than kMaxFileSize. + ICING_ASSERT_OK_AND_ASSIGN( + auto vector, FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + /*max_file_size=*/header_size + sizeof(char) * 2)); + EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(Eq('a')))); + ICING_ASSERT_OK(vector->Set(1, 'b')); + } + + // We cannot create it again with max_file_size < current_file_size, even if + // it is a valid value. + int64_t current_file_size = filesystem_.GetFileSize(file_path_.c_str()); + ASSERT_THAT(current_file_size, Eq(header_size + sizeof(char) * 2)); + ASSERT_THAT(current_file_size - 1, Not(Lt(header_size + sizeof(char)))); + EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + /*max_file_size=*/current_file_size - 1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + { + // We can create it again with max_file_size == current_file_size. + ICING_ASSERT_OK_AND_ASSIGN( + auto vector, FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + /*max_file_size=*/current_file_size)); + EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(Eq('a')))); + EXPECT_THAT(vector->Get(1), IsOkAndHolds(Pointee(Eq('b')))); + } +} + TEST_F(FileBackedVectorTest, SimpleShared) { // Create a vector and add some data. ICING_ASSERT_OK_AND_ASSIGN( @@ -195,6 +284,373 @@ TEST_F(FileBackedVectorTest, Get) { StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); } +TEST_F(FileBackedVectorTest, MutableView) { + // Create a vector and add some data. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), /*idx=*/0, std::string(1000, 'a')); + EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2620640643U))); + + ICING_ASSERT_OK_AND_ASSIGN(FileBackedVector<char>::MutableView mutable_elt, + vector->GetMutable(3)); + + mutable_elt.Get() = 'b'; + EXPECT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('b')))); + + mutable_elt.Get() = 'c'; + EXPECT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('c')))); +} + +TEST_F(FileBackedVectorTest, MutableViewShouldSetDirty) { + // Create a vector and add some data. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), /*idx=*/0, std::string(1000, 'a')); + EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2620640643U))); + + std::string_view reconstructed_view = + std::string_view(vector->array(), vector->num_elements()); + + ICING_ASSERT_OK_AND_ASSIGN(FileBackedVector<char>::MutableView mutable_elt, + vector->GetMutable(3)); + + // Mutate the element via MutateView + // If non-const Get() is called, MutateView should set the element index dirty + // so that ComputeChecksum() can pick up the change and compute the checksum + // correctly. Validate by mapping another array on top. + mutable_elt.Get() = 'b'; + ASSERT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('b')))); + ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc1, vector->ComputeChecksum()); + Crc32 full_crc1; + full_crc1.Append(reconstructed_view); + EXPECT_THAT(crc1, Eq(full_crc1)); + + // Mutate and test again. + mutable_elt.Get() = 'c'; + ASSERT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('c')))); + ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc2, vector->ComputeChecksum()); + Crc32 full_crc2; + full_crc2.Append(reconstructed_view); + EXPECT_THAT(crc2, Eq(full_crc2)); +} + +TEST_F(FileBackedVectorTest, MutableArrayView) { + // Create a vector and add some data. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<int>> vector, + FileBackedVector<int>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1)); + EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U))); + + constexpr int kArrayViewOffset = 5; + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedVector<int>::MutableArrayView mutable_arr, + vector->GetMutable(kArrayViewOffset, /*len=*/3)); + EXPECT_THAT(mutable_arr, SizeIs(3)); + + mutable_arr[0] = 2; + mutable_arr[1] = 3; + mutable_arr[2] = 4; + + EXPECT_THAT(vector->Get(kArrayViewOffset + 0), IsOkAndHolds(Pointee(Eq(2)))); + EXPECT_THAT(mutable_arr.data()[0], Eq(2)); + + EXPECT_THAT(vector->Get(kArrayViewOffset + 1), IsOkAndHolds(Pointee(Eq(3)))); + EXPECT_THAT(mutable_arr.data()[1], Eq(3)); + + EXPECT_THAT(vector->Get(kArrayViewOffset + 2), IsOkAndHolds(Pointee(Eq(4)))); + EXPECT_THAT(mutable_arr.data()[2], Eq(4)); +} + +TEST_F(FileBackedVectorTest, MutableArrayViewSetArray) { + // Create a vector and add some data. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<int>> vector, + FileBackedVector<int>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1)); + EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U))); + + constexpr int kArrayViewOffset = 3; + constexpr int kArrayViewLen = 5; + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedVector<int>::MutableArrayView mutable_arr, + vector->GetMutable(kArrayViewOffset, kArrayViewLen)); + + std::vector<int> change1{2, 3, 4}; + mutable_arr.SetArray(/*idx=*/0, change1.data(), change1.size()); + EXPECT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen), + ElementsAre(2, 3, 4, 1, 1)); + + std::vector<int> change2{5, 6}; + mutable_arr.SetArray(/*idx=*/2, change2.data(), change2.size()); + EXPECT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen), + ElementsAre(2, 3, 5, 6, 1)); +} + +TEST_F(FileBackedVectorTest, MutableArrayViewSetArrayWithZeroLength) { + // Create a vector and add some data. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<int>> vector, + FileBackedVector<int>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1)); + EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U))); + + constexpr int kArrayViewOffset = 3; + constexpr int kArrayViewLen = 5; + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedVector<int>::MutableArrayView mutable_arr, + vector->GetMutable(kArrayViewOffset, kArrayViewLen)); + + // Zero arr_len should work and change nothing + std::vector<int> change{2, 3}; + mutable_arr.SetArray(/*idx=*/0, change.data(), /*arr_len=*/0); + EXPECT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen), + ElementsAre(1, 1, 1, 1, 1)); +} + +TEST_F(FileBackedVectorTest, MutableArrayViewIndexOperatorShouldSetDirty) { + // Create an array with some data. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<int>> vector, + FileBackedVector<int>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1)); + EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U))); + + std::string_view reconstructed_view( + reinterpret_cast<const char*>(vector->array()), + vector->num_elements() * sizeof(int)); + + constexpr int kArrayViewOffset = 5; + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedVector<int>::MutableArrayView mutable_arr, + vector->GetMutable(kArrayViewOffset, /*len=*/3)); + + // Use operator[] to mutate elements + // If non-const operator[] is called, MutateView should set the element index + // dirty so that ComputeChecksum() can pick up the change and compute the + // checksum correctly. Validate by mapping another array on top. + mutable_arr[0] = 2; + ASSERT_THAT(vector->Get(kArrayViewOffset + 0), IsOkAndHolds(Pointee(Eq(2)))); + ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc1, vector->ComputeChecksum()); + EXPECT_THAT(crc1, Eq(Crc32(reconstructed_view))); + + mutable_arr[1] = 3; + ASSERT_THAT(vector->Get(kArrayViewOffset + 1), IsOkAndHolds(Pointee(Eq(3)))); + ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc2, vector->ComputeChecksum()); + EXPECT_THAT(crc2, Eq(Crc32(reconstructed_view))); + + mutable_arr[2] = 4; + ASSERT_THAT(vector->Get(kArrayViewOffset + 2), IsOkAndHolds(Pointee(Eq(4)))); + ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc3, vector->ComputeChecksum()); + EXPECT_THAT(crc3, Eq(Crc32(reconstructed_view))); + + // Change the same position. It should set dirty again. + mutable_arr[0] = 5; + ASSERT_THAT(vector->Get(kArrayViewOffset + 0), IsOkAndHolds(Pointee(Eq(5)))); + ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc4, vector->ComputeChecksum()); + EXPECT_THAT(crc4, Eq(Crc32(reconstructed_view))); +} + +TEST_F(FileBackedVectorTest, MutableArrayViewSetArrayShouldSetDirty) { + // Create an array with some data. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<int>> vector, + FileBackedVector<int>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1)); + EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U))); + + std::string_view reconstructed_view( + reinterpret_cast<const char*>(vector->array()), + vector->num_elements() * sizeof(int)); + + constexpr int kArrayViewOffset = 3; + constexpr int kArrayViewLen = 5; + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedVector<int>::MutableArrayView mutable_arr, + vector->GetMutable(kArrayViewOffset, kArrayViewLen)); + + std::vector<int> change{2, 3, 4}; + mutable_arr.SetArray(/*idx=*/0, change.data(), change.size()); + ASSERT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen), + ElementsAre(2, 3, 4, 1, 1)); + ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc, vector->ComputeChecksum()); + EXPECT_THAT(crc, Eq(Crc32(reconstructed_view))); +} + +TEST_F(FileBackedVectorTest, Append) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + ASSERT_THAT(vector->num_elements(), Eq(0)); + + ICING_EXPECT_OK(vector->Append('a')); + EXPECT_THAT(vector->num_elements(), Eq(1)); + EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(Eq('a')))); + + ICING_EXPECT_OK(vector->Append('b')); + EXPECT_THAT(vector->num_elements(), Eq(2)); + EXPECT_THAT(vector->Get(1), IsOkAndHolds(Pointee(Eq('b')))); +} + +TEST_F(FileBackedVectorTest, AppendAfterSet) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + ASSERT_THAT(vector->num_elements(), Eq(0)); + + ICING_ASSERT_OK(vector->Set(9, 'z')); + ASSERT_THAT(vector->num_elements(), Eq(10)); + ICING_EXPECT_OK(vector->Append('a')); + EXPECT_THAT(vector->num_elements(), Eq(11)); + EXPECT_THAT(vector->Get(10), IsOkAndHolds(Pointee(Eq('a')))); +} + +TEST_F(FileBackedVectorTest, AppendAfterTruncate) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), /*idx=*/0, std::string(1000, 'z')); + ASSERT_THAT(vector->num_elements(), Eq(1000)); + + ICING_ASSERT_OK(vector->TruncateTo(5)); + ICING_EXPECT_OK(vector->Append('a')); + EXPECT_THAT(vector->num_elements(), Eq(6)); + EXPECT_THAT(vector->Get(5), IsOkAndHolds(Pointee(Eq('a')))); +} + +TEST_F(FileBackedVectorTest, AppendShouldFailIfExceedingMaxFileSize) { + int32_t max_file_size = (1 << 10) - 1; + int32_t max_num_elements = + (max_file_size - FileBackedVector<char>::Header::kHeaderSize) / + sizeof(char); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size)); + ICING_ASSERT_OK(vector->Set(max_num_elements - 1, 'z')); + ASSERT_THAT(vector->num_elements(), Eq(max_num_elements)); + + EXPECT_THAT(vector->Append('a'), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); +} + +TEST_F(FileBackedVectorTest, Allocate) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + ASSERT_THAT(vector->num_elements(), Eq(0)); + + ICING_ASSERT_OK_AND_ASSIGN( + typename FileBackedVector<char>::MutableArrayView mutable_arr, + vector->Allocate(3)); + EXPECT_THAT(vector->num_elements(), Eq(3)); + EXPECT_THAT(mutable_arr, SizeIs(3)); + std::string change = "abc"; + mutable_arr.SetArray(/*idx=*/0, /*arr=*/change.data(), /*arr_len=*/3); + EXPECT_THAT(Get(vector.get(), /*idx=*/0, /*expected_len=*/3), Eq(change)); +} + +TEST_F(FileBackedVectorTest, AllocateAfterSet) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + ASSERT_THAT(vector->num_elements(), Eq(0)); + + ICING_ASSERT_OK(vector->Set(9, 'z')); + ASSERT_THAT(vector->num_elements(), Eq(10)); + ICING_ASSERT_OK_AND_ASSIGN( + typename FileBackedVector<char>::MutableArrayView mutable_arr, + vector->Allocate(3)); + EXPECT_THAT(vector->num_elements(), Eq(13)); + EXPECT_THAT(mutable_arr, SizeIs(3)); + std::string change = "abc"; + mutable_arr.SetArray(/*idx=*/0, /*arr=*/change.data(), /*arr_len=*/3); + EXPECT_THAT(Get(vector.get(), /*idx=*/10, /*expected_len=*/3), Eq(change)); +} + +TEST_F(FileBackedVectorTest, AllocateAfterTruncate) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), /*idx=*/0, std::string(1000, 'z')); + ASSERT_THAT(vector->num_elements(), Eq(1000)); + + ICING_ASSERT_OK(vector->TruncateTo(5)); + ICING_ASSERT_OK_AND_ASSIGN( + typename FileBackedVector<char>::MutableArrayView mutable_arr, + vector->Allocate(3)); + EXPECT_THAT(vector->num_elements(), Eq(8)); + std::string change = "abc"; + mutable_arr.SetArray(/*idx=*/0, /*arr=*/change.data(), /*arr_len=*/3); + EXPECT_THAT(Get(vector.get(), /*idx=*/5, /*expected_len=*/3), Eq(change)); +} + +TEST_F(FileBackedVectorTest, AllocateInvalidLengthShouldFail) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + ASSERT_THAT(vector->num_elements(), Eq(0)); + + EXPECT_THAT(vector->Allocate(-1), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + EXPECT_THAT(vector->num_elements(), Eq(0)); + + EXPECT_THAT(vector->Allocate(0), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + EXPECT_THAT(vector->num_elements(), Eq(0)); +} + +TEST_F(FileBackedVectorTest, AllocateShouldFailIfExceedingMaxFileSize) { + int32_t max_file_size = (1 << 10) - 1; + int32_t max_num_elements = + (max_file_size - FileBackedVector<char>::Header::kHeaderSize) / + sizeof(char); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size)); + ICING_ASSERT_OK(vector->Set(max_num_elements - 3, 'z')); + ASSERT_THAT(vector->num_elements(), Eq(max_num_elements - 2)); + + EXPECT_THAT(vector->Allocate(3), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + EXPECT_THAT(vector->Allocate(2), IsOk()); +} + TEST_F(FileBackedVectorTest, IncrementalCrc_NonOverlappingChanges) { int num_elements = 1000; int incremental_size = 3; @@ -272,29 +728,58 @@ TEST_F(FileBackedVectorTest, IncrementalCrc_OverlappingChanges) { } } +TEST_F(FileBackedVectorTest, SetIntMaxShouldReturnOutOfRangeError) { + // Create a vector and add some data. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<int32_t>> vector, + FileBackedVector<int32_t>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0))); + + // It is an edge case. Since Set() calls GrowIfNecessary(idx + 1), we have to + // make sure that when idx is INT32_MAX, Set() should handle it correctly. + EXPECT_THAT(vector->Set(std::numeric_limits<int32_t>::max(), 1), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); +} + TEST_F(FileBackedVectorTest, Grow) { - // This is the same value as FileBackedVector::kMaxNumElts - constexpr int32_t kMaxNumElts = 1U << 20; + int32_t max_file_size = (1 << 20) - 1; + int32_t header_size = FileBackedVector<int32_t>::Header::kHeaderSize; + int32_t element_type_size = static_cast<int32_t>(sizeof(int32_t)); + + // Max file size includes size of the header and elements, so max # of + // elements will be (max_file_size - header_size) / element_type_size. + // + // Also ensure that (max_file_size - header_size) is not a multiple of + // element_type_size, in order to test if the desired # of elements is + // computed by (math) floor instead of ceil. + ASSERT_THAT((max_file_size - header_size) % element_type_size, Not(Eq(0))); + int32_t max_num_elements = (max_file_size - header_size) / element_type_size; ASSERT_TRUE(filesystem_.Truncate(fd_, 0)); - // Create an array and add some data. + // Create a vector and add some data. ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<FileBackedVector<char>> vector, - FileBackedVector<char>::Create( + std::unique_ptr<FileBackedVector<int32_t>> vector, + FileBackedVector<int32_t>::Create( filesystem_, file_path_, - MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size)); EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0))); - EXPECT_THAT(vector->Set(kMaxNumElts + 11, 'a'), + // max_num_elements is the allowed max # of elements, so the valid index + // should be 0 to max_num_elements-1. + EXPECT_THAT(vector->Set(max_num_elements, 1), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); - EXPECT_THAT(vector->Set(-1, 'a'), + EXPECT_THAT(vector->Set(-1, 1), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + EXPECT_THAT(vector->Set(max_num_elements - 1, 1), IsOk()); - uint32_t start = kMaxNumElts - 13; - Insert(vector.get(), start, "abcde"); + int32_t start = max_num_elements - 5; + std::vector<int32_t> data{1, 2, 3, 4, 5}; + Insert(vector.get(), start, data); // Crc works? - const Crc32 good_crc(1134899064U); + const Crc32 good_crc(650981917U); EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(good_crc)); // PersistToDisk does nothing bad, and ensures the content is still there @@ -306,12 +791,12 @@ TEST_F(FileBackedVectorTest, Grow) { vector.reset(); ICING_ASSERT_OK_AND_ASSIGN( - vector, FileBackedVector<char>::Create( - filesystem_, file_path_, - MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + vector, + FileBackedVector<int32_t>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size)); - std::string expected = "abcde"; - EXPECT_EQ(expected, Get(vector.get(), start, expected.length())); + EXPECT_THAT(Get(vector.get(), start, data.size()), Eq(data)); } TEST_F(FileBackedVectorTest, GrowsInChunks) { @@ -334,20 +819,20 @@ TEST_F(FileBackedVectorTest, GrowsInChunks) { // Once we add something though, we'll grow to be kGrowElements big. From this // point on, file size and disk usage should be the same because Growing will // explicitly allocate the number of blocks needed to accomodate the file. - Insert(vector.get(), 0, "a"); - int file_size = kGrowElements * sizeof(int); + Insert(vector.get(), 0, {1}); + int file_size = 1 * kGrowElements * sizeof(int); EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size)); EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size)); // Should still be the same size, don't need to grow underlying file - Insert(vector.get(), 1, "b"); + Insert(vector.get(), 1, {2}); EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size)); EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size)); // Now we grow by a kGrowElements chunk, so the underlying file is 2 // kGrowElements big - file_size *= 2; - Insert(vector.get(), 2, std::string(kGrowElements, 'c')); + file_size = 2 * kGrowElements * sizeof(int); + Insert(vector.get(), 2, std::vector<int>(kGrowElements, 3)); EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size)); EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size)); @@ -476,6 +961,48 @@ TEST_F(FileBackedVectorTest, TruncateAndReReadFile) { } } +TEST_F(FileBackedVectorTest, SetDirty) { + // 1. Create a vector and add some data. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), 0, "abcd"); + + std::string_view reconstructed_view = + std::string_view(vector->array(), vector->num_elements()); + + ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc1, vector->ComputeChecksum()); + Crc32 full_crc_before_overwrite; + full_crc_before_overwrite.Append(reconstructed_view); + EXPECT_THAT(crc1, Eq(full_crc_before_overwrite)); + + // 2. Manually overwrite the values of the first two elements. + std::string corrupted_content = "ef"; + ASSERT_THAT( + filesystem_.PWrite(fd_, /*offset=*/sizeof(FileBackedVector<char>::Header), + corrupted_content.c_str(), corrupted_content.length()), + IsTrue()); + ASSERT_THAT(Get(vector.get(), 0, 4), Eq("efcd")); + Crc32 full_crc_after_overwrite; + full_crc_after_overwrite.Append(reconstructed_view); + ASSERT_THAT(full_crc_before_overwrite, Not(Eq(full_crc_after_overwrite))); + + // 3. Without calling SetDirty(), the checksum will be recomputed incorrectly. + ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc2, vector->ComputeChecksum()); + EXPECT_THAT(crc2, Not(Eq(full_crc_after_overwrite))); + + // 4. Call SetDirty() + vector->SetDirty(0); + vector->SetDirty(1); + + // 5. The checksum should be computed correctly after calling SetDirty() with + // correct index. + ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc3, vector->ComputeChecksum()); + EXPECT_THAT(crc3, Eq(full_crc_after_overwrite)); +} + TEST_F(FileBackedVectorTest, InitFileTooSmallForHeaderFails) { { // 1. Create a vector with a few elements. diff --git a/icing/file/filesystem.cc b/icing/file/filesystem.cc index 82b8d98..10b77db 100644 --- a/icing/file/filesystem.cc +++ b/icing/file/filesystem.cc @@ -63,18 +63,16 @@ void LogOpenFileDescriptors() { constexpr int kMaxFileDescriptorsToStat = 4096; struct rlimit rlim = {0, 0}; if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "getrlimit() failed (errno=%d)", errno); + ICING_LOG(ERROR) << "getrlimit() failed (errno=" << errno << ")"; return; } int fd_lim = rlim.rlim_cur; if (fd_lim > kMaxFileDescriptorsToStat) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "Maximum number of file descriptors (%d) too large.", fd_lim); + ICING_LOG(ERROR) << "Maximum number of file descriptors (" << fd_lim + << ") too large."; fd_lim = kMaxFileDescriptorsToStat; } - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "Listing up to %d file descriptors.", fd_lim); + ICING_LOG(ERROR) << "Listing up to " << fd_lim << " file descriptors."; // Verify that /proc/self/fd is a directory. If not, procfs is not mounted or // inaccessible for some other reason. In that case, there's no point trying @@ -96,15 +94,12 @@ void LogOpenFileDescriptors() { if (len >= 0) { // Zero-terminate the buffer, because readlink() won't. target[len < target_size ? len : target_size - 1] = '\0'; - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> \"%s\"", fd, - target); + ICING_LOG(ERROR) << "fd " << fd << " -> \"" << target << "\""; } else if (errno != ENOENT) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> ? (errno=%d)", - fd, errno); + ICING_LOG(ERROR) << "fd " << fd << " -> ? (errno=" << errno << ")"; } } - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "File descriptor list complete."); + ICING_LOG(ERROR) << "File descriptor list complete."; } // Logs an error formatted as: desc1 + file_name + desc2 + strerror(errnum). @@ -113,8 +108,7 @@ void LogOpenFileDescriptors() { // file descriptors (see LogOpenFileDescriptors() above). void LogOpenError(const char* desc1, const char* file_name, const char* desc2, int errnum) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "%s%s%s%s", desc1, file_name, desc2, strerror(errnum)); + ICING_LOG(ERROR) << desc1 << file_name << desc2 << strerror(errnum); if (errnum == EMFILE) { LogOpenFileDescriptors(); } @@ -155,8 +149,7 @@ bool ListDirectoryInternal(const char* dir_name, } } if (closedir(dir) != 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "Error closing %s: %s", dir_name, strerror(errno)); + ICING_LOG(ERROR) << "Error closing " << dir_name << " " << strerror(errno); } return true; } @@ -179,11 +172,10 @@ void ScopedFd::reset(int fd) { const int64_t Filesystem::kBadFileSize; bool Filesystem::DeleteFile(const char* file_name) const { - ICING_VLOG(1) << IcingStringUtil::StringPrintf("Deleting file %s", file_name); + ICING_VLOG(1) << "Deleting file " << file_name; int ret = unlink(file_name); if (ret != 0 && errno != ENOENT) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "Deleting file %s failed: %s", file_name, strerror(errno)); + ICING_LOG(ERROR) << "Deleting file " << file_name << " failed: " << strerror(errno); return false; } return true; @@ -192,8 +184,7 @@ bool Filesystem::DeleteFile(const char* file_name) const { bool Filesystem::DeleteDirectory(const char* dir_name) const { int ret = rmdir(dir_name); if (ret != 0 && errno != ENOENT) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "Deleting directory %s failed: %s", dir_name, strerror(errno)); + ICING_LOG(ERROR) << "Deleting directory " << dir_name << " failed: " << strerror(errno); return false; } return true; @@ -206,8 +197,7 @@ bool Filesystem::DeleteDirectoryRecursively(const char* dir_name) const { if (errno == ENOENT) { return true; // If directory didn't exist, this was successful. } - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "Stat %s failed: %s", dir_name, strerror(errno)); + ICING_LOG(ERROR) << "Stat " << dir_name << " failed: " << strerror(errno); return false; } vector<std::string> entries; @@ -220,8 +210,7 @@ bool Filesystem::DeleteDirectoryRecursively(const char* dir_name) const { ++i) { std::string filename = std::string(dir_name) + '/' + *i; if (stat(filename.c_str(), &st) < 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "Stat %s failed: %s", filename.c_str(), strerror(errno)); + ICING_LOG(ERROR) << "Stat " << filename << " failed: " << strerror(errno); success = false; } else if (S_ISDIR(st.st_mode)) { success = DeleteDirectoryRecursively(filename.c_str()) && success; @@ -244,8 +233,7 @@ bool Filesystem::FileExists(const char* file_name) const { exists = S_ISREG(st.st_mode) != 0; } else { if (errno != ENOENT) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "Unable to stat file %s: %s", file_name, strerror(errno)); + ICING_LOG(ERROR) << "Unable to stat file " << file_name << ": " << strerror(errno); } exists = false; } @@ -259,8 +247,7 @@ bool Filesystem::DirectoryExists(const char* dir_name) const { exists = S_ISDIR(st.st_mode) != 0; } else { if (errno != ENOENT) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "Unable to stat directory %s: %s", dir_name, strerror(errno)); + ICING_LOG(ERROR) << "Unable to stat directory " << dir_name << ": " << strerror(errno); } exists = false; } @@ -316,8 +303,7 @@ bool Filesystem::GetMatchingFiles(const char* glob, int basename_idx = GetBasenameIndex(glob); if (basename_idx == 0) { // We need a directory. - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "Expected directory, no matching files for: %s", glob); + ICING_VLOG(1) << "Expected directory, no matching files for: " << glob; return true; } const char* basename_glob = glob + basename_idx; @@ -372,8 +358,7 @@ int Filesystem::OpenForRead(const char* file_name) const { int64_t Filesystem::GetFileSize(int fd) const { struct stat st; if (fstat(fd, &st) < 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s", - strerror(errno)); + ICING_LOG(ERROR) << "Unable to stat file: " << strerror(errno); return kBadFileSize; } return st.st_size; @@ -383,11 +368,9 @@ int64_t Filesystem::GetFileSize(const char* filename) const { struct stat st; if (stat(filename, &st) < 0) { if (errno == ENOENT) { - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "Unable to stat file %s: %s", filename, strerror(errno)); + ICING_VLOG(1) << "Unable to stat file " << filename << ": " << strerror(errno); } else { - ICING_LOG(WARNING) << IcingStringUtil::StringPrintf( - "Unable to stat file %s: %s", filename, strerror(errno)); + ICING_LOG(WARNING) << "Unable to stat file " << filename << ": " << strerror(errno); } return kBadFileSize; } @@ -396,8 +379,7 @@ int64_t Filesystem::GetFileSize(const char* filename) const { bool Filesystem::Truncate(int fd, int64_t new_size) const { if (ftruncate(fd, new_size) != 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "Unable to truncate file: %s", strerror(errno)); + ICING_LOG(ERROR) << "Unable to truncate file: " << strerror(errno); return false; } lseek(fd, new_size, SEEK_SET); @@ -416,8 +398,7 @@ bool Filesystem::Truncate(const char* filename, int64_t new_size) const { bool Filesystem::Grow(int fd, int64_t new_size) const { if (ftruncate(fd, new_size) != 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to grow file: %s", - strerror(errno)); + ICING_LOG(ERROR) << "Unable to grow file: " << strerror(errno); return false; } @@ -442,8 +423,7 @@ bool Filesystem::Write(int fd, const void* data, size_t data_size) const { size_t chunk_size = std::min<size_t>(write_len, 64u * 1024); ssize_t wrote = write(fd, data, chunk_size); if (wrote < 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s", - strerror(errno)); + ICING_LOG(ERROR) << "Bad write: " << strerror(errno); return false; } data = static_cast<const uint8_t*>(data) + wrote; @@ -521,8 +501,7 @@ bool Filesystem::CopyDirectory(const char* src_dir, const char* dst_dir, } } if (closedir(dir) != 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Error closing %s: %s", - src_dir, strerror(errno)); + ICING_LOG(ERROR) << "Error closing " << src_dir << ": " << strerror(errno); } return true; } @@ -535,8 +514,7 @@ bool Filesystem::PWrite(int fd, off_t offset, const void* data, size_t chunk_size = std::min<size_t>(write_len, 64u * 1024); ssize_t wrote = pwrite(fd, data, chunk_size, offset); if (wrote < 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s", - strerror(errno)); + ICING_LOG(ERROR) << "Bad write: " << strerror(errno); return false; } data = static_cast<const uint8_t*>(data) + wrote; @@ -561,8 +539,7 @@ bool Filesystem::PWrite(const char* filename, off_t offset, const void* data, bool Filesystem::Read(int fd, void* buf, size_t buf_size) const { ssize_t read_status = read(fd, buf, buf_size); if (read_status < 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad read: %s", - strerror(errno)); + ICING_LOG(ERROR) << "Bad read: " << strerror(errno); return false; } return true; @@ -582,8 +559,7 @@ bool Filesystem::Read(const char* filename, void* buf, size_t buf_size) const { bool Filesystem::PRead(int fd, void* buf, size_t buf_size, off_t offset) const { ssize_t read_status = pread(fd, buf, buf_size, offset); if (read_status < 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad read: %s", - strerror(errno)); + ICING_LOG(ERROR) << "Bad read: " << strerror(errno); return false; } return true; @@ -609,8 +585,7 @@ bool Filesystem::DataSync(int fd) const { #endif if (result < 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to sync data: %s", - strerror(errno)); + ICING_LOG(ERROR) << "Unable to sync data: " << strerror(errno); return false; } return true; @@ -618,9 +593,7 @@ bool Filesystem::DataSync(int fd) const { bool Filesystem::RenameFile(const char* old_name, const char* new_name) const { if (rename(old_name, new_name) < 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "Unable to rename file %s to %s: %s", old_name, new_name, - strerror(errno)); + ICING_LOG(ERROR) << "Unable to rename file " << old_name << " to " << new_name << ": " << strerror(errno); return false; } return true; @@ -658,8 +631,7 @@ bool Filesystem::CreateDirectory(const char* dir_name) const { if (mkdir(dir_name, S_IRUSR | S_IWUSR | S_IXUSR) == 0) { success = true; } else { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( - "Creating directory %s failed: %s", dir_name, strerror(errno)); + ICING_LOG(ERROR) << "Creating directory " << dir_name << " failed: " << strerror(errno); } } return success; @@ -679,8 +651,7 @@ bool Filesystem::CreateDirectoryRecursively(const char* dir_name) const { int64_t Filesystem::GetDiskUsage(int fd) const { struct stat st; if (fstat(fd, &st) < 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s", - strerror(errno)); + ICING_LOG(ERROR) << "Unable to stat file: " << strerror(errno); return kBadFileSize; } return st.st_blocks * kStatBlockSize; @@ -689,8 +660,7 @@ int64_t Filesystem::GetDiskUsage(int fd) const { int64_t Filesystem::GetFileDiskUsage(const char* path) const { struct stat st; if (stat(path, &st) != 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s", - path, strerror(errno)); + ICING_LOG(ERROR) << "Unable to stat " << path << ": " << strerror(errno); return kBadFileSize; } return st.st_blocks * kStatBlockSize; @@ -699,8 +669,7 @@ int64_t Filesystem::GetFileDiskUsage(const char* path) const { int64_t Filesystem::GetDiskUsage(const char* path) const { struct stat st; if (stat(path, &st) != 0) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s", - path, strerror(errno)); + ICING_LOG(ERROR) << "Unable to stat " << path << ": " << strerror(errno); return kBadFileSize; } int64_t result = st.st_blocks * kStatBlockSize; diff --git a/icing/file/persistent-hash-map.cc b/icing/file/persistent-hash-map.cc new file mode 100644 index 0000000..d20285a --- /dev/null +++ b/icing/file/persistent-hash-map.cc @@ -0,0 +1,534 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/file/persistent-hash-map.h" + +#include <cstring> +#include <memory> +#include <string> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/file/file-backed-vector.h" +#include "icing/file/memory-mapped-file.h" +#include "icing/util/crc32.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +// Helper function to check if there is no termination character '\0' in the +// key. +libtextclassifier3::Status ValidateKey(std::string_view key) { + if (key.find('\0') != std::string_view::npos) { // NOLINT + return absl_ports::InvalidArgumentError( + "Key cannot contain termination character '\\0'"); + } + return libtextclassifier3::Status::OK; +} + +// Helper function to convert the key to bucket index by hash. +// +// Returns: +// int32_t: A valid bucket index with range [0, num_buckets - 1]. +// INTERNAL_ERROR if num_buckets == 0 +libtextclassifier3::StatusOr<int32_t> HashKeyToBucketIndex( + std::string_view key, int32_t num_buckets) { + if (num_buckets == 0) { + return absl_ports::InternalError("Should not have empty bucket"); + } + return static_cast<int32_t>(std::hash<std::string_view>()(key) % num_buckets); +} + +// Helper function to PWrite crcs and info to metadata_file_path. Note that +// metadata_file_path will be the normal or temporary (for branching use when +// rehashing) metadata file path. +libtextclassifier3::Status WriteMetadata(const Filesystem& filesystem, + const char* metadata_file_path, + const PersistentHashMap::Crcs* crcs, + const PersistentHashMap::Info* info) { + ScopedFd sfd(filesystem.OpenForWrite(metadata_file_path)); + if (!sfd.is_valid()) { + return absl_ports::InternalError("Failed to create metadata file"); + } + + // Write crcs and info. File layout: <Crcs><Info> + if (!filesystem.PWrite(sfd.get(), PersistentHashMap::Crcs::kFileOffset, crcs, + sizeof(PersistentHashMap::Crcs))) { + return absl_ports::InternalError("Failed to write crcs into metadata file"); + } + // Note that PWrite won't change the file offset, so we need to specify + // the correct offset when writing Info. + if (!filesystem.PWrite(sfd.get(), PersistentHashMap::Info::kFileOffset, info, + sizeof(PersistentHashMap::Info))) { + return absl_ports::InternalError("Failed to write info into metadata file"); + } + + return libtextclassifier3::Status::OK; +} + +// Helper function to update checksums from info and storages to a Crcs +// instance. Note that storages will be the normal instances used by +// PersistentHashMap, or the temporary instances (for branching use when +// rehashing). +libtextclassifier3::Status UpdateChecksums( + PersistentHashMap::Crcs* crcs, PersistentHashMap::Info* info, + FileBackedVector<PersistentHashMap::Bucket>* bucket_storage, + FileBackedVector<PersistentHashMap::Entry>* entry_storage, + FileBackedVector<char>* kv_storage) { + // Compute crcs + ICING_ASSIGN_OR_RETURN(Crc32 bucket_storage_crc, + bucket_storage->ComputeChecksum()); + ICING_ASSIGN_OR_RETURN(Crc32 entry_storage_crc, + entry_storage->ComputeChecksum()); + ICING_ASSIGN_OR_RETURN(Crc32 kv_storage_crc, kv_storage->ComputeChecksum()); + + crcs->component_crcs.info_crc = info->ComputeChecksum().Get(); + crcs->component_crcs.bucket_storage_crc = bucket_storage_crc.Get(); + crcs->component_crcs.entry_storage_crc = entry_storage_crc.Get(); + crcs->component_crcs.kv_storage_crc = kv_storage_crc.Get(); + crcs->all_crc = crcs->component_crcs.ComputeChecksum().Get(); + + return libtextclassifier3::Status::OK; +} + +// Helper function to validate checksums. +libtextclassifier3::Status ValidateChecksums( + const PersistentHashMap::Crcs* crcs, const PersistentHashMap::Info* info, + FileBackedVector<PersistentHashMap::Bucket>* bucket_storage, + FileBackedVector<PersistentHashMap::Entry>* entry_storage, + FileBackedVector<char>* kv_storage) { + if (crcs->all_crc != crcs->component_crcs.ComputeChecksum().Get()) { + return absl_ports::FailedPreconditionError( + "Invalid all crc for PersistentHashMap"); + } + + if (crcs->component_crcs.info_crc != info->ComputeChecksum().Get()) { + return absl_ports::FailedPreconditionError( + "Invalid info crc for PersistentHashMap"); + } + + ICING_ASSIGN_OR_RETURN(Crc32 bucket_storage_crc, + bucket_storage->ComputeChecksum()); + if (crcs->component_crcs.bucket_storage_crc != bucket_storage_crc.Get()) { + return absl_ports::FailedPreconditionError( + "Mismatch crc with PersistentHashMap bucket storage"); + } + + ICING_ASSIGN_OR_RETURN(Crc32 entry_storage_crc, + entry_storage->ComputeChecksum()); + if (crcs->component_crcs.entry_storage_crc != entry_storage_crc.Get()) { + return absl_ports::FailedPreconditionError( + "Mismatch crc with PersistentHashMap entry storage"); + } + + ICING_ASSIGN_OR_RETURN(Crc32 kv_storage_crc, kv_storage->ComputeChecksum()); + if (crcs->component_crcs.kv_storage_crc != kv_storage_crc.Get()) { + return absl_ports::FailedPreconditionError( + "Mismatch crc with PersistentHashMap key value storage"); + } + + return libtextclassifier3::Status::OK; +} + +// Since metadata/bucket/entry storages should be branched when rehashing, we +// have to store them together under the same sub directory +// ("<base_dir>/<sub_dir>"). On the other hand, key-value storage won't be +// branched and it will be stored under <base_dir>. +// +// The following 4 methods are helper functions to get the correct path of +// metadata/bucket/entry/key-value storages, according to the given base +// directory and sub directory. +std::string GetMetadataFilePath(std::string_view base_dir, + std::string_view sub_dir) { + return absl_ports::StrCat(base_dir, "/", sub_dir, "/", + PersistentHashMap::kFilePrefix, ".m"); +} + +std::string GetBucketStorageFilePath(std::string_view base_dir, + std::string_view sub_dir) { + return absl_ports::StrCat(base_dir, "/", sub_dir, "/", + PersistentHashMap::kFilePrefix, ".b"); +} + +std::string GetEntryStorageFilePath(std::string_view base_dir, + std::string_view sub_dir) { + return absl_ports::StrCat(base_dir, "/", sub_dir, "/", + PersistentHashMap::kFilePrefix, ".e"); +} + +std::string GetKeyValueStorageFilePath(std::string_view base_dir) { + return absl_ports::StrCat(base_dir, "/", PersistentHashMap::kFilePrefix, + ".k"); +} + +} // namespace + +/* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> +PersistentHashMap::Create(const Filesystem& filesystem, + std::string_view base_dir, int32_t value_type_size, + int32_t max_load_factor_percent) { + if (!filesystem.FileExists( + GetMetadataFilePath(base_dir, kSubDirectory).c_str()) || + !filesystem.FileExists( + GetBucketStorageFilePath(base_dir, kSubDirectory).c_str()) || + !filesystem.FileExists( + GetEntryStorageFilePath(base_dir, kSubDirectory).c_str()) || + !filesystem.FileExists(GetKeyValueStorageFilePath(base_dir).c_str())) { + // TODO: erase all files if missing any. + return InitializeNewFiles(filesystem, base_dir, value_type_size, + max_load_factor_percent); + } + return InitializeExistingFiles(filesystem, base_dir, value_type_size, + max_load_factor_percent); +} + +PersistentHashMap::~PersistentHashMap() { + if (!PersistToDisk().ok()) { + ICING_LOG(WARNING) + << "Failed to persist hash map to disk while destructing " << base_dir_; + } +} + +libtextclassifier3::Status PersistentHashMap::Put(std::string_view key, + const void* value) { + ICING_RETURN_IF_ERROR(ValidateKey(key)); + ICING_ASSIGN_OR_RETURN( + int32_t bucket_idx, + HashKeyToBucketIndex(key, bucket_storage_->num_elements())); + + ICING_ASSIGN_OR_RETURN(int32_t target_entry_idx, + FindEntryIndexByKey(bucket_idx, key)); + if (target_entry_idx == Entry::kInvalidIndex) { + // If not found, then insert new key value pair. + return Insert(bucket_idx, key, value); + } + + // Otherwise, overwrite the value. + ICING_ASSIGN_OR_RETURN(const Entry* entry, + entry_storage_->Get(target_entry_idx)); + + int32_t kv_len = key.length() + 1 + info()->value_type_size; + int32_t value_offset = key.length() + 1; + ICING_ASSIGN_OR_RETURN( + typename FileBackedVector<char>::MutableArrayView mutable_kv_arr, + kv_storage_->GetMutable(entry->key_value_index(), kv_len)); + // It is the same key and value_size is fixed, so we can directly overwrite + // serialized value. + mutable_kv_arr.SetArray(value_offset, reinterpret_cast<const char*>(value), + info()->value_type_size); + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status PersistentHashMap::GetOrPut(std::string_view key, + void* next_value) { + ICING_RETURN_IF_ERROR(ValidateKey(key)); + ICING_ASSIGN_OR_RETURN( + int32_t bucket_idx, + HashKeyToBucketIndex(key, bucket_storage_->num_elements())); + + ICING_ASSIGN_OR_RETURN(int32_t target_entry_idx, + FindEntryIndexByKey(bucket_idx, key)); + if (target_entry_idx == Entry::kInvalidIndex) { + // If not found, then insert new key value pair. + return Insert(bucket_idx, key, next_value); + } + + // Otherwise, copy the hash map value into next_value. + return CopyEntryValue(target_entry_idx, next_value); +} + +libtextclassifier3::Status PersistentHashMap::Get(std::string_view key, + void* value) const { + ICING_RETURN_IF_ERROR(ValidateKey(key)); + ICING_ASSIGN_OR_RETURN( + int32_t bucket_idx, + HashKeyToBucketIndex(key, bucket_storage_->num_elements())); + + ICING_ASSIGN_OR_RETURN(int32_t target_entry_idx, + FindEntryIndexByKey(bucket_idx, key)); + if (target_entry_idx == Entry::kInvalidIndex) { + return absl_ports::NotFoundError( + absl_ports::StrCat("Key not found in PersistentHashMap ", base_dir_)); + } + + return CopyEntryValue(target_entry_idx, value); +} + +libtextclassifier3::Status PersistentHashMap::PersistToDisk() { + ICING_RETURN_IF_ERROR(bucket_storage_->PersistToDisk()); + ICING_RETURN_IF_ERROR(entry_storage_->PersistToDisk()); + ICING_RETURN_IF_ERROR(kv_storage_->PersistToDisk()); + + ICING_RETURN_IF_ERROR(UpdateChecksums(crcs(), info(), bucket_storage_.get(), + entry_storage_.get(), + kv_storage_.get())); + // Changes should have been applied to the underlying file when using + // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an + // extra safety step to ensure they are written out. + ICING_RETURN_IF_ERROR(metadata_mmapped_file_->PersistToDisk()); + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetDiskUsage() const { + ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_disk_usage, + bucket_storage_->GetDiskUsage()); + ICING_ASSIGN_OR_RETURN(int64_t entry_storage_disk_usage, + entry_storage_->GetDiskUsage()); + ICING_ASSIGN_OR_RETURN(int64_t kv_storage_disk_usage, + kv_storage_->GetDiskUsage()); + + int64_t total = bucket_storage_disk_usage + entry_storage_disk_usage + + kv_storage_disk_usage; + Filesystem::IncrementByOrSetInvalid( + filesystem_->GetDiskUsage( + GetMetadataFilePath(base_dir_, kSubDirectory).c_str()), + &total); + + if (total < 0 || total == Filesystem::kBadFileSize) { + return absl_ports::InternalError( + "Failed to get disk usage of PersistentHashMap"); + } + return total; +} + +libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetElementsSize() + const { + ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_elements_size, + bucket_storage_->GetElementsFileSize()); + ICING_ASSIGN_OR_RETURN(int64_t entry_storage_elements_size, + entry_storage_->GetElementsFileSize()); + ICING_ASSIGN_OR_RETURN(int64_t kv_storage_elements_size, + kv_storage_->GetElementsFileSize()); + return bucket_storage_elements_size + entry_storage_elements_size + + kv_storage_elements_size; +} + +libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeChecksum() { + Crcs* crcs_ptr = crcs(); + ICING_RETURN_IF_ERROR(UpdateChecksums(crcs_ptr, info(), bucket_storage_.get(), + entry_storage_.get(), + kv_storage_.get())); + return Crc32(crcs_ptr->all_crc); +} + +/* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> +PersistentHashMap::InitializeNewFiles(const Filesystem& filesystem, + std::string_view base_dir, + int32_t value_type_size, + int32_t max_load_factor_percent) { + // Create directory. + const std::string dir_path = absl_ports::StrCat(base_dir, "/", kSubDirectory); + if (!filesystem.CreateDirectoryRecursively(dir_path.c_str())) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to create directory: ", dir_path)); + } + + // Initialize 3 storages + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<FileBackedVector<Bucket>> bucket_storage, + FileBackedVector<Bucket>::Create( + filesystem, GetBucketStorageFilePath(base_dir, kSubDirectory), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<FileBackedVector<Entry>> entry_storage, + FileBackedVector<Entry>::Create( + filesystem, GetEntryStorageFilePath(base_dir, kSubDirectory), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + ICING_ASSIGN_OR_RETURN(std::unique_ptr<FileBackedVector<char>> kv_storage, + FileBackedVector<char>::Create( + filesystem, GetKeyValueStorageFilePath(base_dir), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + + // Initialize one bucket. + ICING_RETURN_IF_ERROR(bucket_storage->Append(Bucket())); + ICING_RETURN_IF_ERROR(bucket_storage->PersistToDisk()); + + // Create and initialize new info + Info new_info; + new_info.version = kVersion; + new_info.value_type_size = value_type_size; + new_info.max_load_factor_percent = max_load_factor_percent; + new_info.num_deleted_entries = 0; + new_info.num_deleted_key_value_bytes = 0; + + // Compute checksums + Crcs new_crcs; + ICING_RETURN_IF_ERROR(UpdateChecksums(&new_crcs, &new_info, + bucket_storage.get(), + entry_storage.get(), kv_storage.get())); + + const std::string metadata_file_path = + GetMetadataFilePath(base_dir, kSubDirectory); + // Write new metadata file + ICING_RETURN_IF_ERROR(WriteMetadata(filesystem, metadata_file_path.c_str(), + &new_crcs, &new_info)); + + // Mmap the content of the crcs and info. + auto metadata_mmapped_file = std::make_unique<MemoryMappedFile>( + filesystem, metadata_file_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC); + ICING_RETURN_IF_ERROR(metadata_mmapped_file->Remap( + /*file_offset=*/0, /*mmap_size=*/sizeof(Crcs) + sizeof(Info))); + + return std::unique_ptr<PersistentHashMap>(new PersistentHashMap( + filesystem, base_dir, std::move(metadata_mmapped_file), + std::move(bucket_storage), std::move(entry_storage), + std::move(kv_storage))); +} + +/* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> +PersistentHashMap::InitializeExistingFiles(const Filesystem& filesystem, + std::string_view base_dir, + int32_t value_type_size, + int32_t max_load_factor_percent) { + // Mmap the content of the crcs and info. + auto metadata_mmapped_file = std::make_unique<MemoryMappedFile>( + filesystem, GetMetadataFilePath(base_dir, kSubDirectory), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC); + ICING_RETURN_IF_ERROR(metadata_mmapped_file->Remap( + /*file_offset=*/0, /*mmap_size=*/sizeof(Crcs) + sizeof(Info))); + + // Initialize 3 storages + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<FileBackedVector<Bucket>> bucket_storage, + FileBackedVector<Bucket>::Create( + filesystem, GetBucketStorageFilePath(base_dir, kSubDirectory), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<FileBackedVector<Entry>> entry_storage, + FileBackedVector<Entry>::Create( + filesystem, GetEntryStorageFilePath(base_dir, kSubDirectory), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + ICING_ASSIGN_OR_RETURN(std::unique_ptr<FileBackedVector<char>> kv_storage, + FileBackedVector<char>::Create( + filesystem, GetKeyValueStorageFilePath(base_dir), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + + Crcs* crcs_ptr = reinterpret_cast<Crcs*>( + metadata_mmapped_file->mutable_region() + Crcs::kFileOffset); + Info* info_ptr = reinterpret_cast<Info*>( + metadata_mmapped_file->mutable_region() + Info::kFileOffset); + + // Value type size should be consistent. + if (value_type_size != info_ptr->value_type_size) { + return absl_ports::FailedPreconditionError("Incorrect value type size"); + } + + // Validate checksums of info and 3 storages. + ICING_RETURN_IF_ERROR( + ValidateChecksums(crcs_ptr, info_ptr, bucket_storage.get(), + entry_storage.get(), kv_storage.get())); + + // Allow max_load_factor_percent_ change. + if (max_load_factor_percent != info_ptr->max_load_factor_percent) { + ICING_VLOG(2) << "Changing max_load_factor_percent from " << info_ptr->max_load_factor_percent << " to " << max_load_factor_percent; + + info_ptr->max_load_factor_percent = max_load_factor_percent; + crcs_ptr->component_crcs.info_crc = info_ptr->ComputeChecksum().Get(); + crcs_ptr->all_crc = crcs_ptr->component_crcs.ComputeChecksum().Get(); + ICING_RETURN_IF_ERROR(metadata_mmapped_file->PersistToDisk()); + // TODO(b/193919210): rehash if needed + } + + return std::unique_ptr<PersistentHashMap>(new PersistentHashMap( + filesystem, base_dir, std::move(metadata_mmapped_file), + std::move(bucket_storage), std::move(entry_storage), + std::move(kv_storage))); +} + +libtextclassifier3::StatusOr<int32_t> PersistentHashMap::FindEntryIndexByKey( + int32_t bucket_idx, std::string_view key) const { + // Iterate all entries in the bucket, compare with key, and return the entry + // index if exists. + ICING_ASSIGN_OR_RETURN(const Bucket* bucket, + bucket_storage_->Get(bucket_idx)); + int32_t curr_entry_idx = bucket->head_entry_index(); + while (curr_entry_idx != Entry::kInvalidIndex) { + ICING_ASSIGN_OR_RETURN(const Entry* entry, + entry_storage_->Get(curr_entry_idx)); + if (entry->key_value_index() == kInvalidKVIndex) { + ICING_LOG(ERROR) << "Got an invalid key value index in the persistent " + "hash map bucket. This shouldn't happen"; + return absl_ports::InternalError("Unexpected invalid key value index"); + } + ICING_ASSIGN_OR_RETURN(const char* kv_arr, + kv_storage_->Get(entry->key_value_index())); + if (key.compare(kv_arr) == 0) { + return curr_entry_idx; + } + + curr_entry_idx = entry->next_entry_index(); + } + + return curr_entry_idx; +} + +libtextclassifier3::Status PersistentHashMap::CopyEntryValue( + int32_t entry_idx, void* value) const { + ICING_ASSIGN_OR_RETURN(const Entry* entry, entry_storage_->Get(entry_idx)); + + ICING_ASSIGN_OR_RETURN(const char* kv_arr, + kv_storage_->Get(entry->key_value_index())); + int32_t value_offset = strlen(kv_arr) + 1; + memcpy(value, kv_arr + value_offset, info()->value_type_size); + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status PersistentHashMap::Insert(int32_t bucket_idx, + std::string_view key, + const void* value) { + // If size() + 1 exceeds Entry::kMaxNumEntries, then return error. + if (size() > Entry::kMaxNumEntries - 1) { + return absl_ports::ResourceExhaustedError("Cannot insert new entry"); + } + + ICING_ASSIGN_OR_RETURN( + typename FileBackedVector<Bucket>::MutableView mutable_bucket, + bucket_storage_->GetMutable(bucket_idx)); + + // Append new key value. + int32_t new_kv_idx = kv_storage_->num_elements(); + int32_t kv_len = key.size() + 1 + info()->value_type_size; + int32_t value_offset = key.size() + 1; + ICING_ASSIGN_OR_RETURN( + typename FileBackedVector<char>::MutableArrayView mutable_new_kv_arr, + kv_storage_->Allocate(kv_len)); + mutable_new_kv_arr.SetArray(/*idx=*/0, key.data(), key.size()); + mutable_new_kv_arr.SetArray(/*idx=*/key.size(), "\0", 1); + mutable_new_kv_arr.SetArray(/*idx=*/value_offset, + reinterpret_cast<const char*>(value), + info()->value_type_size); + + // Append new entry. + int32_t new_entry_idx = entry_storage_->num_elements(); + ICING_RETURN_IF_ERROR(entry_storage_->Append( + Entry(new_kv_idx, mutable_bucket.Get().head_entry_index()))); + mutable_bucket.Get().set_head_entry_index(new_entry_idx); + + // TODO: rehash if needed + + return libtextclassifier3::Status::OK; +} + +} // namespace lib +} // namespace icing diff --git a/icing/file/persistent-hash-map.h b/icing/file/persistent-hash-map.h new file mode 100644 index 0000000..24a47ea --- /dev/null +++ b/icing/file/persistent-hash-map.h @@ -0,0 +1,383 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_FILE_PERSISTENT_HASH_MAP_H_ +#define ICING_FILE_PERSISTENT_HASH_MAP_H_ + +#include <cstdint> +#include <memory> +#include <string> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/file/file-backed-vector.h" +#include "icing/file/filesystem.h" +#include "icing/file/memory-mapped-file.h" +#include "icing/util/crc32.h" + +namespace icing { +namespace lib { + +// Low level persistent hash map. +// It supports variant length serialized key + fixed length serialized value. +// Key and value can be any type, but callers should serialize key/value by +// themselves and pass raw bytes into the hash map, and the serialized key +// should not contain termination character '\0'. +class PersistentHashMap { + public: + // Crcs and Info will be written into the metadata file. + // File layout: <Crcs><Info> + // Crcs + struct Crcs { + static constexpr int32_t kFileOffset = 0; + + struct ComponentCrcs { + uint32_t info_crc; + uint32_t bucket_storage_crc; + uint32_t entry_storage_crc; + uint32_t kv_storage_crc; + + bool operator==(const ComponentCrcs& other) const { + return info_crc == other.info_crc && + bucket_storage_crc == other.bucket_storage_crc && + entry_storage_crc == other.entry_storage_crc && + kv_storage_crc == other.kv_storage_crc; + } + + Crc32 ComputeChecksum() const { + return Crc32(std::string_view(reinterpret_cast<const char*>(this), + sizeof(ComponentCrcs))); + } + } __attribute__((packed)); + + bool operator==(const Crcs& other) const { + return all_crc == other.all_crc && component_crcs == other.component_crcs; + } + + uint32_t all_crc; + ComponentCrcs component_crcs; + } __attribute__((packed)); + static_assert(sizeof(Crcs) == 20, ""); + + // Info + struct Info { + static constexpr int32_t kFileOffset = static_cast<int32_t>(sizeof(Crcs)); + + int32_t version; + int32_t value_type_size; + int32_t max_load_factor_percent; + int32_t num_deleted_entries; + int32_t num_deleted_key_value_bytes; + + Crc32 ComputeChecksum() const { + return Crc32( + std::string_view(reinterpret_cast<const char*>(this), sizeof(Info))); + } + } __attribute__((packed)); + static_assert(sizeof(Info) == 20, ""); + + // Bucket + class Bucket { + public: + // Absolute max # of buckets allowed. Since max file size on Android is + // 2^31-1, we can at most have ~2^29 buckets. To make it power of 2, round + // it down to 2^28. Also since we're using FileBackedVector to store + // buckets, add some static_asserts to ensure numbers here are compatible + // with FileBackedVector. + static constexpr int32_t kMaxNumBuckets = 1 << 28; + + explicit Bucket(int32_t head_entry_index = Entry::kInvalidIndex) + : head_entry_index_(head_entry_index) {} + + // For FileBackedVector + bool operator==(const Bucket& other) const { + return head_entry_index_ == other.head_entry_index_; + } + + int32_t head_entry_index() const { return head_entry_index_; } + void set_head_entry_index(int32_t head_entry_index) { + head_entry_index_ = head_entry_index; + } + + private: + int32_t head_entry_index_; + } __attribute__((packed)); + static_assert(sizeof(Bucket) == 4, ""); + static_assert(sizeof(Bucket) == FileBackedVector<Bucket>::kElementTypeSize, + "Bucket type size is inconsistent with FileBackedVector " + "element type size"); + static_assert(Bucket::kMaxNumBuckets <= + (FileBackedVector<Bucket>::kMaxFileSize - + FileBackedVector<Bucket>::Header::kHeaderSize) / + FileBackedVector<Bucket>::kElementTypeSize, + "Max # of buckets cannot fit into FileBackedVector"); + + // Entry + class Entry { + public: + // Absolute max # of entries allowed. Since max file size on Android is + // 2^31-1, we can at most have ~2^28 entries. To make it power of 2, round + // it down to 2^27. Also since we're using FileBackedVector to store + // entries, add some static_asserts to ensure numbers here are compatible + // with FileBackedVector. + // + // Still the actual max # of entries are determined by key-value storage, + // since length of the key varies and affects # of actual key-value pairs + // that can be stored. + static constexpr int32_t kMaxNumEntries = 1 << 27; + static constexpr int32_t kMaxIndex = kMaxNumEntries - 1; + static constexpr int32_t kInvalidIndex = -1; + + explicit Entry(int32_t key_value_index, int32_t next_entry_index) + : key_value_index_(key_value_index), + next_entry_index_(next_entry_index) {} + + bool operator==(const Entry& other) const { + return key_value_index_ == other.key_value_index_ && + next_entry_index_ == other.next_entry_index_; + } + + int32_t key_value_index() const { return key_value_index_; } + void set_key_value_index(int32_t key_value_index) { + key_value_index_ = key_value_index; + } + + int32_t next_entry_index() const { return next_entry_index_; } + void set_next_entry_index(int32_t next_entry_index) { + next_entry_index_ = next_entry_index; + } + + private: + int32_t key_value_index_; + int32_t next_entry_index_; + } __attribute__((packed)); + static_assert(sizeof(Entry) == 8, ""); + static_assert(sizeof(Entry) == FileBackedVector<Entry>::kElementTypeSize, + "Entry type size is inconsistent with FileBackedVector " + "element type size"); + static_assert(Entry::kMaxNumEntries <= + (FileBackedVector<Entry>::kMaxFileSize - + FileBackedVector<Entry>::Header::kHeaderSize) / + FileBackedVector<Entry>::kElementTypeSize, + "Max # of entries cannot fit into FileBackedVector"); + + // Key-value serialized type + static constexpr int32_t kMaxKVTotalByteSize = + (FileBackedVector<char>::kMaxFileSize - + FileBackedVector<char>::Header::kHeaderSize) / + FileBackedVector<char>::kElementTypeSize; + static constexpr int32_t kMaxKVIndex = kMaxKVTotalByteSize - 1; + static constexpr int32_t kInvalidKVIndex = -1; + static_assert(sizeof(char) == FileBackedVector<char>::kElementTypeSize, + "Char type size is inconsistent with FileBackedVector element " + "type size"); + + static constexpr int32_t kVersion = 1; + static constexpr int32_t kDefaultMaxLoadFactorPercent = 75; + + static constexpr std::string_view kFilePrefix = "persistent_hash_map"; + // Only metadata, bucket, entry files are stored under this sub-directory, for + // rehashing branching use. + static constexpr std::string_view kSubDirectory = "dynamic"; + + // Creates a new PersistentHashMap to read/write/delete key value pairs. + // + // filesystem: Object to make system level calls + // base_dir: Specifies the directory for all persistent hash map related + // sub-directory and files to be stored. If base_dir doesn't exist, + // then PersistentHashMap will automatically create it. If files + // exist, then it will initialize the hash map from existing files. + // value_type_size: (fixed) size of the serialized value type for hash map. + // max_load_factor_percent: percentage of the max loading for the hash map. + // load_factor_percent = 100 * num_keys / num_buckets + // If load_factor_percent exceeds + // max_load_factor_percent, then rehash will be + // invoked (and # of buckets will be doubled). + // Note that load_factor_percent exceeding 100 is + // considered valid. + // + // Returns: + // FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored + // checksum. + // INTERNAL_ERROR on I/O errors. + // Any FileBackedVector errors. + static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> + Create(const Filesystem& filesystem, std::string_view base_dir, + int32_t value_type_size, + int32_t max_load_factor_percent = kDefaultMaxLoadFactorPercent); + + ~PersistentHashMap(); + + // Update a key value pair. If key does not exist, then insert (key, value) + // into the storage. Otherwise overwrite the value into the storage. + // + // REQUIRES: the buffer pointed to by value must be of value_size() + // + // Returns: + // OK on success + // RESOURCE_EXHAUSTED_ERROR if # of entries reach kMaxNumEntries + // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0') + // INTERNAL_ERROR on I/O error or any data inconsistency + // Any FileBackedVector errors + libtextclassifier3::Status Put(std::string_view key, const void* value); + + // If key does not exist, then insert (key, next_value) into the storage. + // Otherwise, copy the hash map value into next_value. + // + // REQUIRES: the buffer pointed to by next_value must be of value_size() + // + // Returns: + // OK on success + // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0') + // INTERNAL_ERROR on I/O error or any data inconsistency + // Any FileBackedVector errors + libtextclassifier3::Status GetOrPut(std::string_view key, void* next_value); + + // Get the value by key from the storage. If key exists, then copy the hash + // map value into into value buffer. Otherwise, return NOT_FOUND_ERROR. + // + // REQUIRES: the buffer pointed to by value must be of value_size() + // + // Returns: + // OK on success + // NOT_FOUND_ERROR if the key doesn't exist + // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0') + // INTERNAL_ERROR on I/O error or any data inconsistency + // Any FileBackedVector errors + libtextclassifier3::Status Get(std::string_view key, void* value) const; + + // Flushes content to underlying files. + // + // Returns: + // OK on success + // INTERNAL_ERROR on I/O error + libtextclassifier3::Status PersistToDisk(); + + // Calculates and returns the disk usage (metadata + 3 storages total file + // size) in bytes. + // + // Returns: + // Disk usage on success + // INTERNAL_ERROR on I/O error + libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const; + + // Returns the total file size of the all the elements held in the persistent + // hash map. File size is in bytes. This excludes the size of any internal + // metadata, i.e. crcs/info of persistent hash map, file backed vector's + // header. + // + // Returns: + // File size on success + // INTERNAL_ERROR on I/O error + libtextclassifier3::StatusOr<int64_t> GetElementsSize() const; + + // Updates all checksums of the persistent hash map components and returns + // all_crc. + // + // Returns: + // Crc of all components (all_crc) on success + // INTERNAL_ERROR if any data inconsistency + libtextclassifier3::StatusOr<Crc32> ComputeChecksum(); + + int32_t size() const { + return entry_storage_->num_elements() - info()->num_deleted_entries; + } + + bool empty() const { return size() == 0; } + + private: + explicit PersistentHashMap( + const Filesystem& filesystem, std::string_view base_dir, + std::unique_ptr<MemoryMappedFile> metadata_mmapped_file, + std::unique_ptr<FileBackedVector<Bucket>> bucket_storage, + std::unique_ptr<FileBackedVector<Entry>> entry_storage, + std::unique_ptr<FileBackedVector<char>> kv_storage) + : filesystem_(&filesystem), + base_dir_(base_dir), + metadata_mmapped_file_(std::move(metadata_mmapped_file)), + bucket_storage_(std::move(bucket_storage)), + entry_storage_(std::move(entry_storage)), + kv_storage_(std::move(kv_storage)) {} + + static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> + InitializeNewFiles(const Filesystem& filesystem, std::string_view base_dir, + int32_t value_type_size, int32_t max_load_factor_percent); + + static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> + InitializeExistingFiles(const Filesystem& filesystem, + std::string_view base_dir, int32_t value_type_size, + int32_t max_load_factor_percent); + + // Find the index of the key entry from a bucket (specified by bucket index). + // The caller should specify the desired bucket index. + // + // Returns: + // int32_t: on success, the index of the entry, or Entry::kInvalidIndex if + // not found + // INTERNAL_ERROR if any content inconsistency + // Any FileBackedVector errors + libtextclassifier3::StatusOr<int32_t> FindEntryIndexByKey( + int32_t bucket_idx, std::string_view key) const; + + // Copy the hash map value of the entry into value buffer. + // + // REQUIRES: entry_idx should be valid. + // REQUIRES: the buffer pointed to by value must be of value_size() + // + // Returns: + // OK on success + // Any FileBackedVector errors + libtextclassifier3::Status CopyEntryValue(int32_t entry_idx, + void* value) const; + + // Insert a new key value pair into a bucket (specified by the bucket index). + // The caller should specify the desired bucket index and make sure that the + // key is not present in the hash map before calling. + // + // Returns: + // OK on success + // Any FileBackedVector errors + libtextclassifier3::Status Insert(int32_t bucket_idx, std::string_view key, + const void* value); + + Crcs* crcs() { + return reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() + + Crcs::kFileOffset); + } + + Info* info() { + return reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() + + Info::kFileOffset); + } + + const Info* info() const { + return reinterpret_cast<const Info*>(metadata_mmapped_file_->region() + + Info::kFileOffset); + } + + const Filesystem* filesystem_; + std::string base_dir_; + + std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_; + + // Storages + std::unique_ptr<FileBackedVector<Bucket>> bucket_storage_; + std::unique_ptr<FileBackedVector<Entry>> entry_storage_; + std::unique_ptr<FileBackedVector<char>> kv_storage_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_FILE_PERSISTENT_HASH_MAP_H_ diff --git a/icing/file/persistent-hash-map_test.cc b/icing/file/persistent-hash-map_test.cc new file mode 100644 index 0000000..fb15175 --- /dev/null +++ b/icing/file/persistent-hash-map_test.cc @@ -0,0 +1,662 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/file/persistent-hash-map.h" + +#include <cstring> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/file/file-backed-vector.h" +#include "icing/file/filesystem.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" +#include "icing/util/crc32.h" + +namespace icing { +namespace lib { + +namespace { + +static constexpr int32_t kCorruptedValueOffset = 3; + +using ::testing::Eq; +using ::testing::HasSubstr; +using ::testing::IsEmpty; +using ::testing::Not; +using ::testing::Pointee; +using ::testing::SizeIs; + +using Bucket = PersistentHashMap::Bucket; +using Crcs = PersistentHashMap::Crcs; +using Entry = PersistentHashMap::Entry; +using Info = PersistentHashMap::Info; + +class PersistentHashMapTest : public ::testing::Test { + protected: + void SetUp() override { + base_dir_ = GetTestTempDir() + "/persistent_hash_map_test"; + } + + void TearDown() override { + filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); + } + + std::vector<char> Serialize(int val) { + std::vector<char> ret(sizeof(val)); + memcpy(ret.data(), &val, sizeof(val)); + return ret; + } + + libtextclassifier3::StatusOr<int> GetValueByKey( + PersistentHashMap* persistent_hash_map, std::string_view key) { + int val; + ICING_RETURN_IF_ERROR(persistent_hash_map->Get(key, &val)); + return val; + } + + Filesystem filesystem_; + std::string base_dir_; +}; + +TEST_F(PersistentHashMapTest, InvalidBaseDir) { + EXPECT_THAT(PersistentHashMap::Create(filesystem_, "/dev/null", + /*value_type_size=*/sizeof(int)), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); +} + +TEST_F(PersistentHashMapTest, InitializeNewFiles) { + { + ASSERT_FALSE(filesystem_.DirectoryExists(base_dir_.c_str())); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int))); + EXPECT_THAT(persistent_hash_map, Pointee(IsEmpty())); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); + } + + // Metadata file should be initialized correctly for both info and crcs + // sections. + const std::string metadata_file_path = + absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/", + PersistentHashMap::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_TRUE(metadata_sfd.is_valid()); + + // Check info section + Info info; + ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info), + Info::kFileOffset)); + EXPECT_THAT(info.version, Eq(PersistentHashMap::kVersion)); + EXPECT_THAT(info.value_type_size, Eq(sizeof(int))); + EXPECT_THAT(info.max_load_factor_percent, + Eq(PersistentHashMap::kDefaultMaxLoadFactorPercent)); + EXPECT_THAT(info.num_deleted_entries, Eq(0)); + EXPECT_THAT(info.num_deleted_key_value_bytes, Eq(0)); + + // Check crcs section + Crcs crcs; + ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs), + Crcs::kFileOffset)); + // # of elements in bucket_storage should be 1, so it should have non-zero + // crc value. + EXPECT_THAT(crcs.component_crcs.bucket_storage_crc, Not(Eq(0))); + // Other empty file backed vectors should have 0 crc value. + EXPECT_THAT(crcs.component_crcs.entry_storage_crc, Eq(0)); + EXPECT_THAT(crcs.component_crcs.kv_storage_crc, Eq(0)); + EXPECT_THAT(crcs.component_crcs.info_crc, + Eq(Crc32(std::string_view(reinterpret_cast<const char*>(&info), + sizeof(Info))) + .Get())); + EXPECT_THAT(crcs.all_crc, + Eq(Crc32(std::string_view( + reinterpret_cast<const char*>(&crcs.component_crcs), + sizeof(Crcs::ComponentCrcs))) + .Get())); +} + +TEST_F(PersistentHashMapTest, + TestInitializationFailsWithoutPersistToDiskOrDestruction) { + // Create new persistent hash map + // Set max_load_factor_percent as 1000. Load factor percent is calculated as + // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of + // buckets in an empty PersistentHashMap) and a max_load_factor_percent of + // 1000, we would allow the insertion of up to 10 keys before rehashing, to + // avoid PersistToDisk being called implicitly by rehashing. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + /*max_load_factor_percent=*/1000)); + + // Put some key value pairs. + ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data())); + ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data())); + // TODO(b/193919210): call Delete() to change PersistentHashMap header + + ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(2))); + ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1)); + ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2)); + + // Without calling PersistToDisk, checksums will not be recomputed or synced + // to disk, so initializing another instance on the same files should fail. + EXPECT_THAT(PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + /*max_load_factor_percent=*/1000), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); +} + +TEST_F(PersistentHashMapTest, TestInitializationSucceedsWithPersistToDisk) { + // Create new persistent hash map + // Set max_load_factor_percent as 1000. Load factor percent is calculated as + // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of + // buckets in an empty PersistentHashMap) and a max_load_factor_percent of + // 1000, we would allow the insertion of up to 10 keys before rehashing, to + // avoid PersistToDisk being called implicitly by rehashing. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map1, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + /*max_load_factor_percent=*/1000)); + + // Put some key value pairs. + ICING_ASSERT_OK(persistent_hash_map1->Put("a", Serialize(1).data())); + ICING_ASSERT_OK(persistent_hash_map1->Put("b", Serialize(2).data())); + // TODO(b/193919210): call Delete() to change PersistentHashMap header + + ASSERT_THAT(persistent_hash_map1, Pointee(SizeIs(2))); + ASSERT_THAT(GetValueByKey(persistent_hash_map1.get(), "a"), IsOkAndHolds(1)); + ASSERT_THAT(GetValueByKey(persistent_hash_map1.get(), "b"), IsOkAndHolds(2)); + + // After calling PersistToDisk, all checksums should be recomputed and synced + // correctly to disk, so initializing another instance on the same files + // should succeed, and we should be able to get the same contents. + ICING_EXPECT_OK(persistent_hash_map1->PersistToDisk()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map2, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + /*max_load_factor_percent=*/1000)); + EXPECT_THAT(persistent_hash_map2, Pointee(SizeIs(2))); + EXPECT_THAT(GetValueByKey(persistent_hash_map2.get(), "a"), IsOkAndHolds(1)); + EXPECT_THAT(GetValueByKey(persistent_hash_map2.get(), "b"), IsOkAndHolds(2)); +} + +TEST_F(PersistentHashMapTest, TestInitializationSucceedsAfterDestruction) { + { + // Create new persistent hash map + // Set max_load_factor_percent as 1000. Load factor percent is calculated as + // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of + // buckets in an empty PersistentHashMap) and a max_load_factor_percent of + // 1000, we would allow the insertion of up to 10 keys before rehashing, to + // avoid PersistToDisk being called implicitly by rehashing. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + /*max_load_factor_percent=*/1000)); + ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data())); + ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data())); + // TODO(b/193919210): call Delete() to change PersistentHashMap header + + ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(2))); + ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1)); + ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2)); + } + + { + // The previous instance went out of scope and was destructed. Although we + // didn't call PersistToDisk explicitly, the destructor should invoke it and + // thus initializing another instance on the same files should succeed, and + // we should be able to get the same contents. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + /*max_load_factor_percent=*/1000)); + EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(2))); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1)); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2)); + } +} + +TEST_F(PersistentHashMapTest, + InitializeExistingFilesWithDifferentValueTypeSizeShouldFail) { + { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int))); + ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data())); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); + } + + { + // Attempt to create the persistent hash map with different value type size. + // This should fail. + ASSERT_THAT(sizeof(char), Not(Eq(sizeof(int)))); + libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> + persistent_hash_map_or = PersistentHashMap::Create( + filesystem_, base_dir_, /*value_type_size=*/sizeof(char)); + EXPECT_THAT(persistent_hash_map_or, + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(persistent_hash_map_or.status().error_message(), + HasSubstr("Incorrect value type size")); + } +} + +TEST_F(PersistentHashMapTest, InitializeExistingFilesWithWrongAllCrc) { + { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int))); + ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data())); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); + } + + const std::string metadata_file_path = + absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/", + PersistentHashMap::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_TRUE(metadata_sfd.is_valid()); + + Crcs crcs; + ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs), + Crcs::kFileOffset)); + + // Manually corrupt all_crc + crcs.all_crc += kCorruptedValueOffset; + ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs, + sizeof(Crcs))); + metadata_sfd.reset(); + + { + // Attempt to create the persistent hash map with metadata containing + // corrupted all_crc. This should fail. + libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> + persistent_hash_map_or = PersistentHashMap::Create( + filesystem_, base_dir_, /*value_type_size=*/sizeof(int)); + EXPECT_THAT(persistent_hash_map_or, + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(persistent_hash_map_or.status().error_message(), + HasSubstr("Invalid all crc for PersistentHashMap")); + } +} + +TEST_F(PersistentHashMapTest, + InitializeExistingFilesWithCorruptedInfoShouldFail) { + { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int))); + ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data())); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); + } + + const std::string metadata_file_path = + absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/", + PersistentHashMap::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_TRUE(metadata_sfd.is_valid()); + + Info info; + ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info), + Info::kFileOffset)); + + // Modify info, but don't update the checksum. This would be similar to + // corruption of info. + info.num_deleted_entries += kCorruptedValueOffset; + ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Info::kFileOffset, &info, + sizeof(Info))); + { + // Attempt to create the persistent hash map with info that doesn't match + // its checksum and confirm that it fails. + libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> + persistent_hash_map_or = PersistentHashMap::Create( + filesystem_, base_dir_, /*value_type_size=*/sizeof(int)); + EXPECT_THAT(persistent_hash_map_or, + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(persistent_hash_map_or.status().error_message(), + HasSubstr("Invalid info crc for PersistentHashMap")); + } +} + +TEST_F(PersistentHashMapTest, + InitializeExistingFilesWithWrongBucketStorageCrc) { + { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int))); + ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data())); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); + } + + const std::string metadata_file_path = + absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/", + PersistentHashMap::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_TRUE(metadata_sfd.is_valid()); + + Crcs crcs; + ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs), + Crcs::kFileOffset)); + + // Manually corrupt bucket_storage_crc + crcs.component_crcs.bucket_storage_crc += kCorruptedValueOffset; + crcs.all_crc = Crc32(std::string_view( + reinterpret_cast<const char*>(&crcs.component_crcs), + sizeof(Crcs::ComponentCrcs))) + .Get(); + ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs, + sizeof(Crcs))); + { + // Attempt to create the persistent hash map with metadata containing + // corrupted bucket_storage_crc. This should fail. + libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> + persistent_hash_map_or = PersistentHashMap::Create( + filesystem_, base_dir_, /*value_type_size=*/sizeof(int)); + EXPECT_THAT(persistent_hash_map_or, + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT( + persistent_hash_map_or.status().error_message(), + HasSubstr("Mismatch crc with PersistentHashMap bucket storage")); + } +} + +TEST_F(PersistentHashMapTest, InitializeExistingFilesWithWrongEntryStorageCrc) { + { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int))); + ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data())); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); + } + + const std::string metadata_file_path = + absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/", + PersistentHashMap::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_TRUE(metadata_sfd.is_valid()); + + Crcs crcs; + ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs), + Crcs::kFileOffset)); + + // Manually corrupt entry_storage_crc + crcs.component_crcs.entry_storage_crc += kCorruptedValueOffset; + crcs.all_crc = Crc32(std::string_view( + reinterpret_cast<const char*>(&crcs.component_crcs), + sizeof(Crcs::ComponentCrcs))) + .Get(); + ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs, + sizeof(Crcs))); + { + // Attempt to create the persistent hash map with metadata containing + // corrupted entry_storage_crc. This should fail. + libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> + persistent_hash_map_or = PersistentHashMap::Create( + filesystem_, base_dir_, /*value_type_size=*/sizeof(int)); + EXPECT_THAT(persistent_hash_map_or, + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(persistent_hash_map_or.status().error_message(), + HasSubstr("Mismatch crc with PersistentHashMap entry storage")); + } +} + +TEST_F(PersistentHashMapTest, + InitializeExistingFilesWithWrongKeyValueStorageCrc) { + { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int))); + ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data())); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); + } + + const std::string metadata_file_path = + absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/", + PersistentHashMap::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_TRUE(metadata_sfd.is_valid()); + + Crcs crcs; + ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs), + Crcs::kFileOffset)); + + // Manually corrupt kv_storage_crc + crcs.component_crcs.kv_storage_crc += kCorruptedValueOffset; + crcs.all_crc = Crc32(std::string_view( + reinterpret_cast<const char*>(&crcs.component_crcs), + sizeof(Crcs::ComponentCrcs))) + .Get(); + ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs, + sizeof(Crcs))); + { + // Attempt to create the persistent hash map with metadata containing + // corrupted kv_storage_crc. This should fail. + libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> + persistent_hash_map_or = PersistentHashMap::Create( + filesystem_, base_dir_, /*value_type_size=*/sizeof(int)); + EXPECT_THAT(persistent_hash_map_or, + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT( + persistent_hash_map_or.status().error_message(), + HasSubstr("Mismatch crc with PersistentHashMap key value storage")); + } +} + +TEST_F(PersistentHashMapTest, + InitializeExistingFilesAllowDifferentMaxLoadFactorPercent) { + { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int))); + ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data())); + ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data())); + + ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(2))); + ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1)); + ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2)); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); + } + + int32_t new_max_load_factor_percent = 100; + { + ASSERT_THAT(new_max_load_factor_percent, + Not(Eq(PersistentHashMap::kDefaultMaxLoadFactorPercent))); + // Attempt to create the persistent hash map with different max load factor + // percent. This should succeed and metadata should be modified correctly. + // Also verify all entries should remain unchanged. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + new_max_load_factor_percent)); + + EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(2))); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1)); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2)); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); + } + + const std::string metadata_file_path = + absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/", + PersistentHashMap::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_TRUE(metadata_sfd.is_valid()); + + Info info; + ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info), + Info::kFileOffset)); + EXPECT_THAT(info.max_load_factor_percent, Eq(new_max_load_factor_percent)); + + // Also should update crcs correctly. We test it by creating instance again + // and make sure it won't get corrupted crcs/info errors. + { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + new_max_load_factor_percent)); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); + } +} + +TEST_F(PersistentHashMapTest, PutAndGet) { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int))); + + EXPECT_THAT(persistent_hash_map, Pointee(IsEmpty())); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-youtube.com"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + ICING_EXPECT_OK( + persistent_hash_map->Put("default-google.com", Serialize(100).data())); + ICING_EXPECT_OK( + persistent_hash_map->Put("default-youtube.com", Serialize(50).data())); + + EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(2))); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"), + IsOkAndHolds(100)); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-youtube.com"), + IsOkAndHolds(50)); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "key-not-exist"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); +} + +TEST_F(PersistentHashMapTest, PutShouldOverwriteValueIfKeyExists) { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int))); + + ICING_ASSERT_OK( + persistent_hash_map->Put("default-google.com", Serialize(100).data())); + ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(1))); + ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"), + IsOkAndHolds(100)); + + ICING_EXPECT_OK( + persistent_hash_map->Put("default-google.com", Serialize(200).data())); + EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1))); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"), + IsOkAndHolds(200)); + + ICING_EXPECT_OK( + persistent_hash_map->Put("default-google.com", Serialize(300).data())); + EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1))); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"), + IsOkAndHolds(300)); +} + +TEST_F(PersistentHashMapTest, GetOrPutShouldPutIfKeyDoesNotExist) { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int))); + + ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + int val = 1; + EXPECT_THAT(persistent_hash_map->GetOrPut("default-google.com", &val), + IsOk()); + EXPECT_THAT(val, Eq(1)); + EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1))); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"), + IsOkAndHolds(1)); +} + +TEST_F(PersistentHashMapTest, GetOrPutShouldGetIfKeyExists) { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int))); + + ASSERT_THAT( + persistent_hash_map->Put("default-google.com", Serialize(1).data()), + IsOk()); + ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"), + IsOkAndHolds(1)); + + int val = 2; + EXPECT_THAT(persistent_hash_map->GetOrPut("default-google.com", &val), + IsOk()); + EXPECT_THAT(val, Eq(1)); + EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1))); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"), + IsOkAndHolds(1)); +} + +TEST_F(PersistentHashMapTest, ShouldFailIfKeyContainsTerminationCharacter) { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int))); + + const char invalid_key[] = "a\0bc"; + std::string_view invalid_key_view(invalid_key, 4); + + int val = 1; + EXPECT_THAT(persistent_hash_map->Put(invalid_key_view, &val), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(persistent_hash_map->GetOrPut(invalid_key_view, &val), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(persistent_hash_map->Get(invalid_key_view, &val), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +} // namespace + +} // namespace lib +} // namespace icing |