author     Tim Barron <tjbarron@google.com>    2022-08-11 17:05:22 -0700
committer  Tim Barron <tjbarron@google.com>    2022-08-11 17:05:22 -0700
commit     87267cbc5531600072a283ba0c9500c3fcac87af (patch)
tree       2ec5c19afb1f4d5ee229d0619c25b2f2819ccea0 /icing/file
parent     7c93c404e1fb4ed5e35326245ebc820ed774c6b2 (diff)
download   icing-87267cbc5531600072a283ba0c9500c3fcac87af.tar.gz
Sync from upstream.
Descriptions:
======================================================================
Implement new version of ResultState and ResultStateManager to 1) enforce a page byte size limit and 2) improve handling of pagination when we encounter deleted documents.
======================================================================
Fix bugs in IcingDynamicTrie::Delete.
======================================================================
Implement IcingDynamicTrie::IsBranchingTerm.
======================================================================
Change Icing default logging level to INFO
======================================================================
Refactor KeyMapper class to be an interface.
======================================================================
Improve NamespaceChecker logic to improve Suggest latency.
======================================================================
Change icing native log tag to "AppSearchIcing"
======================================================================
Implement Index Compaction rather than rebuilding index during Compaction.
======================================================================
Implement reverse iterator for IcingDynamicTrie
======================================================================
Avoid adding unnecessary branch points during index compaction
======================================================================
Invalidate expired result states when adding to/retrieving from ResultStateManager.
======================================================================
Add new methods (MutableView, MutableArrayView, Append, Allocate) to FileBackedVector
======================================================================
Create and implement PersistentHashMap class.
======================================================================
Implement RFC822 Tokenizer
======================================================================
Remove uses of StringPrintf in ICING_LOG statements
======================================================================
Properly set query latency when an error is encountered or results are empty.
======================================================================

Bug: 146903474
Bug: 152934343
Bug: 193919210
Bug: 193453081
Bug: 231368517
Bug: 235395538
Bug: 236412165
Change-Id: I8aa278cebb12b25b39deb0ef584c0f198952659d
Diffstat (limited to 'icing/file')
-rw-r--r--  icing/file/file-backed-bitmap.cc          3
-rw-r--r--  icing/file/file-backed-vector.h         496
-rw-r--r--  icing/file/file-backed-vector_test.cc   579
-rw-r--r--  icing/file/filesystem.cc                 97
-rw-r--r--  icing/file/persistent-hash-map.cc       534
-rw-r--r--  icing/file/persistent-hash-map.h        383
-rw-r--r--  icing/file/persistent-hash-map_test.cc  662
7 files changed, 2558 insertions, 196 deletions
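
For orientation before reading the diff, the FileBackedVector surface added or extended by this change is summarized below at declaration level. This is a reading aid paraphrased from the file-backed-vector.h hunks that follow, not part of the diff itself; the doc comments in the header are authoritative.

template <typename T>
class FileBackedVector {
 public:
  class MutableView;       // single-element mutable accessor; marks its index dirty on write
  class MutableArrayView;  // contiguous mutable range; marks written indices dirty

  // Create() gains an optional max_file_size (default kMaxFileSize = 2^31-1 bytes).
  static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
  Create(const Filesystem& filesystem, const std::string& file_path,
         MemoryMappedFile::Strategy mmap_strategy,
         int32_t max_file_size = kMaxFileSize);

  libtextclassifier3::StatusOr<MutableView> GetMutable(int32_t idx);
  libtextclassifier3::StatusOr<MutableArrayView> GetMutable(int32_t idx, int32_t len);
  libtextclassifier3::Status Append(const T& value);
  libtextclassifier3::StatusOr<MutableArrayView> Allocate(int32_t len);
  void SetDirty(int32_t idx);
};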
diff --git a/icing/file/file-backed-bitmap.cc b/icing/file/file-backed-bitmap.cc
index eec7668..a8231e3 100644
--- a/icing/file/file-backed-bitmap.cc
+++ b/icing/file/file-backed-bitmap.cc
@@ -269,8 +269,7 @@ libtextclassifier3::Status FileBackedBitmap::GrowTo(int new_num_bits) {
return status;
}
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Grew file %s to new size %zd", file_path_.c_str(), new_file_size);
+ ICING_VLOG(1) << "Grew file " << file_path_ << " to new size " << new_file_size;
mutable_header()->state = Header::ChecksumState::kStale;
return libtextclassifier3::Status::OK;
}
diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h
index 7e42e32..bcfbbdd 100644
--- a/icing/file/file-backed-vector.h
+++ b/icing/file/file-backed-vector.h
@@ -58,8 +58,12 @@
#include <sys/mman.h>
+#include <algorithm>
#include <cinttypes>
#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <limits>
#include <memory>
#include <string>
#include <utility>
@@ -83,6 +87,9 @@ namespace lib {
template <typename T>
class FileBackedVector {
public:
+ class MutableArrayView;
+ class MutableView;
+
// Header stored at the beginning of the file before the rest of the vector
// elements. Stores metadata on the vector.
struct Header {
@@ -133,15 +140,24 @@ class FileBackedVector {
kHeaderChecksumOffset,
"");
- Crc32 crc;
- std::string_view header_str(
- reinterpret_cast<const char*>(this),
- offsetof(FileBackedVector::Header, header_checksum));
- crc.Append(header_str);
- return crc.Get();
+ return Crc32(std::string_view(
+ reinterpret_cast<const char*>(this),
+ offsetof(FileBackedVector::Header, header_checksum)))
+ .Get();
}
};
+ // Absolute max file size for FileBackedVector. Note that Android has a
+ // (2^31-1)-byte single file size limit, so kMaxFileSize is 2^31-1.
+ static constexpr int32_t kMaxFileSize =
+ std::numeric_limits<int32_t>::max(); // 2^31-1 bytes, ~2.1 GB
+
+ // Size of element type T. The value is the same as sizeof(T), but we should
+ // avoid using sizeof(T) directly in our codebase to prevent unexpected
+ // unsigned integer casting.
+ static constexpr int32_t kElementTypeSize = static_cast<int32_t>(sizeof(T));
+ static_assert(sizeof(T) <= (1 << 10));
+
// Creates a new FileBackedVector to read/write content to.
//
// filesystem: Object to make system level calls
@@ -149,15 +165,20 @@ class FileBackedVector {
// within a directory that already exists.
// mmap_strategy : Strategy/optimizations to access the content in the vector,
// see MemoryMappedFile::Strategy for more details
+ // max_file_size: Maximum file size for FileBackedVector, default
+ // kMaxFileSize. See max_file_size_ and kMaxFileSize for more
+ // details.
//
// Return:
// FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
// checksum.
// INTERNAL_ERROR on I/O errors.
+ // INVALID_ARGUMENT_ERROR if max_file_size is incorrect.
// UNIMPLEMENTED_ERROR if created with strategy READ_WRITE_MANUAL_SYNC.
static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
Create(const Filesystem& filesystem, const std::string& file_path,
- MemoryMappedFile::Strategy mmap_strategy);
+ MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size = kMaxFileSize);
// Deletes the FileBackedVector
//
@@ -184,13 +205,13 @@ class FileBackedVector {
// referencing the now-invalidated region.
//
// Returns:
- // OUT_OF_RANGE_ERROR if idx < 0 or > num_elements()
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements()
libtextclassifier3::StatusOr<T> GetCopy(int32_t idx) const;
- // Gets a pointer to the element at idx.
+ // Gets an immutable pointer to the element at idx.
//
- // WARNING: Subsequent calls to Set may invalidate the pointer returned by
- // Get.
+ // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the pointer
+ // returned by Get.
//
// This is useful if you do not think the FileBackedVector will grow before
// you need to reference this value, and you want to avoid a copy. When the
@@ -198,27 +219,102 @@ class FileBackedVector {
// which will invalidate this pointer to the previously mapped region.
//
// Returns:
- // OUT_OF_RANGE_ERROR if idx < 0 or > num_elements()
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements()
libtextclassifier3::StatusOr<const T*> Get(int32_t idx) const;
+ // Gets a MutableView to the element at idx.
+ //
+ // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the
+ // reference returned by MutableView::Get().
+ //
+ // This is useful if you do not think the FileBackedVector will grow before
+ // you need to reference this value, and you want to mutate the underlying
+ // data directly. When the FileBackedVector grows, the underlying mmap will be
+ // unmapped and remapped, which will invalidate this MutableView to the
+ // previously mapped region.
+ //
+ // Returns:
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements()
+ libtextclassifier3::StatusOr<MutableView> GetMutable(int32_t idx);
+
+ // Gets a MutableArrayView to the elements at range [idx, idx + len).
+ //
+ // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the
+ // reference/pointer returned by MutableArrayView::operator[]/data().
+ //
+ // This is useful if you do not think the FileBackedVector will grow before
+ // you need to reference this value, and you want to mutate the underlying
+ // data directly. When the FileBackedVector grows, the underlying mmap will be
+ // unmapped and remapped, which will invalidate this MutableArrayView to the
+ // previously mapped region.
+ //
+ // Returns:
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx + len > num_elements()
+ libtextclassifier3::StatusOr<MutableArrayView> GetMutable(int32_t idx,
+ int32_t len);
+
// Writes the value at idx.
//
// May grow the underlying file and mmapped region as needed to fit the new
- // value. If it does grow, then any pointers to previous values returned
- // from Get() may be invalidated.
+ // value. If it does grow, then any pointers/references to previous values
+ // returned from Get/GetMutable/Allocate may be invalidated.
//
// Returns:
- // OUT_OF_RANGE_ERROR if idx < 0 or file cannot be grown idx size
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx > kMaxIndex, or if the file cannot
+ // be grown to fit idx + 1 elements
libtextclassifier3::Status Set(int32_t idx, const T& value);
+ // Appends the value to the end of the vector.
+ //
+ // May grow the underlying file and mmapped region as needed to fit the new
+ // value. If it does grow, then any pointers/references to previous values
+ // returned from Get/GetMutable/Allocate may be invalidated.
+ //
+ // Returns:
+ // OUT_OF_RANGE_ERROR if the file cannot be grown (i.e. max_file_size_ is reached)
+ libtextclassifier3::Status Append(const T& value) {
+ return Set(header_->num_elements, value);
+ }
+
+ // Allocates space for the given number of elements at the end of the vector
+ // and returns a MutableArrayView to that space.
+ //
+ // May grow the underlying file and mmapped region as needed to fit the new
+ // value. If it does grow, then any pointers/references to previous values
+ // returned from Get/GetMutable/Allocate may be invalidated.
+ //
+ // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the
+ // reference/pointer returned by MutableArrayView::operator[]/data().
+ //
+ // This is useful if you do not think the FileBackedVector will grow before
+ // you need to reference this value, and you want to allocate adjacent spaces
+ // for multiple elements and mutate the underlying data directly. When the
+ // FileBackedVector grows, the underlying mmap will be unmapped and remapped,
+ // which will invalidate this MutableArrayView to the previously mapped
+ // region.
+ //
+ // Returns:
+ // OUT_OF_RANGE_ERROR if len <= 0 or the file cannot be grown (i.e.
+ // max_file_size_ is reached)
+ libtextclassifier3::StatusOr<MutableArrayView> Allocate(int32_t len);
+
// Resizes to first len elements. The crc is cleared on truncation and will be
// updated on destruction, or once the client calls ComputeChecksum() or
// PersistToDisk().
//
// Returns:
- // OUT_OF_RANGE_ERROR if len < 0 or >= num_elements()
+ // OUT_OF_RANGE_ERROR if len < 0 or len >= num_elements()
libtextclassifier3::Status TruncateTo(int32_t new_num_elements);
+ // Marks idx as changed iff idx < changes_end_, so that a later
+ // ComputeChecksum() can update the checksum from the cached changes instead
+ // of rescanning [0, changes_end_).
+ //
+ // If the cached changes grow past the limit controlled by
+ // kPartialCrcLimitDiv, then all change buffers are cleared and changes_end_
+ // is set to 0, indicating that the checksum should be recomputed from idx 0
+ // (i.e. from the beginning). Otherwise the change is cached.
+ void SetDirty(int32_t idx);
+
// Flushes content to underlying file.
//
// Returns:
@@ -248,10 +344,6 @@ class FileBackedVector {
return reinterpret_cast<const T*>(mmapped_file_->region());
}
- T* mutable_array() const {
- return reinterpret_cast<T*>(mmapped_file_->mutable_region());
- }
-
int32_t num_elements() const { return header_->num_elements; }
// Updates checksum of the vector contents and returns it.
@@ -260,6 +352,66 @@ class FileBackedVector {
// INTERNAL_ERROR if the vector's internal state is inconsistent
libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+ public:
+ class MutableArrayView {
+ public:
+ const T& operator[](int32_t idx) const { return data_[idx]; }
+ T& operator[](int32_t idx) {
+ SetDirty(idx);
+ return data_[idx];
+ }
+
+ const T* data() const { return data_; }
+
+ int32_t size() const { return len_; }
+
+ // Sets the mutable array slice (starting at idx) from the given element array.
+ // It handles SetDirty properly for the file-backed-vector when modifying
+ // elements.
+ //
+ // REQUIRES: arr is valid && arr_len >= 0 && idx + arr_len <= size(),
+ // otherwise the behavior is undefined.
+ void SetArray(int32_t idx, const T* arr, int32_t arr_len) {
+ for (int32_t i = 0; i < arr_len; ++i) {
+ SetDirty(idx + i);
+ data_[idx + i] = arr[i];
+ }
+ }
+
+ private:
+ MutableArrayView(FileBackedVector<T>* vector, T* data, int32_t len)
+ : vector_(vector),
+ data_(data),
+ original_idx_(data - vector->array()),
+ len_(len) {}
+
+ void SetDirty(int32_t idx) { vector_->SetDirty(original_idx_ + idx); }
+
+ // Does not own. For SetDirty only.
+ FileBackedVector<T>* vector_;
+
+ // data_ points at vector_->mutable_array()[original_idx_]
+ T* data_;
+ int32_t original_idx_;
+ int32_t len_;
+
+ friend class FileBackedVector;
+ };
+
+ class MutableView {
+ public:
+ const T& Get() const { return mutable_array_view_[0]; }
+ T& Get() { return mutable_array_view_[0]; }
+
+ private:
+ MutableView(FileBackedVector<T>* vector, T* data)
+ : mutable_array_view_(vector, data, 1) {}
+
+ MutableArrayView mutable_array_view_;
+
+ friend class FileBackedVector;
+ };
+
private:
// We track partial updates to the array for crc updating. This
// requires extra memory to keep track of original buffers but
@@ -271,24 +423,33 @@ class FileBackedVector {
// Grow file by at least this many elements if array is growable.
static constexpr int64_t kGrowElements = 1u << 14; // 16K
- // Max number of elements that can be held by the vector.
- static constexpr int64_t kMaxNumElements = 1u << 20; // 1M
+ // Absolute max # of elements allowed. Since we are using int32_t to store
+ // num_elements, the max value is 2^31-1. Still, the actual max # of elements
+ // is determined by max_file_size, kElementTypeSize, and Header::kHeaderSize.
+ static constexpr int32_t kMaxNumElements =
+ std::numeric_limits<int32_t>::max();
+
+ // Absolute max index allowed.
+ static constexpr int32_t kMaxIndex = kMaxNumElements - 1;
// Can only be created through the factory ::Create function
FileBackedVector(const Filesystem& filesystem, const std::string& file_path,
std::unique_ptr<Header> header,
- std::unique_ptr<MemoryMappedFile> mmapped_file);
+ std::unique_ptr<MemoryMappedFile> mmapped_file,
+ int32_t max_file_size);
// Initialize a new FileBackedVector, and create the file.
static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
InitializeNewFile(const Filesystem& filesystem, const std::string& file_path,
- ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy);
+ ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size);
// Initialize a FileBackedVector from an existing file.
static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
InitializeExistingFile(const Filesystem& filesystem,
const std::string& file_path, ScopedFd fd,
- MemoryMappedFile::Strategy mmap_strategy);
+ MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size);
// Grows the underlying file to hold at least num_elements
//
@@ -296,6 +457,10 @@ class FileBackedVector {
// OUT_OF_RANGE_ERROR if we can't grow to the specified size
libtextclassifier3::Status GrowIfNecessary(int32_t num_elements);
+ T* mutable_array() const {
+ return reinterpret_cast<T*>(mmapped_file_->mutable_region());
+ }
+
// Cached constructor params.
const Filesystem* const filesystem_;
const std::string file_path_;
@@ -314,25 +479,42 @@ class FileBackedVector {
// update. Will be cleared if the size grows too big.
std::string saved_original_buffer_;
- // Keep track of all pages we touched so we can write them back to
- // disk.
- std::vector<bool> dirty_pages_;
+ // Max file size for FileBackedVector, default kMaxFileSize. Note that this
+ // value won't be written into the header, so the maximum file size is always
+ // specified at runtime and the caller should make sure its value is correct
+ // and reasonable. Note that the file size includes the header and the elements.
+ //
+ // The range should be in
+ // [Header::kHeaderSize + kElementTypeSize, kMaxFileSize], and
+ // (max_file_size_ - Header::kHeaderSize) / kElementTypeSize is max # of
+ // elements that can be stored.
+ int32_t max_file_size_;
};
template <typename T>
+constexpr int32_t FileBackedVector<T>::kMaxFileSize;
+
+template <typename T>
+constexpr int32_t FileBackedVector<T>::kElementTypeSize;
+
+template <typename T>
constexpr int32_t FileBackedVector<T>::kPartialCrcLimitDiv;
template <typename T>
constexpr int64_t FileBackedVector<T>::kGrowElements;
template <typename T>
-constexpr int64_t FileBackedVector<T>::kMaxNumElements;
+constexpr int32_t FileBackedVector<T>::kMaxNumElements;
+
+template <typename T>
+constexpr int32_t FileBackedVector<T>::kMaxIndex;
template <typename T>
libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
FileBackedVector<T>::Create(const Filesystem& filesystem,
const std::string& file_path,
- MemoryMappedFile::Strategy mmap_strategy) {
+ MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size) {
if (mmap_strategy == MemoryMappedFile::Strategy::READ_WRITE_MANUAL_SYNC) {
// FileBackedVector's behavior of growing the file underneath the mmap is
// inherently broken with MAP_PRIVATE. Growing the vector requires extending
@@ -345,6 +527,14 @@ FileBackedVector<T>::Create(const Filesystem& filesystem,
"mmap strategy.");
}
+ if (max_file_size < Header::kHeaderSize + kElementTypeSize ||
+ max_file_size > kMaxFileSize) {
+ // FileBackedVector should be able to store at least 1 element, so
+ // max_file_size should be at least Header::kHeaderSize + kElementTypeSize.
+ return absl_ports::InvalidArgumentError(
+ "Invalid max file size for FileBackedVector");
+ }
+
ScopedFd fd(filesystem.OpenForWrite(file_path.c_str()));
if (!fd.is_valid()) {
return absl_ports::InternalError(
@@ -357,31 +547,38 @@ FileBackedVector<T>::Create(const Filesystem& filesystem,
absl_ports::StrCat("Bad file size for file ", file_path));
}
+ if (max_file_size < file_size) {
+ return absl_ports::InvalidArgumentError(
+ "Max file size should not be smaller than the existing file size");
+ }
+
const bool new_file = file_size == 0;
if (new_file) {
return InitializeNewFile(filesystem, file_path, std::move(fd),
- mmap_strategy);
+ mmap_strategy, max_file_size);
}
return InitializeExistingFile(filesystem, file_path, std::move(fd),
- mmap_strategy);
+ mmap_strategy, max_file_size);
}
template <typename T>
libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
-FileBackedVector<T>::InitializeNewFile(
- const Filesystem& filesystem, const std::string& file_path, ScopedFd fd,
- MemoryMappedFile::Strategy mmap_strategy) {
+FileBackedVector<T>::InitializeNewFile(const Filesystem& filesystem,
+ const std::string& file_path,
+ ScopedFd fd,
+ MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size) {
// Create header.
auto header = std::make_unique<Header>();
header->magic = FileBackedVector<T>::Header::kMagic;
- header->element_size = sizeof(T);
+ header->element_size = kElementTypeSize;
header->header_checksum = header->CalculateHeaderChecksum();
// We use Write() here, instead of writing through the mmapped region
// created below, so we can gracefully handle errors that occur when the
// disk is full. See b/77309668 for details.
if (!filesystem.PWrite(fd.get(), /*offset=*/0, header.get(),
- sizeof(Header))) {
+ Header::kHeaderSize)) {
return absl_ports::InternalError("Failed to write header");
}
@@ -393,23 +590,30 @@ FileBackedVector<T>::InitializeNewFile(
auto mmapped_file =
std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy);
- return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>(
- filesystem, file_path, std::move(header), std::move(mmapped_file)));
+ return std::unique_ptr<FileBackedVector<T>>(
+ new FileBackedVector<T>(filesystem, file_path, std::move(header),
+ std::move(mmapped_file), max_file_size));
}
template <typename T>
libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
FileBackedVector<T>::InitializeExistingFile(
const Filesystem& filesystem, const std::string& file_path,
- const ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy) {
+ const ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size) {
int64_t file_size = filesystem.GetFileSize(file_path.c_str());
- if (file_size < sizeof(FileBackedVector<T>::Header)) {
+ if (file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Bad file size for file ", file_path));
+ }
+
+ if (file_size < Header::kHeaderSize) {
return absl_ports::InternalError(
absl_ports::StrCat("File header too short for ", file_path));
}
auto header = std::make_unique<Header>();
- if (!filesystem.PRead(fd.get(), header.get(), sizeof(Header),
+ if (!filesystem.PRead(fd.get(), header.get(), Header::kHeaderSize,
/*offset=*/0)) {
return absl_ports::InternalError(
absl_ports::StrCat("Failed to read header of ", file_path));
@@ -429,13 +633,15 @@ FileBackedVector<T>::InitializeExistingFile(
absl_ports::StrCat("Invalid header crc for ", file_path));
}
- if (header->element_size != sizeof(T)) {
+ if (header->element_size != kElementTypeSize) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
- "Inconsistent element size, expected %zd, actual %d", sizeof(T),
+ "Inconsistent element size, expected %d, actual %d", kElementTypeSize,
header->element_size));
}
- int64_t min_file_size = header->num_elements * sizeof(T) + sizeof(Header);
+ int64_t min_file_size =
+ static_cast<int64_t>(header->num_elements) * kElementTypeSize +
+ Header::kHeaderSize;
if (min_file_size > file_size) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
"Inconsistent file size, expected %" PRId64 ", actual %" PRId64,
@@ -446,23 +652,22 @@ FileBackedVector<T>::InitializeExistingFile(
// access elements from the mmapped region
auto mmapped_file =
std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy);
- ICING_RETURN_IF_ERROR(
- mmapped_file->Remap(sizeof(Header), file_size - sizeof(Header)));
+ ICING_RETURN_IF_ERROR(mmapped_file->Remap(Header::kHeaderSize,
+ file_size - Header::kHeaderSize));
// Check vector contents
- Crc32 vector_checksum;
- std::string_view vector_contents(
- reinterpret_cast<const char*>(mmapped_file->region()),
- header->num_elements * sizeof(T));
- vector_checksum.Append(vector_contents);
+ Crc32 vector_checksum(
+ std::string_view(reinterpret_cast<const char*>(mmapped_file->region()),
+ header->num_elements * kElementTypeSize));
if (vector_checksum.Get() != header->vector_checksum) {
return absl_ports::FailedPreconditionError(
absl_ports::StrCat("Invalid vector contents for ", file_path));
}
- return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>(
- filesystem, file_path, std::move(header), std::move(mmapped_file)));
+ return std::unique_ptr<FileBackedVector<T>>(
+ new FileBackedVector<T>(filesystem, file_path, std::move(header),
+ std::move(mmapped_file), max_file_size));
}
template <typename T>
@@ -479,12 +684,13 @@ template <typename T>
FileBackedVector<T>::FileBackedVector(
const Filesystem& filesystem, const std::string& file_path,
std::unique_ptr<Header> header,
- std::unique_ptr<MemoryMappedFile> mmapped_file)
+ std::unique_ptr<MemoryMappedFile> mmapped_file, int32_t max_file_size)
: filesystem_(&filesystem),
file_path_(file_path),
header_(std::move(header)),
mmapped_file_(std::move(mmapped_file)),
- changes_end_(header_->num_elements) {}
+ changes_end_(header_->num_elements),
+ max_file_size_(max_file_size) {}
template <typename T>
FileBackedVector<T>::~FileBackedVector() {
@@ -523,6 +729,40 @@ libtextclassifier3::StatusOr<const T*> FileBackedVector<T>::Get(
}
template <typename T>
+libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableView>
+FileBackedVector<T>::GetMutable(int32_t idx) {
+ if (idx < 0) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
+ }
+
+ if (idx >= header_->num_elements) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Index, %d, was greater than vector size, %d", idx,
+ header_->num_elements));
+ }
+
+ return MutableView(this, &mutable_array()[idx]);
+}
+
+template <typename T>
+libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableArrayView>
+FileBackedVector<T>::GetMutable(int32_t idx, int32_t len) {
+ if (idx < 0) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
+ }
+
+ if (idx > header_->num_elements - len) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Index with len, %d %d, was greater than vector size, %d", idx, len,
+ header_->num_elements));
+ }
+
+ return MutableArrayView(this, &mutable_array()[idx], len);
+}
+
+template <typename T>
libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx,
const T& value) {
if (idx < 0) {
@@ -530,6 +770,11 @@ libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx,
IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
}
+ if (idx > kMaxIndex) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Index, %d, was greater than max index allowed, %d", idx, kMaxIndex));
+ }
+
ICING_RETURN_IF_ERROR(GrowIfNecessary(idx + 1));
if (idx + 1 > header_->num_elements) {
@@ -541,36 +786,39 @@ libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx,
return libtextclassifier3::Status::OK;
}
- // Cache original value to update crcs.
- if (idx < changes_end_) {
- // If we exceed kPartialCrcLimitDiv, clear changes_end_ to
- // revert to full CRC.
- if ((saved_original_buffer_.size() + sizeof(T)) *
- FileBackedVector<T>::kPartialCrcLimitDiv >
- changes_end_ * sizeof(T)) {
- ICING_VLOG(2) << "FileBackedVector change tracking limit exceeded";
- changes_.clear();
- saved_original_buffer_.clear();
- changes_end_ = 0;
- header_->vector_checksum = 0;
- } else {
- int32_t start_byte = idx * sizeof(T);
-
- changes_.push_back(idx);
- saved_original_buffer_.append(
- reinterpret_cast<char*>(const_cast<T*>(array())) + start_byte,
- sizeof(T));
- }
- }
+ SetDirty(idx);
mutable_array()[idx] = value;
return libtextclassifier3::Status::OK;
}
template <typename T>
+libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableArrayView>
+FileBackedVector<T>::Allocate(int32_t len) {
+ if (len <= 0) {
+ return absl_ports::OutOfRangeError("Invalid allocate length");
+ }
+
+ if (len > kMaxNumElements - header_->num_elements) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Cannot allocate %d elements", len));
+ }
+
+ // Although header_->num_elements + len doesn't exceed kMaxNumElements, the
+ // actual max # of elements is determined by max_file_size, kElementTypeSize,
+ // and kHeaderSize. Thus, it is still possible to fail to grow the file.
+ ICING_RETURN_IF_ERROR(GrowIfNecessary(header_->num_elements + len));
+
+ int32_t start_idx = header_->num_elements;
+ header_->num_elements += len;
+
+ return MutableArrayView(this, &mutable_array()[start_idx], len);
+}
+
+template <typename T>
libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
int32_t num_elements) {
- if (sizeof(T) == 0) {
+ if (kElementTypeSize == 0) {
// Growing is a no-op
return libtextclassifier3::Status::OK;
}
@@ -579,10 +827,12 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
return libtextclassifier3::Status::OK;
}
- if (num_elements > FileBackedVector<T>::kMaxNumElements) {
+ if (num_elements >
+ (max_file_size_ - Header::kHeaderSize) / kElementTypeSize) {
return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
- "%d exceeds maximum number of elements allowed, %lld", num_elements,
- static_cast<long long>(FileBackedVector<T>::kMaxNumElements)));
+ "%d elements total size exceed maximum bytes of elements allowed, "
+ "%d bytes",
+ num_elements, max_file_size_ - Header::kHeaderSize));
}
int64_t current_file_size = filesystem_->GetFileSize(file_path_.c_str());
@@ -590,7 +840,8 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
return absl_ports::InternalError("Unable to retrieve file size.");
}
- int64_t least_file_size_needed = sizeof(Header) + num_elements * sizeof(T);
+ int32_t least_file_size_needed =
+ Header::kHeaderSize + num_elements * kElementTypeSize; // Won't overflow
if (least_file_size_needed <= current_file_size) {
// Our underlying file can hold the target num_elements cause we've grown
// before
@@ -598,9 +849,13 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
}
// Otherwise, we need to grow. Grow to kGrowElements boundary.
- least_file_size_needed = math_util::RoundUpTo(
- least_file_size_needed,
- int64_t{FileBackedVector<T>::kGrowElements * sizeof(T)});
+ // Note that we need to use int64_t here, since int32_t might overflow after
+ // rounding up.
+ int64_t round_up_file_size_needed = math_util::RoundUpTo(
+ int64_t{least_file_size_needed},
+ int64_t{FileBackedVector<T>::kGrowElements} * kElementTypeSize);
+ least_file_size_needed =
+ std::min(round_up_file_size_needed, int64_t{max_file_size_});
// We use PWrite here rather than Grow because Grow doesn't actually allocate
// an underlying disk block. This can lead to problems with mmap because mmap
@@ -609,20 +864,22 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
// these blocks, which will ensure that any failure to grow will surface here.
int64_t page_size = getpagesize();
auto buf = std::make_unique<uint8_t[]>(page_size);
- int64_t size_to_write = page_size - (current_file_size % page_size);
+ int64_t size_to_write = std::min(page_size - (current_file_size % page_size),
+ max_file_size_ - current_file_size);
ScopedFd sfd(filesystem_->OpenForWrite(file_path_.c_str()));
- while (current_file_size < least_file_size_needed) {
+ while (size_to_write > 0 && current_file_size < least_file_size_needed) {
if (!filesystem_->PWrite(sfd.get(), current_file_size, buf.get(),
size_to_write)) {
return absl_ports::InternalError(
absl_ports::StrCat("Couldn't grow file ", file_path_));
}
current_file_size += size_to_write;
- size_to_write = page_size - (current_file_size % page_size);
+ size_to_write = std::min(page_size - (current_file_size % page_size),
+ max_file_size_ - current_file_size);
}
ICING_RETURN_IF_ERROR(mmapped_file_->Remap(
- sizeof(Header), least_file_size_needed - sizeof(Header)));
+ Header::kHeaderSize, least_file_size_needed - Header::kHeaderSize));
return libtextclassifier3::Status::OK;
}
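
The GrowIfNecessary() hunks above derive capacity from max_file_size_ and clamp the rounded-up growth target to it. A minimal standalone sketch of that round-up-then-clamp step is shown below; RoundUpTo here is a local stand-in for math_util::RoundUpTo and the function names are illustrative only, not the diff's code.

#include <algorithm>
#include <cstdint>

// Stand-in for math_util::RoundUpTo: rounds value up to a multiple of `multiple`.
int64_t RoundUpTo(int64_t value, int64_t multiple) {
  return (value + multiple - 1) / multiple * multiple;
}

// Growth target: the least file size needed, rounded up to a grow-chunk
// boundary in 64-bit arithmetic (so the intermediate value cannot overflow
// int32_t), then clamped to the caller-provided max file size.
int64_t GrowTarget(int64_t least_file_size_needed, int64_t grow_chunk_bytes,
                   int64_t max_file_size) {
  int64_t rounded = RoundUpTo(least_file_size_needed, grow_chunk_bytes);
  return std::min(rounded, max_file_size);
}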
@@ -653,6 +910,31 @@ libtextclassifier3::Status FileBackedVector<T>::TruncateTo(
}
template <typename T>
+void FileBackedVector<T>::SetDirty(int32_t idx) {
+ // Cache original value to update crcs.
+ if (idx >= 0 && idx < changes_end_) {
+ // If we exceed kPartialCrcLimitDiv, clear changes_end_ to
+ // revert to full CRC.
+ if ((saved_original_buffer_.size() + kElementTypeSize) *
+ FileBackedVector<T>::kPartialCrcLimitDiv >
+ changes_end_ * kElementTypeSize) {
+ ICING_VLOG(2) << "FileBackedVector change tracking limit exceeded";
+ changes_.clear();
+ saved_original_buffer_.clear();
+ changes_end_ = 0;
+ header_->vector_checksum = 0;
+ } else {
+ int32_t start_byte = idx * kElementTypeSize;
+
+ changes_.push_back(idx);
+ saved_original_buffer_.append(
+ reinterpret_cast<char*>(const_cast<T*>(array())) + start_byte,
+ kElementTypeSize);
+ }
+ }
+}
+
+template <typename T>
libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
// First apply the modified area. Keep a bitmap of already updated
// regions so we don't double-update.
@@ -663,8 +945,7 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
int num_truncated = 0;
int num_overlapped = 0;
int num_duplicate = 0;
- for (size_t i = 0; i < changes_.size(); i++) {
- const int32_t change_offset = changes_[i];
+ for (const int32_t change_offset : changes_) {
if (change_offset > changes_end_) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
"Failed to update crc, change offset %d, changes_end_ %d",
@@ -678,9 +959,10 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
}
// Turn change buffer into change^original.
- const char* buffer_end = &saved_original_buffer_[cur_offset + sizeof(T)];
- const char* cur_array =
- reinterpret_cast<const char*>(array()) + change_offset * sizeof(T);
+ const char* buffer_end =
+ &saved_original_buffer_[cur_offset + kElementTypeSize];
+ const char* cur_array = reinterpret_cast<const char*>(array()) +
+ change_offset * kElementTypeSize;
// Now xor in. SSE acceleration please?
for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end;
cur++, cur_array++) {
@@ -692,9 +974,9 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
bool overlap = false;
uint32_t cur_element = change_offset;
for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end;
- cur_element++, cur += sizeof(T)) {
+ cur_element++, cur += kElementTypeSize) {
if (updated[cur_element]) {
- memset(cur, 0, sizeof(T));
+ memset(cur, 0, kElementTypeSize);
overlap = true;
} else {
updated[cur_element] = true;
@@ -705,10 +987,11 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
// Apply update to crc.
if (new_update) {
// Explicitly create the string_view with length
- std::string_view xored_str(buffer_end - sizeof(T), sizeof(T));
+ std::string_view xored_str(buffer_end - kElementTypeSize,
+ kElementTypeSize);
if (!cur_crc
- .UpdateWithXor(xored_str, changes_end_ * sizeof(T),
- change_offset * sizeof(T))
+ .UpdateWithXor(xored_str, changes_end_ * kElementTypeSize,
+ change_offset * kElementTypeSize)
.ok()) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
"Failed to update crc, change offset %d, change "
@@ -722,7 +1005,7 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
} else {
num_duplicate++;
}
- cur_offset += sizeof(T);
+ cur_offset += kElementTypeSize;
}
if (!changes_.empty()) {
@@ -735,8 +1018,9 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
if (changes_end_ < header_->num_elements) {
// Explicitly create the string_view with length
std::string_view update_str(
- reinterpret_cast<const char*>(array()) + changes_end_ * sizeof(T),
- (header_->num_elements - changes_end_) * sizeof(T));
+ reinterpret_cast<const char*>(array()) +
+ changes_end_ * kElementTypeSize,
+ (header_->num_elements - changes_end_) * kElementTypeSize);
cur_crc.Append(update_str);
ICING_VLOG(2) << IcingStringUtil::StringPrintf(
"Array update tail crc offset %d -> %d", changes_end_,
@@ -761,7 +1045,7 @@ libtextclassifier3::Status FileBackedVector<T>::PersistToDisk() {
header_->header_checksum = header_->CalculateHeaderChecksum();
if (!filesystem_->PWrite(file_path_.c_str(), /*offset=*/0, header_.get(),
- sizeof(Header))) {
+ Header::kHeaderSize)) {
return absl_ports::InternalError("Failed to sync header");
}
@@ -795,7 +1079,11 @@ libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetElementsFileSize()
return absl_ports::InternalError(
"Failed to get file size of elements in the file-backed vector");
}
- return total_file_size - sizeof(Header);
+ if (total_file_size < Header::kHeaderSize) {
+ return absl_ports::InternalError(
+ "File size should not be smaller than header size");
+ }
+ return total_file_size - Header::kHeaderSize;
}
} // namespace lib
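
Before the test diff, here is a minimal usage sketch of the new API: Create() with a custom max_file_size, Append(), Allocate() with MutableArrayView::SetArray(), and GetMutable() with MutableView. It is illustrative only; it assumes the ICING_ASSIGN_OR_RETURN status macro and the listed header paths are available as in other icing sources, and it elides error handling beyond propagating statuses.

#include <cstdint>
#include <memory>
#include <string>

#include "icing/file/file-backed-vector.h"
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"

namespace icing {
namespace lib {

// ICING_ASSIGN_OR_RETURN is assumed to exist alongside ICING_RETURN_IF_ERROR.
libtextclassifier3::Status UseNewVectorApi(const Filesystem& filesystem,
                                           const std::string& file_path) {
  // Open (or create) a vector whose backing file is capped at ~1 MiB.
  ICING_ASSIGN_OR_RETURN(
      std::unique_ptr<FileBackedVector<int32_t>> vector,
      FileBackedVector<int32_t>::Create(
          filesystem, file_path,
          MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
          /*max_file_size=*/1 << 20));

  // Append single elements at the end.
  ICING_RETURN_IF_ERROR(vector->Append(1));
  ICING_RETURN_IF_ERROR(vector->Append(2));

  // Allocate a contiguous block of 3 elements and fill it in place; SetArray
  // marks the written indices dirty so incremental checksums stay correct.
  ICING_ASSIGN_OR_RETURN(FileBackedVector<int32_t>::MutableArrayView arr,
                         vector->Allocate(3));
  int32_t values[] = {3, 4, 5};
  arr.SetArray(/*idx=*/0, values, /*arr_len=*/3);

  // Mutate one existing element through a MutableView.
  ICING_ASSIGN_OR_RETURN(FileBackedVector<int32_t>::MutableView first,
                         vector->GetMutable(0));
  first.Get() = 42;

  return vector->PersistToDisk();
}

}  // namespace lib
}  // namespace icing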
diff --git a/icing/file/file-backed-vector_test.cc b/icing/file/file-backed-vector_test.cc
index 2f60c6b..60ed887 100644
--- a/icing/file/file-backed-vector_test.cc
+++ b/icing/file/file-backed-vector_test.cc
@@ -19,7 +19,9 @@
#include <algorithm>
#include <cerrno>
#include <cstdint>
+#include <limits>
#include <memory>
+#include <string>
#include <string_view>
#include <vector>
@@ -34,10 +36,14 @@
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
+using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::IsTrue;
+using ::testing::Lt;
+using ::testing::Not;
using ::testing::Pointee;
using ::testing::Return;
+using ::testing::SizeIs;
namespace icing {
namespace lib {
@@ -60,20 +66,30 @@ class FileBackedVectorTest : public testing::Test {
// Helper method to loop over some data and insert into the vector at some idx
template <typename T>
- void Insert(FileBackedVector<T>* vector, int32_t idx, std::string data) {
- for (int i = 0; i < data.length(); ++i) {
+ void Insert(FileBackedVector<T>* vector, int32_t idx,
+ const std::vector<T>& data) {
+ for (int i = 0; i < data.size(); ++i) {
ICING_ASSERT_OK(vector->Set(idx + i, data.at(i)));
}
}
+ void Insert(FileBackedVector<char>* vector, int32_t idx, std::string data) {
+ Insert(vector, idx, std::vector<char>(data.begin(), data.end()));
+ }
+
// Helper method to retrieve data from the beginning of the vector
template <typename T>
- std::string_view Get(FileBackedVector<T>* vector, int32_t expected_len) {
+ std::vector<T> Get(FileBackedVector<T>* vector, int32_t idx,
+ int32_t expected_len) {
+ return std::vector<T>(vector->array() + idx,
+ vector->array() + idx + expected_len);
+ }
+
+ std::string_view Get(FileBackedVector<char>* vector, int32_t expected_len) {
return Get(vector, 0, expected_len);
}
- template <typename T>
- std::string_view Get(FileBackedVector<T>* vector, int32_t idx,
+ std::string_view Get(FileBackedVector<char>* vector, int32_t idx,
int32_t expected_len) {
return std::string_view(vector->array() + idx, expected_len);
}
@@ -103,6 +119,79 @@ TEST_F(FileBackedVectorTest, Create) {
}
}
+TEST_F(FileBackedVectorTest, CreateWithInvalidStrategy) {
+ // Create a vector with unimplemented strategy
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_MANUAL_SYNC),
+ StatusIs(libtextclassifier3::StatusCode::UNIMPLEMENTED));
+}
+
+TEST_F(FileBackedVectorTest, CreateWithCustomMaxFileSize) {
+ int32_t header_size = FileBackedVector<char>::Header::kHeaderSize;
+
+ // Create a vector with invalid max_file_size
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/header_size - 1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/header_size + sizeof(char) - 1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ {
+ // Create a vector with max_file_size that allows only 1 element.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto vector, FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/header_size + sizeof(char) * 1));
+ ICING_ASSERT_OK(vector->Set(0, 'a'));
+ }
+
+ {
+ // We can create it again with larger max_file_size, as long as it is not
+ // greater than kMaxFileSize.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto vector, FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/header_size + sizeof(char) * 2));
+ EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(Eq('a'))));
+ ICING_ASSERT_OK(vector->Set(1, 'b'));
+ }
+
+ // We cannot create it again with max_file_size < current_file_size, even if
+ // it is a valid value.
+ int64_t current_file_size = filesystem_.GetFileSize(file_path_.c_str());
+ ASSERT_THAT(current_file_size, Eq(header_size + sizeof(char) * 2));
+ ASSERT_THAT(current_file_size - 1, Not(Lt(header_size + sizeof(char))));
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/current_file_size - 1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ {
+ // We can create it again with max_file_size == current_file_size.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto vector, FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/current_file_size));
+ EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(Eq('a'))));
+ EXPECT_THAT(vector->Get(1), IsOkAndHolds(Pointee(Eq('b'))));
+ }
+}
+
TEST_F(FileBackedVectorTest, SimpleShared) {
// Create a vector and add some data.
ICING_ASSERT_OK_AND_ASSIGN(
@@ -195,6 +284,373 @@ TEST_F(FileBackedVectorTest, Get) {
StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
}
+TEST_F(FileBackedVectorTest, MutableView) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::string(1000, 'a'));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2620640643U)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FileBackedVector<char>::MutableView mutable_elt,
+ vector->GetMutable(3));
+
+ mutable_elt.Get() = 'b';
+ EXPECT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('b'))));
+
+ mutable_elt.Get() = 'c';
+ EXPECT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('c'))));
+}
+
+TEST_F(FileBackedVectorTest, MutableViewShouldSetDirty) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::string(1000, 'a'));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2620640643U)));
+
+ std::string_view reconstructed_view =
+ std::string_view(vector->array(), vector->num_elements());
+
+ ICING_ASSERT_OK_AND_ASSIGN(FileBackedVector<char>::MutableView mutable_elt,
+ vector->GetMutable(3));
+
+ // Mutate the element via MutableView.
+ // If non-const Get() is called, MutableView should set the element index dirty
+ // so that ComputeChecksum() can pick up the change and compute the checksum
+ // correctly. Validate by mapping another array on top.
+ mutable_elt.Get() = 'b';
+ ASSERT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('b'))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc1, vector->ComputeChecksum());
+ Crc32 full_crc1;
+ full_crc1.Append(reconstructed_view);
+ EXPECT_THAT(crc1, Eq(full_crc1));
+
+ // Mutate and test again.
+ mutable_elt.Get() = 'c';
+ ASSERT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('c'))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc2, vector->ComputeChecksum());
+ Crc32 full_crc2;
+ full_crc2.Append(reconstructed_view);
+ EXPECT_THAT(crc2, Eq(full_crc2));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayView) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ constexpr int kArrayViewOffset = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, /*len=*/3));
+ EXPECT_THAT(mutable_arr, SizeIs(3));
+
+ mutable_arr[0] = 2;
+ mutable_arr[1] = 3;
+ mutable_arr[2] = 4;
+
+ EXPECT_THAT(vector->Get(kArrayViewOffset + 0), IsOkAndHolds(Pointee(Eq(2))));
+ EXPECT_THAT(mutable_arr.data()[0], Eq(2));
+
+ EXPECT_THAT(vector->Get(kArrayViewOffset + 1), IsOkAndHolds(Pointee(Eq(3))));
+ EXPECT_THAT(mutable_arr.data()[1], Eq(3));
+
+ EXPECT_THAT(vector->Get(kArrayViewOffset + 2), IsOkAndHolds(Pointee(Eq(4))));
+ EXPECT_THAT(mutable_arr.data()[2], Eq(4));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayViewSetArray) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ constexpr int kArrayViewOffset = 3;
+ constexpr int kArrayViewLen = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, kArrayViewLen));
+
+ std::vector<int> change1{2, 3, 4};
+ mutable_arr.SetArray(/*idx=*/0, change1.data(), change1.size());
+ EXPECT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen),
+ ElementsAre(2, 3, 4, 1, 1));
+
+ std::vector<int> change2{5, 6};
+ mutable_arr.SetArray(/*idx=*/2, change2.data(), change2.size());
+ EXPECT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen),
+ ElementsAre(2, 3, 5, 6, 1));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayViewSetArrayWithZeroLength) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ constexpr int kArrayViewOffset = 3;
+ constexpr int kArrayViewLen = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, kArrayViewLen));
+
+ // Zero arr_len should work and change nothing
+ std::vector<int> change{2, 3};
+ mutable_arr.SetArray(/*idx=*/0, change.data(), /*arr_len=*/0);
+ EXPECT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen),
+ ElementsAre(1, 1, 1, 1, 1));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayViewIndexOperatorShouldSetDirty) {
+ // Create an array with some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ std::string_view reconstructed_view(
+ reinterpret_cast<const char*>(vector->array()),
+ vector->num_elements() * sizeof(int));
+
+ constexpr int kArrayViewOffset = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, /*len=*/3));
+
+ // Use operator[] to mutate elements
+ // If non-const operator[] is called, MutableArrayView should set the element
+ // index dirty so that ComputeChecksum() can pick up the change and compute the
+ // checksum correctly. Validate by mapping another array on top.
+ mutable_arr[0] = 2;
+ ASSERT_THAT(vector->Get(kArrayViewOffset + 0), IsOkAndHolds(Pointee(Eq(2))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc1, vector->ComputeChecksum());
+ EXPECT_THAT(crc1, Eq(Crc32(reconstructed_view)));
+
+ mutable_arr[1] = 3;
+ ASSERT_THAT(vector->Get(kArrayViewOffset + 1), IsOkAndHolds(Pointee(Eq(3))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc2, vector->ComputeChecksum());
+ EXPECT_THAT(crc2, Eq(Crc32(reconstructed_view)));
+
+ mutable_arr[2] = 4;
+ ASSERT_THAT(vector->Get(kArrayViewOffset + 2), IsOkAndHolds(Pointee(Eq(4))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc3, vector->ComputeChecksum());
+ EXPECT_THAT(crc3, Eq(Crc32(reconstructed_view)));
+
+ // Change the same position. It should set dirty again.
+ mutable_arr[0] = 5;
+ ASSERT_THAT(vector->Get(kArrayViewOffset + 0), IsOkAndHolds(Pointee(Eq(5))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc4, vector->ComputeChecksum());
+ EXPECT_THAT(crc4, Eq(Crc32(reconstructed_view)));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayViewSetArrayShouldSetDirty) {
+ // Create an array with some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ std::string_view reconstructed_view(
+ reinterpret_cast<const char*>(vector->array()),
+ vector->num_elements() * sizeof(int));
+
+ constexpr int kArrayViewOffset = 3;
+ constexpr int kArrayViewLen = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, kArrayViewLen));
+
+ std::vector<int> change{2, 3, 4};
+ mutable_arr.SetArray(/*idx=*/0, change.data(), change.size());
+ ASSERT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen),
+ ElementsAre(2, 3, 4, 1, 1));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc, vector->ComputeChecksum());
+ EXPECT_THAT(crc, Eq(Crc32(reconstructed_view)));
+}
+
+TEST_F(FileBackedVectorTest, Append) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ ICING_EXPECT_OK(vector->Append('a'));
+ EXPECT_THAT(vector->num_elements(), Eq(1));
+ EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(Eq('a'))));
+
+ ICING_EXPECT_OK(vector->Append('b'));
+ EXPECT_THAT(vector->num_elements(), Eq(2));
+ EXPECT_THAT(vector->Get(1), IsOkAndHolds(Pointee(Eq('b'))));
+}
+
+TEST_F(FileBackedVectorTest, AppendAfterSet) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ ICING_ASSERT_OK(vector->Set(9, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(10));
+ ICING_EXPECT_OK(vector->Append('a'));
+ EXPECT_THAT(vector->num_elements(), Eq(11));
+ EXPECT_THAT(vector->Get(10), IsOkAndHolds(Pointee(Eq('a'))));
+}
+
+TEST_F(FileBackedVectorTest, AppendAfterTruncate) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::string(1000, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(1000));
+
+ ICING_ASSERT_OK(vector->TruncateTo(5));
+ ICING_EXPECT_OK(vector->Append('a'));
+ EXPECT_THAT(vector->num_elements(), Eq(6));
+ EXPECT_THAT(vector->Get(5), IsOkAndHolds(Pointee(Eq('a'))));
+}
+
+TEST_F(FileBackedVectorTest, AppendShouldFailIfExceedingMaxFileSize) {
+ int32_t max_file_size = (1 << 10) - 1;
+ int32_t max_num_elements =
+ (max_file_size - FileBackedVector<char>::Header::kHeaderSize) /
+ sizeof(char);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size));
+ ICING_ASSERT_OK(vector->Set(max_num_elements - 1, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(max_num_elements));
+
+ EXPECT_THAT(vector->Append('a'),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_F(FileBackedVectorTest, Allocate) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ typename FileBackedVector<char>::MutableArrayView mutable_arr,
+ vector->Allocate(3));
+ EXPECT_THAT(vector->num_elements(), Eq(3));
+ EXPECT_THAT(mutable_arr, SizeIs(3));
+ std::string change = "abc";
+ mutable_arr.SetArray(/*idx=*/0, /*arr=*/change.data(), /*arr_len=*/3);
+ EXPECT_THAT(Get(vector.get(), /*idx=*/0, /*expected_len=*/3), Eq(change));
+}
+
+TEST_F(FileBackedVectorTest, AllocateAfterSet) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ ICING_ASSERT_OK(vector->Set(9, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(10));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ typename FileBackedVector<char>::MutableArrayView mutable_arr,
+ vector->Allocate(3));
+ EXPECT_THAT(vector->num_elements(), Eq(13));
+ EXPECT_THAT(mutable_arr, SizeIs(3));
+ std::string change = "abc";
+ mutable_arr.SetArray(/*idx=*/0, /*arr=*/change.data(), /*arr_len=*/3);
+ EXPECT_THAT(Get(vector.get(), /*idx=*/10, /*expected_len=*/3), Eq(change));
+}
+
+TEST_F(FileBackedVectorTest, AllocateAfterTruncate) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::string(1000, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(1000));
+
+ ICING_ASSERT_OK(vector->TruncateTo(5));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ typename FileBackedVector<char>::MutableArrayView mutable_arr,
+ vector->Allocate(3));
+ EXPECT_THAT(vector->num_elements(), Eq(8));
+ std::string change = "abc";
+ mutable_arr.SetArray(/*idx=*/0, /*arr=*/change.data(), /*arr_len=*/3);
+ EXPECT_THAT(Get(vector.get(), /*idx=*/5, /*expected_len=*/3), Eq(change));
+}
+
+TEST_F(FileBackedVectorTest, AllocateInvalidLengthShouldFail) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ EXPECT_THAT(vector->Allocate(-1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->num_elements(), Eq(0));
+
+ EXPECT_THAT(vector->Allocate(0),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->num_elements(), Eq(0));
+}
+
+TEST_F(FileBackedVectorTest, AllocateShouldFailIfExceedingMaxFileSize) {
+ int32_t max_file_size = (1 << 10) - 1;
+ int32_t max_num_elements =
+ (max_file_size - FileBackedVector<char>::Header::kHeaderSize) /
+ sizeof(char);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size));
+ ICING_ASSERT_OK(vector->Set(max_num_elements - 3, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(max_num_elements - 2));
+
+ EXPECT_THAT(vector->Allocate(3),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->Allocate(2), IsOk());
+}
+
TEST_F(FileBackedVectorTest, IncrementalCrc_NonOverlappingChanges) {
int num_elements = 1000;
int incremental_size = 3;
@@ -272,29 +728,58 @@ TEST_F(FileBackedVectorTest, IncrementalCrc_OverlappingChanges) {
}
}
+TEST_F(FileBackedVectorTest, SetIntMaxShouldReturnOutOfRangeError) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int32_t>> vector,
+ FileBackedVector<int32_t>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
+
+ // This is an edge case: since Set() calls GrowIfNecessary(idx + 1), we have
+ // to make sure that Set() handles idx == INT32_MAX correctly.
+ EXPECT_THAT(vector->Set(std::numeric_limits<int32_t>::max(), 1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
TEST_F(FileBackedVectorTest, Grow) {
- // This is the same value as FileBackedVector::kMaxNumElts
- constexpr int32_t kMaxNumElts = 1U << 20;
+ int32_t max_file_size = (1 << 20) - 1;
+ int32_t header_size = FileBackedVector<int32_t>::Header::kHeaderSize;
+ int32_t element_type_size = static_cast<int32_t>(sizeof(int32_t));
+
+ // Max file size includes size of the header and elements, so max # of
+ // elements will be (max_file_size - header_size) / element_type_size.
+ //
+ // Also ensure that (max_file_size - header_size) is not a multiple of
+ // element_type_size, in order to test if the desired # of elements is
+ // computed by (math) floor instead of ceil.
+ ASSERT_THAT((max_file_size - header_size) % element_type_size, Not(Eq(0)));
+ int32_t max_num_elements = (max_file_size - header_size) / element_type_size;
ASSERT_TRUE(filesystem_.Truncate(fd_, 0));
- // Create an array and add some data.
+ // Create a vector and add some data.
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<FileBackedVector<char>> vector,
- FileBackedVector<char>::Create(
+ std::unique_ptr<FileBackedVector<int32_t>> vector,
+ FileBackedVector<int32_t>::Create(
filesystem_, file_path_,
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size));
EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
- EXPECT_THAT(vector->Set(kMaxNumElts + 11, 'a'),
+ // max_num_elements is the allowed max # of elements, so the valid index
+ // should be 0 to max_num_elements-1.
+ EXPECT_THAT(vector->Set(max_num_elements, 1),
StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
- EXPECT_THAT(vector->Set(-1, 'a'),
+ EXPECT_THAT(vector->Set(-1, 1),
StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->Set(max_num_elements - 1, 1), IsOk());
- uint32_t start = kMaxNumElts - 13;
- Insert(vector.get(), start, "abcde");
+ int32_t start = max_num_elements - 5;
+ std::vector<int32_t> data{1, 2, 3, 4, 5};
+ Insert(vector.get(), start, data);
// Crc works?
- const Crc32 good_crc(1134899064U);
+ const Crc32 good_crc(650981917U);
EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(good_crc));
// PersistToDisk does nothing bad, and ensures the content is still there
@@ -306,12 +791,12 @@ TEST_F(FileBackedVectorTest, Grow) {
vector.reset();
ICING_ASSERT_OK_AND_ASSIGN(
- vector, FileBackedVector<char>::Create(
- filesystem_, file_path_,
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ vector,
+ FileBackedVector<int32_t>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size));
- std::string expected = "abcde";
- EXPECT_EQ(expected, Get(vector.get(), start, expected.length()));
+ EXPECT_THAT(Get(vector.get(), start, data.size()), Eq(data));
}
TEST_F(FileBackedVectorTest, GrowsInChunks) {
@@ -334,20 +819,20 @@ TEST_F(FileBackedVectorTest, GrowsInChunks) {
// Once we add something though, we'll grow to be kGrowElements big. From this
// point on, file size and disk usage should be the same because Growing will
// explicitly allocate the number of blocks needed to accomodate the file.
- Insert(vector.get(), 0, "a");
- int file_size = kGrowElements * sizeof(int);
+ Insert(vector.get(), 0, {1});
+ int file_size = 1 * kGrowElements * sizeof(int);
EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size));
EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size));
// Should still be the same size, don't need to grow underlying file
- Insert(vector.get(), 1, "b");
+ Insert(vector.get(), 1, {2});
EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size));
EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size));
// Now we grow by a kGrowElements chunk, so the underlying file is 2
// kGrowElements big
- file_size *= 2;
- Insert(vector.get(), 2, std::string(kGrowElements, 'c'));
+ file_size = 2 * kGrowElements * sizeof(int);
+ Insert(vector.get(), 2, std::vector<int>(kGrowElements, 3));
EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size));
EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size));
@@ -476,6 +961,48 @@ TEST_F(FileBackedVectorTest, TruncateAndReReadFile) {
}
}
+TEST_F(FileBackedVectorTest, SetDirty) {
+ // 1. Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), 0, "abcd");
+
+ std::string_view reconstructed_view =
+ std::string_view(vector->array(), vector->num_elements());
+
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc1, vector->ComputeChecksum());
+ Crc32 full_crc_before_overwrite;
+ full_crc_before_overwrite.Append(reconstructed_view);
+ EXPECT_THAT(crc1, Eq(full_crc_before_overwrite));
+
+ // 2. Manually overwrite the values of the first two elements.
+ std::string corrupted_content = "ef";
+ ASSERT_THAT(
+ filesystem_.PWrite(fd_, /*offset=*/sizeof(FileBackedVector<char>::Header),
+ corrupted_content.c_str(), corrupted_content.length()),
+ IsTrue());
+ ASSERT_THAT(Get(vector.get(), 0, 4), Eq("efcd"));
+ Crc32 full_crc_after_overwrite;
+ full_crc_after_overwrite.Append(reconstructed_view);
+ ASSERT_THAT(full_crc_before_overwrite, Not(Eq(full_crc_after_overwrite)));
+
+ // 3. Without calling SetDirty(), the checksum will be recomputed incorrectly.
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc2, vector->ComputeChecksum());
+ EXPECT_THAT(crc2, Not(Eq(full_crc_after_overwrite)));
+
+ // 4. Call SetDirty()
+ vector->SetDirty(0);
+ vector->SetDirty(1);
+
+ // 5. The checksum should be computed correctly after calling SetDirty() with
+  //    the correct indices.
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc3, vector->ComputeChecksum());
+ EXPECT_THAT(crc3, Eq(full_crc_after_overwrite));
+}
+
TEST_F(FileBackedVectorTest, InitFileTooSmallForHeaderFails) {
{
// 1. Create a vector with a few elements.
diff --git a/icing/file/filesystem.cc b/icing/file/filesystem.cc
index 82b8d98..10b77db 100644
--- a/icing/file/filesystem.cc
+++ b/icing/file/filesystem.cc
@@ -63,18 +63,16 @@ void LogOpenFileDescriptors() {
constexpr int kMaxFileDescriptorsToStat = 4096;
struct rlimit rlim = {0, 0};
if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "getrlimit() failed (errno=%d)", errno);
+ ICING_LOG(ERROR) << "getrlimit() failed (errno=" << errno << ")";
return;
}
int fd_lim = rlim.rlim_cur;
if (fd_lim > kMaxFileDescriptorsToStat) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Maximum number of file descriptors (%d) too large.", fd_lim);
+ ICING_LOG(ERROR) << "Maximum number of file descriptors (" << fd_lim
+ << ") too large.";
fd_lim = kMaxFileDescriptorsToStat;
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Listing up to %d file descriptors.", fd_lim);
+ ICING_LOG(ERROR) << "Listing up to " << fd_lim << " file descriptors.";
// Verify that /proc/self/fd is a directory. If not, procfs is not mounted or
// inaccessible for some other reason. In that case, there's no point trying
@@ -96,15 +94,12 @@ void LogOpenFileDescriptors() {
if (len >= 0) {
// Zero-terminate the buffer, because readlink() won't.
target[len < target_size ? len : target_size - 1] = '\0';
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> \"%s\"", fd,
- target);
+ ICING_LOG(ERROR) << "fd " << fd << " -> \"" << target << "\"";
} else if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> ? (errno=%d)",
- fd, errno);
+ ICING_LOG(ERROR) << "fd " << fd << " -> ? (errno=" << errno << ")";
}
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "File descriptor list complete.");
+ ICING_LOG(ERROR) << "File descriptor list complete.";
}
// Logs an error formatted as: desc1 + file_name + desc2 + strerror(errnum).
@@ -113,8 +108,7 @@ void LogOpenFileDescriptors() {
// file descriptors (see LogOpenFileDescriptors() above).
void LogOpenError(const char* desc1, const char* file_name, const char* desc2,
int errnum) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "%s%s%s%s", desc1, file_name, desc2, strerror(errnum));
+ ICING_LOG(ERROR) << desc1 << file_name << desc2 << strerror(errnum);
if (errnum == EMFILE) {
LogOpenFileDescriptors();
}
@@ -155,8 +149,7 @@ bool ListDirectoryInternal(const char* dir_name,
}
}
if (closedir(dir) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Error closing %s: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Error closing " << dir_name << " " << strerror(errno);
}
return true;
}
@@ -179,11 +172,10 @@ void ScopedFd::reset(int fd) {
const int64_t Filesystem::kBadFileSize;
bool Filesystem::DeleteFile(const char* file_name) const {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf("Deleting file %s", file_name);
+ ICING_VLOG(1) << "Deleting file " << file_name;
int ret = unlink(file_name);
if (ret != 0 && errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Deleting file %s failed: %s", file_name, strerror(errno));
+ ICING_LOG(ERROR) << "Deleting file " << file_name << " failed: " << strerror(errno);
return false;
}
return true;
@@ -192,8 +184,7 @@ bool Filesystem::DeleteFile(const char* file_name) const {
bool Filesystem::DeleteDirectory(const char* dir_name) const {
int ret = rmdir(dir_name);
if (ret != 0 && errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Deleting directory %s failed: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Deleting directory " << dir_name << " failed: " << strerror(errno);
return false;
}
return true;
@@ -206,8 +197,7 @@ bool Filesystem::DeleteDirectoryRecursively(const char* dir_name) const {
if (errno == ENOENT) {
return true; // If directory didn't exist, this was successful.
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Stat %s failed: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Stat " << dir_name << " failed: " << strerror(errno);
return false;
}
vector<std::string> entries;
@@ -220,8 +210,7 @@ bool Filesystem::DeleteDirectoryRecursively(const char* dir_name) const {
++i) {
std::string filename = std::string(dir_name) + '/' + *i;
if (stat(filename.c_str(), &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Stat %s failed: %s", filename.c_str(), strerror(errno));
+ ICING_LOG(ERROR) << "Stat " << filename << " failed: " << strerror(errno);
success = false;
} else if (S_ISDIR(st.st_mode)) {
success = DeleteDirectoryRecursively(filename.c_str()) && success;
@@ -244,8 +233,7 @@ bool Filesystem::FileExists(const char* file_name) const {
exists = S_ISREG(st.st_mode) != 0;
} else {
if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to stat file %s: %s", file_name, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file " << file_name << ": " << strerror(errno);
}
exists = false;
}
@@ -259,8 +247,7 @@ bool Filesystem::DirectoryExists(const char* dir_name) const {
exists = S_ISDIR(st.st_mode) != 0;
} else {
if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to stat directory %s: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat directory " << dir_name << ": " << strerror(errno);
}
exists = false;
}
@@ -316,8 +303,7 @@ bool Filesystem::GetMatchingFiles(const char* glob,
int basename_idx = GetBasenameIndex(glob);
if (basename_idx == 0) {
// We need a directory.
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Expected directory, no matching files for: %s", glob);
+ ICING_VLOG(1) << "Expected directory, no matching files for: " << glob;
return true;
}
const char* basename_glob = glob + basename_idx;
@@ -372,8 +358,7 @@ int Filesystem::OpenForRead(const char* file_name) const {
int64_t Filesystem::GetFileSize(int fd) const {
struct stat st;
if (fstat(fd, &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file: " << strerror(errno);
return kBadFileSize;
}
return st.st_size;
@@ -383,11 +368,9 @@ int64_t Filesystem::GetFileSize(const char* filename) const {
struct stat st;
if (stat(filename, &st) < 0) {
if (errno == ENOENT) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Unable to stat file %s: %s", filename, strerror(errno));
+ ICING_VLOG(1) << "Unable to stat file " << filename << ": " << strerror(errno);
} else {
- ICING_LOG(WARNING) << IcingStringUtil::StringPrintf(
- "Unable to stat file %s: %s", filename, strerror(errno));
+ ICING_LOG(WARNING) << "Unable to stat file " << filename << ": " << strerror(errno);
}
return kBadFileSize;
}
@@ -396,8 +379,7 @@ int64_t Filesystem::GetFileSize(const char* filename) const {
bool Filesystem::Truncate(int fd, int64_t new_size) const {
if (ftruncate(fd, new_size) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to truncate file: %s", strerror(errno));
+ ICING_LOG(ERROR) << "Unable to truncate file: " << strerror(errno);
return false;
}
lseek(fd, new_size, SEEK_SET);
@@ -416,8 +398,7 @@ bool Filesystem::Truncate(const char* filename, int64_t new_size) const {
bool Filesystem::Grow(int fd, int64_t new_size) const {
if (ftruncate(fd, new_size) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to grow file: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to grow file: " << strerror(errno);
return false;
}
@@ -442,8 +423,7 @@ bool Filesystem::Write(int fd, const void* data, size_t data_size) const {
size_t chunk_size = std::min<size_t>(write_len, 64u * 1024);
ssize_t wrote = write(fd, data, chunk_size);
if (wrote < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad write: " << strerror(errno);
return false;
}
data = static_cast<const uint8_t*>(data) + wrote;
@@ -521,8 +501,7 @@ bool Filesystem::CopyDirectory(const char* src_dir, const char* dst_dir,
}
}
if (closedir(dir) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Error closing %s: %s",
- src_dir, strerror(errno));
+ ICING_LOG(ERROR) << "Error closing " << src_dir << ": " << strerror(errno);
}
return true;
}
@@ -535,8 +514,7 @@ bool Filesystem::PWrite(int fd, off_t offset, const void* data,
size_t chunk_size = std::min<size_t>(write_len, 64u * 1024);
ssize_t wrote = pwrite(fd, data, chunk_size, offset);
if (wrote < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad write: " << strerror(errno);
return false;
}
data = static_cast<const uint8_t*>(data) + wrote;
@@ -561,8 +539,7 @@ bool Filesystem::PWrite(const char* filename, off_t offset, const void* data,
bool Filesystem::Read(int fd, void* buf, size_t buf_size) const {
ssize_t read_status = read(fd, buf, buf_size);
if (read_status < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad read: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad read: " << strerror(errno);
return false;
}
return true;
@@ -582,8 +559,7 @@ bool Filesystem::Read(const char* filename, void* buf, size_t buf_size) const {
bool Filesystem::PRead(int fd, void* buf, size_t buf_size, off_t offset) const {
ssize_t read_status = pread(fd, buf, buf_size, offset);
if (read_status < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad read: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad read: " << strerror(errno);
return false;
}
return true;
@@ -609,8 +585,7 @@ bool Filesystem::DataSync(int fd) const {
#endif
if (result < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to sync data: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to sync data: " << strerror(errno);
return false;
}
return true;
@@ -618,9 +593,7 @@ bool Filesystem::DataSync(int fd) const {
bool Filesystem::RenameFile(const char* old_name, const char* new_name) const {
if (rename(old_name, new_name) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to rename file %s to %s: %s", old_name, new_name,
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to rename file " << old_name << " to " << new_name << ": " << strerror(errno);
return false;
}
return true;
@@ -658,8 +631,7 @@ bool Filesystem::CreateDirectory(const char* dir_name) const {
if (mkdir(dir_name, S_IRUSR | S_IWUSR | S_IXUSR) == 0) {
success = true;
} else {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Creating directory %s failed: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Creating directory " << dir_name << " failed: " << strerror(errno);
}
}
return success;
@@ -679,8 +651,7 @@ bool Filesystem::CreateDirectoryRecursively(const char* dir_name) const {
int64_t Filesystem::GetDiskUsage(int fd) const {
struct stat st;
if (fstat(fd, &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file: " << strerror(errno);
return kBadFileSize;
}
return st.st_blocks * kStatBlockSize;
@@ -689,8 +660,7 @@ int64_t Filesystem::GetDiskUsage(int fd) const {
int64_t Filesystem::GetFileDiskUsage(const char* path) const {
struct stat st;
if (stat(path, &st) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s",
- path, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat " << path << ": " << strerror(errno);
return kBadFileSize;
}
return st.st_blocks * kStatBlockSize;
@@ -699,8 +669,7 @@ int64_t Filesystem::GetFileDiskUsage(const char* path) const {
int64_t Filesystem::GetDiskUsage(const char* path) const {
struct stat st;
if (stat(path, &st) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s",
- path, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat " << path << ": " << strerror(errno);
return kBadFileSize;
}
int64_t result = st.st_blocks * kStatBlockSize;
diff --git a/icing/file/persistent-hash-map.cc b/icing/file/persistent-hash-map.cc
new file mode 100644
index 0000000..d20285a
--- /dev/null
+++ b/icing/file/persistent-hash-map.cc
@@ -0,0 +1,534 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/persistent-hash-map.h"
+
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/util/crc32.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Helper function to check if there is no termination character '\0' in the
+// key.
+libtextclassifier3::Status ValidateKey(std::string_view key) {
+ if (key.find('\0') != std::string_view::npos) { // NOLINT
+ return absl_ports::InvalidArgumentError(
+ "Key cannot contain termination character '\\0'");
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+// Helper function to convert the key to bucket index by hash.
+//
+// Returns:
+// int32_t: A valid bucket index with range [0, num_buckets - 1].
+// INTERNAL_ERROR if num_buckets == 0
+libtextclassifier3::StatusOr<int32_t> HashKeyToBucketIndex(
+ std::string_view key, int32_t num_buckets) {
+ if (num_buckets == 0) {
+ return absl_ports::InternalError("Should not have empty bucket");
+ }
+ return static_cast<int32_t>(std::hash<std::string_view>()(key) % num_buckets);
+}
+
+// Helper function to PWrite crcs and info to metadata_file_path. Note that
+// metadata_file_path will be the normal or temporary (for branching use when
+// rehashing) metadata file path.
+libtextclassifier3::Status WriteMetadata(const Filesystem& filesystem,
+ const char* metadata_file_path,
+ const PersistentHashMap::Crcs* crcs,
+ const PersistentHashMap::Info* info) {
+ ScopedFd sfd(filesystem.OpenForWrite(metadata_file_path));
+ if (!sfd.is_valid()) {
+ return absl_ports::InternalError("Failed to create metadata file");
+ }
+
+ // Write crcs and info. File layout: <Crcs><Info>
+ if (!filesystem.PWrite(sfd.get(), PersistentHashMap::Crcs::kFileOffset, crcs,
+ sizeof(PersistentHashMap::Crcs))) {
+ return absl_ports::InternalError("Failed to write crcs into metadata file");
+ }
+ // Note that PWrite won't change the file offset, so we need to specify
+ // the correct offset when writing Info.
+ if (!filesystem.PWrite(sfd.get(), PersistentHashMap::Info::kFileOffset, info,
+ sizeof(PersistentHashMap::Info))) {
+ return absl_ports::InternalError("Failed to write info into metadata file");
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+// Helper function to update checksums from info and storages to a Crcs
+// instance. Note that storages will be the normal instances used by
+// PersistentHashMap, or the temporary instances (for branching use when
+// rehashing).
+libtextclassifier3::Status UpdateChecksums(
+ PersistentHashMap::Crcs* crcs, PersistentHashMap::Info* info,
+ FileBackedVector<PersistentHashMap::Bucket>* bucket_storage,
+ FileBackedVector<PersistentHashMap::Entry>* entry_storage,
+ FileBackedVector<char>* kv_storage) {
+ // Compute crcs
+ ICING_ASSIGN_OR_RETURN(Crc32 bucket_storage_crc,
+ bucket_storage->ComputeChecksum());
+ ICING_ASSIGN_OR_RETURN(Crc32 entry_storage_crc,
+ entry_storage->ComputeChecksum());
+ ICING_ASSIGN_OR_RETURN(Crc32 kv_storage_crc, kv_storage->ComputeChecksum());
+
+ crcs->component_crcs.info_crc = info->ComputeChecksum().Get();
+ crcs->component_crcs.bucket_storage_crc = bucket_storage_crc.Get();
+ crcs->component_crcs.entry_storage_crc = entry_storage_crc.Get();
+ crcs->component_crcs.kv_storage_crc = kv_storage_crc.Get();
+ crcs->all_crc = crcs->component_crcs.ComputeChecksum().Get();
+
+ return libtextclassifier3::Status::OK;
+}
+
+// Helper function to validate checksums.
+libtextclassifier3::Status ValidateChecksums(
+ const PersistentHashMap::Crcs* crcs, const PersistentHashMap::Info* info,
+ FileBackedVector<PersistentHashMap::Bucket>* bucket_storage,
+ FileBackedVector<PersistentHashMap::Entry>* entry_storage,
+ FileBackedVector<char>* kv_storage) {
+ if (crcs->all_crc != crcs->component_crcs.ComputeChecksum().Get()) {
+ return absl_ports::FailedPreconditionError(
+ "Invalid all crc for PersistentHashMap");
+ }
+
+ if (crcs->component_crcs.info_crc != info->ComputeChecksum().Get()) {
+ return absl_ports::FailedPreconditionError(
+ "Invalid info crc for PersistentHashMap");
+ }
+
+ ICING_ASSIGN_OR_RETURN(Crc32 bucket_storage_crc,
+ bucket_storage->ComputeChecksum());
+ if (crcs->component_crcs.bucket_storage_crc != bucket_storage_crc.Get()) {
+ return absl_ports::FailedPreconditionError(
+ "Mismatch crc with PersistentHashMap bucket storage");
+ }
+
+ ICING_ASSIGN_OR_RETURN(Crc32 entry_storage_crc,
+ entry_storage->ComputeChecksum());
+ if (crcs->component_crcs.entry_storage_crc != entry_storage_crc.Get()) {
+ return absl_ports::FailedPreconditionError(
+ "Mismatch crc with PersistentHashMap entry storage");
+ }
+
+ ICING_ASSIGN_OR_RETURN(Crc32 kv_storage_crc, kv_storage->ComputeChecksum());
+ if (crcs->component_crcs.kv_storage_crc != kv_storage_crc.Get()) {
+ return absl_ports::FailedPreconditionError(
+ "Mismatch crc with PersistentHashMap key value storage");
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+// Since metadata/bucket/entry storages should be branched when rehashing, we
+// have to store them together under the same sub directory
+// ("<base_dir>/<sub_dir>"). On the other hand, key-value storage won't be
+// branched and it will be stored under <base_dir>.
+//
+// The following 4 methods are helper functions to get the correct path of
+// metadata/bucket/entry/key-value storages, according to the given base
+// directory and sub directory.
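+//
+// For example, with kFilePrefix = "persistent_hash_map" and kSubDirectory =
+// "dynamic" (see persistent-hash-map.h), the resulting paths are:
+//   metadata:  <base_dir>/dynamic/persistent_hash_map.m
+//   bucket:    <base_dir>/dynamic/persistent_hash_map.b
+//   entry:     <base_dir>/dynamic/persistent_hash_map.e
+//   key-value: <base_dir>/persistent_hash_map.k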
+std::string GetMetadataFilePath(std::string_view base_dir,
+ std::string_view sub_dir) {
+ return absl_ports::StrCat(base_dir, "/", sub_dir, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+}
+
+std::string GetBucketStorageFilePath(std::string_view base_dir,
+ std::string_view sub_dir) {
+ return absl_ports::StrCat(base_dir, "/", sub_dir, "/",
+ PersistentHashMap::kFilePrefix, ".b");
+}
+
+std::string GetEntryStorageFilePath(std::string_view base_dir,
+ std::string_view sub_dir) {
+ return absl_ports::StrCat(base_dir, "/", sub_dir, "/",
+ PersistentHashMap::kFilePrefix, ".e");
+}
+
+std::string GetKeyValueStorageFilePath(std::string_view base_dir) {
+ return absl_ports::StrCat(base_dir, "/", PersistentHashMap::kFilePrefix,
+ ".k");
+}
+
+} // namespace
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+PersistentHashMap::Create(const Filesystem& filesystem,
+ std::string_view base_dir, int32_t value_type_size,
+ int32_t max_load_factor_percent) {
+ if (!filesystem.FileExists(
+ GetMetadataFilePath(base_dir, kSubDirectory).c_str()) ||
+ !filesystem.FileExists(
+ GetBucketStorageFilePath(base_dir, kSubDirectory).c_str()) ||
+ !filesystem.FileExists(
+ GetEntryStorageFilePath(base_dir, kSubDirectory).c_str()) ||
+ !filesystem.FileExists(GetKeyValueStorageFilePath(base_dir).c_str())) {
+ // TODO: erase all files if missing any.
+ return InitializeNewFiles(filesystem, base_dir, value_type_size,
+ max_load_factor_percent);
+ }
+ return InitializeExistingFiles(filesystem, base_dir, value_type_size,
+ max_load_factor_percent);
+}
+
+PersistentHashMap::~PersistentHashMap() {
+ if (!PersistToDisk().ok()) {
+ ICING_LOG(WARNING)
+ << "Failed to persist hash map to disk while destructing " << base_dir_;
+ }
+}
+
+libtextclassifier3::Status PersistentHashMap::Put(std::string_view key,
+ const void* value) {
+ ICING_RETURN_IF_ERROR(ValidateKey(key));
+ ICING_ASSIGN_OR_RETURN(
+ int32_t bucket_idx,
+ HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
+
+ ICING_ASSIGN_OR_RETURN(int32_t target_entry_idx,
+ FindEntryIndexByKey(bucket_idx, key));
+ if (target_entry_idx == Entry::kInvalidIndex) {
+ // If not found, then insert new key value pair.
+ return Insert(bucket_idx, key, value);
+ }
+
+ // Otherwise, overwrite the value.
+ ICING_ASSIGN_OR_RETURN(const Entry* entry,
+ entry_storage_->Get(target_entry_idx));
+
+ int32_t kv_len = key.length() + 1 + info()->value_type_size;
+ int32_t value_offset = key.length() + 1;
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<char>::MutableArrayView mutable_kv_arr,
+ kv_storage_->GetMutable(entry->key_value_index(), kv_len));
+ // It is the same key and value_size is fixed, so we can directly overwrite
+ // serialized value.
+ mutable_kv_arr.SetArray(value_offset, reinterpret_cast<const char*>(value),
+ info()->value_type_size);
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status PersistentHashMap::GetOrPut(std::string_view key,
+ void* next_value) {
+ ICING_RETURN_IF_ERROR(ValidateKey(key));
+ ICING_ASSIGN_OR_RETURN(
+ int32_t bucket_idx,
+ HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
+
+ ICING_ASSIGN_OR_RETURN(int32_t target_entry_idx,
+ FindEntryIndexByKey(bucket_idx, key));
+ if (target_entry_idx == Entry::kInvalidIndex) {
+ // If not found, then insert new key value pair.
+ return Insert(bucket_idx, key, next_value);
+ }
+
+ // Otherwise, copy the hash map value into next_value.
+ return CopyEntryValue(target_entry_idx, next_value);
+}
+
+libtextclassifier3::Status PersistentHashMap::Get(std::string_view key,
+ void* value) const {
+ ICING_RETURN_IF_ERROR(ValidateKey(key));
+ ICING_ASSIGN_OR_RETURN(
+ int32_t bucket_idx,
+ HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
+
+ ICING_ASSIGN_OR_RETURN(int32_t target_entry_idx,
+ FindEntryIndexByKey(bucket_idx, key));
+ if (target_entry_idx == Entry::kInvalidIndex) {
+ return absl_ports::NotFoundError(
+ absl_ports::StrCat("Key not found in PersistentHashMap ", base_dir_));
+ }
+
+ return CopyEntryValue(target_entry_idx, value);
+}
+
+libtextclassifier3::Status PersistentHashMap::PersistToDisk() {
+ ICING_RETURN_IF_ERROR(bucket_storage_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(entry_storage_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(kv_storage_->PersistToDisk());
+
+ ICING_RETURN_IF_ERROR(UpdateChecksums(crcs(), info(), bucket_storage_.get(),
+ entry_storage_.get(),
+ kv_storage_.get()));
+ // Changes should have been applied to the underlying file when using
+ // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an
+ // extra safety step to ensure they are written out.
+ ICING_RETURN_IF_ERROR(metadata_mmapped_file_->PersistToDisk());
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetDiskUsage() const {
+ ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_disk_usage,
+ bucket_storage_->GetDiskUsage());
+ ICING_ASSIGN_OR_RETURN(int64_t entry_storage_disk_usage,
+ entry_storage_->GetDiskUsage());
+ ICING_ASSIGN_OR_RETURN(int64_t kv_storage_disk_usage,
+ kv_storage_->GetDiskUsage());
+
+ int64_t total = bucket_storage_disk_usage + entry_storage_disk_usage +
+ kv_storage_disk_usage;
+ Filesystem::IncrementByOrSetInvalid(
+ filesystem_->GetDiskUsage(
+ GetMetadataFilePath(base_dir_, kSubDirectory).c_str()),
+ &total);
+
+ if (total < 0 || total == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ "Failed to get disk usage of PersistentHashMap");
+ }
+ return total;
+}
+
+libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetElementsSize()
+ const {
+ ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_elements_size,
+ bucket_storage_->GetElementsFileSize());
+ ICING_ASSIGN_OR_RETURN(int64_t entry_storage_elements_size,
+ entry_storage_->GetElementsFileSize());
+ ICING_ASSIGN_OR_RETURN(int64_t kv_storage_elements_size,
+ kv_storage_->GetElementsFileSize());
+ return bucket_storage_elements_size + entry_storage_elements_size +
+ kv_storage_elements_size;
+}
+
+libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeChecksum() {
+ Crcs* crcs_ptr = crcs();
+ ICING_RETURN_IF_ERROR(UpdateChecksums(crcs_ptr, info(), bucket_storage_.get(),
+ entry_storage_.get(),
+ kv_storage_.get()));
+ return Crc32(crcs_ptr->all_crc);
+}
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+PersistentHashMap::InitializeNewFiles(const Filesystem& filesystem,
+ std::string_view base_dir,
+ int32_t value_type_size,
+ int32_t max_load_factor_percent) {
+ // Create directory.
+ const std::string dir_path = absl_ports::StrCat(base_dir, "/", kSubDirectory);
+ if (!filesystem.CreateDirectoryRecursively(dir_path.c_str())) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to create directory: ", dir_path));
+ }
+
+ // Initialize 3 storages
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
+ FileBackedVector<Bucket>::Create(
+ filesystem, GetBucketStorageFilePath(base_dir, kSubDirectory),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Entry>> entry_storage,
+ FileBackedVector<Entry>::Create(
+ filesystem, GetEntryStorageFilePath(base_dir, kSubDirectory),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<FileBackedVector<char>> kv_storage,
+ FileBackedVector<char>::Create(
+ filesystem, GetKeyValueStorageFilePath(base_dir),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ // Initialize one bucket.
+ ICING_RETURN_IF_ERROR(bucket_storage->Append(Bucket()));
+ ICING_RETURN_IF_ERROR(bucket_storage->PersistToDisk());
+
+ // Create and initialize new info
+ Info new_info;
+ new_info.version = kVersion;
+ new_info.value_type_size = value_type_size;
+ new_info.max_load_factor_percent = max_load_factor_percent;
+ new_info.num_deleted_entries = 0;
+ new_info.num_deleted_key_value_bytes = 0;
+
+ // Compute checksums
+ Crcs new_crcs;
+ ICING_RETURN_IF_ERROR(UpdateChecksums(&new_crcs, &new_info,
+ bucket_storage.get(),
+ entry_storage.get(), kv_storage.get()));
+
+ const std::string metadata_file_path =
+ GetMetadataFilePath(base_dir, kSubDirectory);
+ // Write new metadata file
+ ICING_RETURN_IF_ERROR(WriteMetadata(filesystem, metadata_file_path.c_str(),
+ &new_crcs, &new_info));
+
+ // Mmap the content of the crcs and info.
+ auto metadata_mmapped_file = std::make_unique<MemoryMappedFile>(
+ filesystem, metadata_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
+ ICING_RETURN_IF_ERROR(metadata_mmapped_file->Remap(
+ /*file_offset=*/0, /*mmap_size=*/sizeof(Crcs) + sizeof(Info)));
+
+ return std::unique_ptr<PersistentHashMap>(new PersistentHashMap(
+ filesystem, base_dir, std::move(metadata_mmapped_file),
+ std::move(bucket_storage), std::move(entry_storage),
+ std::move(kv_storage)));
+}
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+PersistentHashMap::InitializeExistingFiles(const Filesystem& filesystem,
+ std::string_view base_dir,
+ int32_t value_type_size,
+ int32_t max_load_factor_percent) {
+ // Mmap the content of the crcs and info.
+ auto metadata_mmapped_file = std::make_unique<MemoryMappedFile>(
+ filesystem, GetMetadataFilePath(base_dir, kSubDirectory),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
+ ICING_RETURN_IF_ERROR(metadata_mmapped_file->Remap(
+ /*file_offset=*/0, /*mmap_size=*/sizeof(Crcs) + sizeof(Info)));
+
+ // Initialize 3 storages
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
+ FileBackedVector<Bucket>::Create(
+ filesystem, GetBucketStorageFilePath(base_dir, kSubDirectory),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Entry>> entry_storage,
+ FileBackedVector<Entry>::Create(
+ filesystem, GetEntryStorageFilePath(base_dir, kSubDirectory),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<FileBackedVector<char>> kv_storage,
+ FileBackedVector<char>::Create(
+ filesystem, GetKeyValueStorageFilePath(base_dir),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ Crcs* crcs_ptr = reinterpret_cast<Crcs*>(
+ metadata_mmapped_file->mutable_region() + Crcs::kFileOffset);
+ Info* info_ptr = reinterpret_cast<Info*>(
+ metadata_mmapped_file->mutable_region() + Info::kFileOffset);
+
+ // Value type size should be consistent.
+ if (value_type_size != info_ptr->value_type_size) {
+ return absl_ports::FailedPreconditionError("Incorrect value type size");
+ }
+
+ // Validate checksums of info and 3 storages.
+ ICING_RETURN_IF_ERROR(
+ ValidateChecksums(crcs_ptr, info_ptr, bucket_storage.get(),
+ entry_storage.get(), kv_storage.get()));
+
+ // Allow max_load_factor_percent_ change.
+ if (max_load_factor_percent != info_ptr->max_load_factor_percent) {
+ ICING_VLOG(2) << "Changing max_load_factor_percent from " << info_ptr->max_load_factor_percent << " to " << max_load_factor_percent;
+
+ info_ptr->max_load_factor_percent = max_load_factor_percent;
+ crcs_ptr->component_crcs.info_crc = info_ptr->ComputeChecksum().Get();
+ crcs_ptr->all_crc = crcs_ptr->component_crcs.ComputeChecksum().Get();
+ ICING_RETURN_IF_ERROR(metadata_mmapped_file->PersistToDisk());
+ // TODO(b/193919210): rehash if needed
+ }
+
+ return std::unique_ptr<PersistentHashMap>(new PersistentHashMap(
+ filesystem, base_dir, std::move(metadata_mmapped_file),
+ std::move(bucket_storage), std::move(entry_storage),
+ std::move(kv_storage)));
+}
+
+libtextclassifier3::StatusOr<int32_t> PersistentHashMap::FindEntryIndexByKey(
+ int32_t bucket_idx, std::string_view key) const {
+ // Iterate all entries in the bucket, compare with key, and return the entry
+ // index if exists.
+ ICING_ASSIGN_OR_RETURN(const Bucket* bucket,
+ bucket_storage_->Get(bucket_idx));
+ int32_t curr_entry_idx = bucket->head_entry_index();
+ while (curr_entry_idx != Entry::kInvalidIndex) {
+ ICING_ASSIGN_OR_RETURN(const Entry* entry,
+ entry_storage_->Get(curr_entry_idx));
+ if (entry->key_value_index() == kInvalidKVIndex) {
+ ICING_LOG(ERROR) << "Got an invalid key value index in the persistent "
+ "hash map bucket. This shouldn't happen";
+ return absl_ports::InternalError("Unexpected invalid key value index");
+ }
+ ICING_ASSIGN_OR_RETURN(const char* kv_arr,
+ kv_storage_->Get(entry->key_value_index()));
+ if (key.compare(kv_arr) == 0) {
+ return curr_entry_idx;
+ }
+
+ curr_entry_idx = entry->next_entry_index();
+ }
+
+ return curr_entry_idx;
+}
+
+libtextclassifier3::Status PersistentHashMap::CopyEntryValue(
+ int32_t entry_idx, void* value) const {
+ ICING_ASSIGN_OR_RETURN(const Entry* entry, entry_storage_->Get(entry_idx));
+
+ ICING_ASSIGN_OR_RETURN(const char* kv_arr,
+ kv_storage_->Get(entry->key_value_index()));
+ int32_t value_offset = strlen(kv_arr) + 1;
+ memcpy(value, kv_arr + value_offset, info()->value_type_size);
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status PersistentHashMap::Insert(int32_t bucket_idx,
+ std::string_view key,
+ const void* value) {
+ // If size() + 1 exceeds Entry::kMaxNumEntries, then return error.
+ if (size() > Entry::kMaxNumEntries - 1) {
+ return absl_ports::ResourceExhaustedError("Cannot insert new entry");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<Bucket>::MutableView mutable_bucket,
+ bucket_storage_->GetMutable(bucket_idx));
+
+ // Append new key value.
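+  // The serialized key-value layout is <key bytes>'\0'<fixed-size value
+  // bytes>, so kv_len = key.size() + 1 + value_type_size, and the value
+  // starts at offset key.size() + 1.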
+ int32_t new_kv_idx = kv_storage_->num_elements();
+ int32_t kv_len = key.size() + 1 + info()->value_type_size;
+ int32_t value_offset = key.size() + 1;
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<char>::MutableArrayView mutable_new_kv_arr,
+ kv_storage_->Allocate(kv_len));
+ mutable_new_kv_arr.SetArray(/*idx=*/0, key.data(), key.size());
+ mutable_new_kv_arr.SetArray(/*idx=*/key.size(), "\0", 1);
+ mutable_new_kv_arr.SetArray(/*idx=*/value_offset,
+ reinterpret_cast<const char*>(value),
+ info()->value_type_size);
+
+ // Append new entry.
+ int32_t new_entry_idx = entry_storage_->num_elements();
+ ICING_RETURN_IF_ERROR(entry_storage_->Append(
+ Entry(new_kv_idx, mutable_bucket.Get().head_entry_index())));
+ mutable_bucket.Get().set_head_entry_index(new_entry_idx);
+
+ // TODO: rehash if needed
+
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/persistent-hash-map.h b/icing/file/persistent-hash-map.h
new file mode 100644
index 0000000..24a47ea
--- /dev/null
+++ b/icing/file/persistent-hash-map.h
@@ -0,0 +1,383 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_PERSISTENT_HASH_MAP_H_
+#define ICING_FILE_PERSISTENT_HASH_MAP_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+// Low level persistent hash map.
+// It supports variable length serialized keys and fixed length serialized
+// values. Key and value can be any type, but callers should serialize
+// key/value by themselves and pass raw bytes into the hash map, and the
+// serialized key should not contain the termination character '\0'.
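+//
+// A minimal usage sketch (illustrative; "filesystem" and "base_dir" are
+// assumed to be provided by the caller, and values are ints here):
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<PersistentHashMap> hash_map,
+//       PersistentHashMap::Create(filesystem, base_dir,
+//                                 /*value_type_size=*/sizeof(int)));
+//   int val = 1;
+//   ICING_RETURN_IF_ERROR(hash_map->Put("key", &val));
+//   int read_val;
+//   ICING_RETURN_IF_ERROR(hash_map->Get("key", &read_val));
+//   ICING_RETURN_IF_ERROR(hash_map->PersistToDisk());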
+class PersistentHashMap {
+ public:
+ // Crcs and Info will be written into the metadata file.
+ // File layout: <Crcs><Info>
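+  // Per the static_asserts and kFileOffset constants below, Crcs occupies
+  // bytes [0, 20) and Info occupies bytes [20, 40) of the metadata file.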
+ // Crcs
+ struct Crcs {
+ static constexpr int32_t kFileOffset = 0;
+
+ struct ComponentCrcs {
+ uint32_t info_crc;
+ uint32_t bucket_storage_crc;
+ uint32_t entry_storage_crc;
+ uint32_t kv_storage_crc;
+
+ bool operator==(const ComponentCrcs& other) const {
+ return info_crc == other.info_crc &&
+ bucket_storage_crc == other.bucket_storage_crc &&
+ entry_storage_crc == other.entry_storage_crc &&
+ kv_storage_crc == other.kv_storage_crc;
+ }
+
+ Crc32 ComputeChecksum() const {
+ return Crc32(std::string_view(reinterpret_cast<const char*>(this),
+ sizeof(ComponentCrcs)));
+ }
+ } __attribute__((packed));
+
+ bool operator==(const Crcs& other) const {
+ return all_crc == other.all_crc && component_crcs == other.component_crcs;
+ }
+
+ uint32_t all_crc;
+ ComponentCrcs component_crcs;
+ } __attribute__((packed));
+ static_assert(sizeof(Crcs) == 20, "");
+
+ // Info
+ struct Info {
+ static constexpr int32_t kFileOffset = static_cast<int32_t>(sizeof(Crcs));
+
+ int32_t version;
+ int32_t value_type_size;
+ int32_t max_load_factor_percent;
+ int32_t num_deleted_entries;
+ int32_t num_deleted_key_value_bytes;
+
+ Crc32 ComputeChecksum() const {
+ return Crc32(
+ std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
+ }
+ } __attribute__((packed));
+ static_assert(sizeof(Info) == 20, "");
+
+ // Bucket
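+  //
+  // A bucket stores the index of the head entry of a singly linked list of
+  // entries that hash to it; entries are chained via Entry::next_entry_index
+  // (see FindEntryIndexByKey() and Insert() in persistent-hash-map.cc).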
+ class Bucket {
+ public:
+ // Absolute max # of buckets allowed. Since max file size on Android is
+ // 2^31-1, we can at most have ~2^29 buckets. To make it power of 2, round
+ // it down to 2^28. Also since we're using FileBackedVector to store
+ // buckets, add some static_asserts to ensure numbers here are compatible
+ // with FileBackedVector.
+ static constexpr int32_t kMaxNumBuckets = 1 << 28;
+
+ explicit Bucket(int32_t head_entry_index = Entry::kInvalidIndex)
+ : head_entry_index_(head_entry_index) {}
+
+ // For FileBackedVector
+ bool operator==(const Bucket& other) const {
+ return head_entry_index_ == other.head_entry_index_;
+ }
+
+ int32_t head_entry_index() const { return head_entry_index_; }
+ void set_head_entry_index(int32_t head_entry_index) {
+ head_entry_index_ = head_entry_index;
+ }
+
+ private:
+ int32_t head_entry_index_;
+ } __attribute__((packed));
+ static_assert(sizeof(Bucket) == 4, "");
+ static_assert(sizeof(Bucket) == FileBackedVector<Bucket>::kElementTypeSize,
+ "Bucket type size is inconsistent with FileBackedVector "
+ "element type size");
+ static_assert(Bucket::kMaxNumBuckets <=
+ (FileBackedVector<Bucket>::kMaxFileSize -
+ FileBackedVector<Bucket>::Header::kHeaderSize) /
+ FileBackedVector<Bucket>::kElementTypeSize,
+ "Max # of buckets cannot fit into FileBackedVector");
+
+ // Entry
+ class Entry {
+ public:
+ // Absolute max # of entries allowed. Since max file size on Android is
+ // 2^31-1, we can at most have ~2^28 entries. To make it power of 2, round
+ // it down to 2^27. Also since we're using FileBackedVector to store
+ // entries, add some static_asserts to ensure numbers here are compatible
+ // with FileBackedVector.
+ //
+    // Still, the actual max # of entries is determined by the key-value
+    // storage, since the length of the key varies and affects the # of
+    // actual key-value pairs that can be stored.
+ static constexpr int32_t kMaxNumEntries = 1 << 27;
+ static constexpr int32_t kMaxIndex = kMaxNumEntries - 1;
+ static constexpr int32_t kInvalidIndex = -1;
+
+ explicit Entry(int32_t key_value_index, int32_t next_entry_index)
+ : key_value_index_(key_value_index),
+ next_entry_index_(next_entry_index) {}
+
+ bool operator==(const Entry& other) const {
+ return key_value_index_ == other.key_value_index_ &&
+ next_entry_index_ == other.next_entry_index_;
+ }
+
+ int32_t key_value_index() const { return key_value_index_; }
+ void set_key_value_index(int32_t key_value_index) {
+ key_value_index_ = key_value_index;
+ }
+
+ int32_t next_entry_index() const { return next_entry_index_; }
+ void set_next_entry_index(int32_t next_entry_index) {
+ next_entry_index_ = next_entry_index;
+ }
+
+ private:
+ int32_t key_value_index_;
+ int32_t next_entry_index_;
+ } __attribute__((packed));
+ static_assert(sizeof(Entry) == 8, "");
+ static_assert(sizeof(Entry) == FileBackedVector<Entry>::kElementTypeSize,
+ "Entry type size is inconsistent with FileBackedVector "
+ "element type size");
+ static_assert(Entry::kMaxNumEntries <=
+ (FileBackedVector<Entry>::kMaxFileSize -
+ FileBackedVector<Entry>::Header::kHeaderSize) /
+ FileBackedVector<Entry>::kElementTypeSize,
+ "Max # of entries cannot fit into FileBackedVector");
+
+ // Key-value serialized type
+ static constexpr int32_t kMaxKVTotalByteSize =
+ (FileBackedVector<char>::kMaxFileSize -
+ FileBackedVector<char>::Header::kHeaderSize) /
+ FileBackedVector<char>::kElementTypeSize;
+ static constexpr int32_t kMaxKVIndex = kMaxKVTotalByteSize - 1;
+ static constexpr int32_t kInvalidKVIndex = -1;
+ static_assert(sizeof(char) == FileBackedVector<char>::kElementTypeSize,
+ "Char type size is inconsistent with FileBackedVector element "
+ "type size");
+
+ static constexpr int32_t kVersion = 1;
+ static constexpr int32_t kDefaultMaxLoadFactorPercent = 75;
+
+ static constexpr std::string_view kFilePrefix = "persistent_hash_map";
+  // Only the metadata, bucket, and entry files are stored under this
+  // sub-directory, for branching use when rehashing.
+ static constexpr std::string_view kSubDirectory = "dynamic";
+
+ // Creates a new PersistentHashMap to read/write/delete key value pairs.
+ //
+ // filesystem: Object to make system level calls
+  // base_dir: Specifies the directory in which all persistent hash map
+  //           related sub-directories and files are stored. If base_dir
+  //           doesn't exist, then PersistentHashMap will automatically
+  //           create it. If files exist, then it will initialize the hash map
+  //           from the existing files.
+ // value_type_size: (fixed) size of the serialized value type for hash map.
+ // max_load_factor_percent: percentage of the max loading for the hash map.
+ // load_factor_percent = 100 * num_keys / num_buckets
+ // If load_factor_percent exceeds
+ // max_load_factor_percent, then rehash will be
+ // invoked (and # of buckets will be doubled).
+ // Note that load_factor_percent exceeding 100 is
+ // considered valid.
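+  //                          For example (illustrative; rehashing is still a
+  //                          TODO in this change): with
+  //                          max_load_factor_percent = 75 and 100 buckets,
+  //                          inserting the 76th key pushes
+  //                          load_factor_percent above 75 and would double
+  //                          the # of buckets to 200.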
+ //
+ // Returns:
+ // FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
+ // checksum.
+ // INTERNAL_ERROR on I/O errors.
+ // Any FileBackedVector errors.
+ static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ Create(const Filesystem& filesystem, std::string_view base_dir,
+ int32_t value_type_size,
+ int32_t max_load_factor_percent = kDefaultMaxLoadFactorPercent);
+
+ ~PersistentHashMap();
+
+  // Update a key value pair. If the key does not exist, then insert
+  // (key, value) into the storage. Otherwise, overwrite the existing value in
+  // the storage.
+ //
+ // REQUIRES: the buffer pointed to by value must be of value_size()
+ //
+ // Returns:
+ // OK on success
+ // RESOURCE_EXHAUSTED_ERROR if # of entries reach kMaxNumEntries
+ // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0')
+ // INTERNAL_ERROR on I/O error or any data inconsistency
+ // Any FileBackedVector errors
+ libtextclassifier3::Status Put(std::string_view key, const void* value);
+
+ // If key does not exist, then insert (key, next_value) into the storage.
+ // Otherwise, copy the hash map value into next_value.
+ //
+ // REQUIRES: the buffer pointed to by next_value must be of value_size()
+ //
+ // Returns:
+ // OK on success
+ // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0')
+ // INTERNAL_ERROR on I/O error or any data inconsistency
+ // Any FileBackedVector errors
+ libtextclassifier3::Status GetOrPut(std::string_view key, void* next_value);
+
+  // Get the value by key from the storage. If the key exists, then copy the
+  // hash map value into the value buffer. Otherwise, return NOT_FOUND_ERROR.
+ //
+ // REQUIRES: the buffer pointed to by value must be of value_size()
+ //
+ // Returns:
+ // OK on success
+ // NOT_FOUND_ERROR if the key doesn't exist
+ // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0')
+ // INTERNAL_ERROR on I/O error or any data inconsistency
+ // Any FileBackedVector errors
+ libtextclassifier3::Status Get(std::string_view key, void* value) const;
+
+ // Flushes content to underlying files.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status PersistToDisk();
+
+ // Calculates and returns the disk usage (metadata + 3 storages total file
+ // size) in bytes.
+ //
+ // Returns:
+ // Disk usage on success
+ // INTERNAL_ERROR on I/O error
+ libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+
+  // Returns the total file size, in bytes, of all the elements held in the
+  // persistent hash map. This excludes the size of any internal metadata,
+  // i.e. the crcs/info of the persistent hash map and the file backed
+  // vectors' headers.
+ //
+ // Returns:
+ // File size on success
+ // INTERNAL_ERROR on I/O error
+ libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;
+
+ // Updates all checksums of the persistent hash map components and returns
+ // all_crc.
+ //
+ // Returns:
+ // Crc of all components (all_crc) on success
+ // INTERNAL_ERROR if any data inconsistency
+ libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+
+ int32_t size() const {
+ return entry_storage_->num_elements() - info()->num_deleted_entries;
+ }
+
+ bool empty() const { return size() == 0; }
+
+ private:
+ explicit PersistentHashMap(
+ const Filesystem& filesystem, std::string_view base_dir,
+ std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,
+ std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
+ std::unique_ptr<FileBackedVector<Entry>> entry_storage,
+ std::unique_ptr<FileBackedVector<char>> kv_storage)
+ : filesystem_(&filesystem),
+ base_dir_(base_dir),
+ metadata_mmapped_file_(std::move(metadata_mmapped_file)),
+ bucket_storage_(std::move(bucket_storage)),
+ entry_storage_(std::move(entry_storage)),
+ kv_storage_(std::move(kv_storage)) {}
+
+ static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ InitializeNewFiles(const Filesystem& filesystem, std::string_view base_dir,
+ int32_t value_type_size, int32_t max_load_factor_percent);
+
+ static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ InitializeExistingFiles(const Filesystem& filesystem,
+ std::string_view base_dir, int32_t value_type_size,
+ int32_t max_load_factor_percent);
+
+ // Find the index of the key entry from a bucket (specified by bucket index).
+ // The caller should specify the desired bucket index.
+ //
+ // Returns:
+ // int32_t: on success, the index of the entry, or Entry::kInvalidIndex if
+ // not found
+ // INTERNAL_ERROR if any content inconsistency
+ // Any FileBackedVector errors
+ libtextclassifier3::StatusOr<int32_t> FindEntryIndexByKey(
+ int32_t bucket_idx, std::string_view key) const;
+
+ // Copy the hash map value of the entry into value buffer.
+ //
+ // REQUIRES: entry_idx should be valid.
+ // REQUIRES: the buffer pointed to by value must be of value_size()
+ //
+ // Returns:
+ // OK on success
+ // Any FileBackedVector errors
+ libtextclassifier3::Status CopyEntryValue(int32_t entry_idx,
+ void* value) const;
+
+ // Insert a new key value pair into a bucket (specified by the bucket index).
+ // The caller should specify the desired bucket index and make sure that the
+ // key is not present in the hash map before calling.
+ //
+ // Returns:
+ // OK on success
+ // Any FileBackedVector errors
+ libtextclassifier3::Status Insert(int32_t bucket_idx, std::string_view key,
+ const void* value);
+
+ Crcs* crcs() {
+ return reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() +
+ Crcs::kFileOffset);
+ }
+
+ Info* info() {
+ return reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() +
+ Info::kFileOffset);
+ }
+
+ const Info* info() const {
+ return reinterpret_cast<const Info*>(metadata_mmapped_file_->region() +
+ Info::kFileOffset);
+ }
+
+ const Filesystem* filesystem_;
+ std::string base_dir_;
+
+ std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_;
+
+ // Storages
+ std::unique_ptr<FileBackedVector<Bucket>> bucket_storage_;
+ std::unique_ptr<FileBackedVector<Entry>> entry_storage_;
+ std::unique_ptr<FileBackedVector<char>> kv_storage_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_PERSISTENT_HASH_MAP_H_
diff --git a/icing/file/persistent-hash-map_test.cc b/icing/file/persistent-hash-map_test.cc
new file mode 100644
index 0000000..fb15175
--- /dev/null
+++ b/icing/file/persistent-hash-map_test.cc
@@ -0,0 +1,662 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/persistent-hash-map.h"
+
+#include <cstring>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+static constexpr int32_t kCorruptedValueOffset = 3;
+
+using ::testing::Eq;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::Not;
+using ::testing::Pointee;
+using ::testing::SizeIs;
+
+using Bucket = PersistentHashMap::Bucket;
+using Crcs = PersistentHashMap::Crcs;
+using Entry = PersistentHashMap::Entry;
+using Info = PersistentHashMap::Info;
+
+class PersistentHashMapTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ base_dir_ = GetTestTempDir() + "/persistent_hash_map_test";
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
+ std::vector<char> Serialize(int val) {
+ std::vector<char> ret(sizeof(val));
+ memcpy(ret.data(), &val, sizeof(val));
+ return ret;
+ }
+
+ libtextclassifier3::StatusOr<int> GetValueByKey(
+ PersistentHashMap* persistent_hash_map, std::string_view key) {
+ int val;
+ ICING_RETURN_IF_ERROR(persistent_hash_map->Get(key, &val));
+ return val;
+ }
+
+ Filesystem filesystem_;
+ std::string base_dir_;
+};
+
+TEST_F(PersistentHashMapTest, InvalidBaseDir) {
+ EXPECT_THAT(PersistentHashMap::Create(filesystem_, "/dev/null",
+ /*value_type_size=*/sizeof(int)),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(PersistentHashMapTest, InitializeNewFiles) {
+ {
+ ASSERT_FALSE(filesystem_.DirectoryExists(base_dir_.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ EXPECT_THAT(persistent_hash_map, Pointee(IsEmpty()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ // Metadata file should be initialized correctly for both info and crcs
+ // sections.
+ const std::string metadata_file_path =
+ absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ // Check info section
+ Info info;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info),
+ Info::kFileOffset));
+ EXPECT_THAT(info.version, Eq(PersistentHashMap::kVersion));
+ EXPECT_THAT(info.value_type_size, Eq(sizeof(int)));
+ EXPECT_THAT(info.max_load_factor_percent,
+ Eq(PersistentHashMap::kDefaultMaxLoadFactorPercent));
+ EXPECT_THAT(info.num_deleted_entries, Eq(0));
+ EXPECT_THAT(info.num_deleted_key_value_bytes, Eq(0));
+
+ // Check crcs section
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ Crcs::kFileOffset));
+ // # of elements in bucket_storage should be 1, so it should have non-zero
+ // crc value.
+ EXPECT_THAT(crcs.component_crcs.bucket_storage_crc, Not(Eq(0)));
+ // Other empty file backed vectors should have 0 crc value.
+ EXPECT_THAT(crcs.component_crcs.entry_storage_crc, Eq(0));
+ EXPECT_THAT(crcs.component_crcs.kv_storage_crc, Eq(0));
+ EXPECT_THAT(crcs.component_crcs.info_crc,
+ Eq(Crc32(std::string_view(reinterpret_cast<const char*>(&info),
+ sizeof(Info)))
+ .Get()));
+ EXPECT_THAT(crcs.all_crc,
+ Eq(Crc32(std::string_view(
+ reinterpret_cast<const char*>(&crcs.component_crcs),
+ sizeof(Crcs::ComponentCrcs)))
+ .Get()));
+}
+
+TEST_F(PersistentHashMapTest,
+ TestInitializationFailsWithoutPersistToDiskOrDestruction) {
+ // Create new persistent hash map
+  // Set max_load_factor_percent to 1000. Load factor percent is calculated as
+  // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of
+  // buckets in an empty PersistentHashMap) and a max_load_factor_percent of
+  // 1000, up to 10 keys can be inserted before rehashing, which avoids
+  // PersistToDisk being called implicitly by a rehash.
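+  //
+  // A minimal sketch of that arithmetic (the formula and the single initial
+  // bucket come from the comment above; the concrete key counts are
+  // illustrative only):
+  static_assert(100 * 10 / 1 <= 1000,
+                "10 keys stay within a max load factor of 1000%");
+  static_assert(100 * 11 / 1 > 1000,
+                "an 11th key would exceed the limit and trigger a rehash");
+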
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ /*max_load_factor_percent=*/1000));
+
+ // Put some key value pairs.
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+ ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data()));
+ // TODO(b/193919210): call Delete() to change PersistentHashMap header
+
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+
+ // Without calling PersistToDisk, checksums will not be recomputed or synced
+ // to disk, so initializing another instance on the same files should fail.
+ EXPECT_THAT(PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ /*max_load_factor_percent=*/1000),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_F(PersistentHashMapTest, TestInitializationSucceedsWithPersistToDisk) {
+ // Create new persistent hash map
+  // Set max_load_factor_percent to 1000. Load factor percent is calculated as
+  // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of
+  // buckets in an empty PersistentHashMap) and a max_load_factor_percent of
+  // 1000, up to 10 keys can be inserted before rehashing, which avoids
+  // PersistToDisk being called implicitly by a rehash.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map1,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ /*max_load_factor_percent=*/1000));
+
+ // Put some key value pairs.
+ ICING_ASSERT_OK(persistent_hash_map1->Put("a", Serialize(1).data()));
+ ICING_ASSERT_OK(persistent_hash_map1->Put("b", Serialize(2).data()));
+ // TODO(b/193919210): call Delete() to change PersistentHashMap header
+
+ ASSERT_THAT(persistent_hash_map1, Pointee(SizeIs(2)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map1.get(), "a"), IsOkAndHolds(1));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map1.get(), "b"), IsOkAndHolds(2));
+
+ // After calling PersistToDisk, all checksums should be recomputed and synced
+ // correctly to disk, so initializing another instance on the same files
+ // should succeed, and we should be able to get the same contents.
+ ICING_EXPECT_OK(persistent_hash_map1->PersistToDisk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map2,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ /*max_load_factor_percent=*/1000));
+ EXPECT_THAT(persistent_hash_map2, Pointee(SizeIs(2)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map2.get(), "a"), IsOkAndHolds(1));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map2.get(), "b"), IsOkAndHolds(2));
+}
+
+TEST_F(PersistentHashMapTest, TestInitializationSucceedsAfterDestruction) {
+ {
+ // Create new persistent hash map
+    // Set max_load_factor_percent to 1000. Load factor percent is calculated
+    // as 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial
+    // # of buckets in an empty PersistentHashMap) and a max_load_factor_percent
+    // of 1000, up to 10 keys can be inserted before rehashing, which avoids
+    // PersistToDisk being called implicitly by a rehash.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ /*max_load_factor_percent=*/1000));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+ ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data()));
+ // TODO(b/193919210): call Delete() to change PersistentHashMap header
+
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+ }
+
+ {
+    // The previous instance went out of scope and was destroyed. Although we
+    // didn't call PersistToDisk explicitly, the destructor should invoke it,
+    // so initializing another instance on the same files should succeed and
+    // we should be able to read back the same contents.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ /*max_load_factor_percent=*/1000));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+ }
+}
+
+TEST_F(PersistentHashMapTest,
+ InitializeExistingFilesWithDifferentValueTypeSizeShouldFail) {
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ {
+    // Attempt to create the persistent hash map with a different value type
+    // size. This should fail.
+ ASSERT_THAT(sizeof(char), Not(Eq(sizeof(int))));
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or = PersistentHashMap::Create(
+ filesystem_, base_dir_, /*value_type_size=*/sizeof(char));
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("Incorrect value type size"));
+ }
+}
+
+TEST_F(PersistentHashMapTest, InitializeExistingFilesWithWrongAllCrc) {
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path =
+ absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ Crcs::kFileOffset));
+
+ // Manually corrupt all_crc
+ crcs.all_crc += kCorruptedValueOffset;
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs,
+ sizeof(Crcs)));
+ metadata_sfd.reset();
+
+ {
+ // Attempt to create the persistent hash map with metadata containing
+ // corrupted all_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or = PersistentHashMap::Create(
+ filesystem_, base_dir_, /*value_type_size=*/sizeof(int));
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("Invalid all crc for PersistentHashMap"));
+ }
+}
+
+TEST_F(PersistentHashMapTest,
+ InitializeExistingFilesWithCorruptedInfoShouldFail) {
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path =
+ absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Info info;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info),
+ Info::kFileOffset));
+
+ // Modify info, but don't update the checksum. This would be similar to
+ // corruption of info.
+ info.num_deleted_entries += kCorruptedValueOffset;
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Info::kFileOffset, &info,
+ sizeof(Info)));
+ {
+ // Attempt to create the persistent hash map with info that doesn't match
+ // its checksum and confirm that it fails.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or = PersistentHashMap::Create(
+ filesystem_, base_dir_, /*value_type_size=*/sizeof(int));
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("Invalid info crc for PersistentHashMap"));
+ }
+}
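+
+// The next three tests each corrupt exactly one component crc: they bump a
+// single component crc and then recompute all_crc over the raw ComponentCrcs
+// bytes so that only the component-level check fails during initialization.
+// The recomputation they share is equivalent to this sketch (illustrative
+// only, not part of the PersistentHashMap API):
+//
+//   uint32_t RecomputeAllCrc(const Crcs::ComponentCrcs& component_crcs) {
+//     return Crc32(std::string_view(
+//                      reinterpret_cast<const char*>(&component_crcs),
+//                      sizeof(Crcs::ComponentCrcs)))
+//         .Get();
+//   }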
+
+TEST_F(PersistentHashMapTest,
+ InitializeExistingFilesWithWrongBucketStorageCrc) {
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path =
+ absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ Crcs::kFileOffset));
+
+ // Manually corrupt bucket_storage_crc
+ crcs.component_crcs.bucket_storage_crc += kCorruptedValueOffset;
+ crcs.all_crc = Crc32(std::string_view(
+ reinterpret_cast<const char*>(&crcs.component_crcs),
+ sizeof(Crcs::ComponentCrcs)))
+ .Get();
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs,
+ sizeof(Crcs)));
+ {
+ // Attempt to create the persistent hash map with metadata containing
+ // corrupted bucket_storage_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or = PersistentHashMap::Create(
+ filesystem_, base_dir_, /*value_type_size=*/sizeof(int));
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ persistent_hash_map_or.status().error_message(),
+ HasSubstr("Mismatch crc with PersistentHashMap bucket storage"));
+ }
+}
+
+TEST_F(PersistentHashMapTest, InitializeExistingFilesWithWrongEntryStorageCrc) {
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path =
+ absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ Crcs::kFileOffset));
+
+ // Manually corrupt entry_storage_crc
+ crcs.component_crcs.entry_storage_crc += kCorruptedValueOffset;
+ crcs.all_crc = Crc32(std::string_view(
+ reinterpret_cast<const char*>(&crcs.component_crcs),
+ sizeof(Crcs::ComponentCrcs)))
+ .Get();
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs,
+ sizeof(Crcs)));
+ {
+ // Attempt to create the persistent hash map with metadata containing
+ // corrupted entry_storage_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or = PersistentHashMap::Create(
+ filesystem_, base_dir_, /*value_type_size=*/sizeof(int));
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("Mismatch crc with PersistentHashMap entry storage"));
+ }
+}
+
+TEST_F(PersistentHashMapTest,
+ InitializeExistingFilesWithWrongKeyValueStorageCrc) {
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path =
+ absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ Crcs::kFileOffset));
+
+ // Manually corrupt kv_storage_crc
+ crcs.component_crcs.kv_storage_crc += kCorruptedValueOffset;
+ crcs.all_crc = Crc32(std::string_view(
+ reinterpret_cast<const char*>(&crcs.component_crcs),
+ sizeof(Crcs::ComponentCrcs)))
+ .Get();
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs,
+ sizeof(Crcs)));
+ {
+ // Attempt to create the persistent hash map with metadata containing
+ // corrupted kv_storage_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or = PersistentHashMap::Create(
+ filesystem_, base_dir_, /*value_type_size=*/sizeof(int));
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ persistent_hash_map_or.status().error_message(),
+ HasSubstr("Mismatch crc with PersistentHashMap key value storage"));
+ }
+}
+
+TEST_F(PersistentHashMapTest,
+ InitializeExistingFilesAllowDifferentMaxLoadFactorPercent) {
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+ ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data()));
+
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ int32_t new_max_load_factor_percent = 100;
+ {
+ ASSERT_THAT(new_max_load_factor_percent,
+ Not(Eq(PersistentHashMap::kDefaultMaxLoadFactorPercent)));
+    // Attempt to create the persistent hash map with a different max load
+    // factor percent. This should succeed and the metadata should be updated
+    // accordingly. Also verify that all entries remain unchanged.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ new_max_load_factor_percent));
+
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path =
+ absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Info info;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info),
+ Info::kFileOffset));
+ EXPECT_THAT(info.max_load_factor_percent, Eq(new_max_load_factor_percent));
+
+  // The crcs should also be updated correctly. We test this by creating an
+  // instance again and making sure it doesn't report corrupted crcs/info
+  // errors.
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ new_max_load_factor_percent));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+}
+
+TEST_F(PersistentHashMapTest, PutAndGet) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+
+ EXPECT_THAT(persistent_hash_map, Pointee(IsEmpty()));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-youtube.com"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_EXPECT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(100).data()));
+ ICING_EXPECT_OK(
+ persistent_hash_map->Put("default-youtube.com", Serialize(50).data()));
+
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(100));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-youtube.com"),
+ IsOkAndHolds(50));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "key-not-exist"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+}
+
+TEST_F(PersistentHashMapTest, PutShouldOverwriteValueIfKeyExists) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(100).data()));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(100));
+
+ ICING_EXPECT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(200).data()));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(200));
+
+ ICING_EXPECT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(300).data()));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(300));
+}
+
+TEST_F(PersistentHashMapTest, GetOrPutShouldPutIfKeyDoesNotExist) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ int val = 1;
+ EXPECT_THAT(persistent_hash_map->GetOrPut("default-google.com", &val),
+ IsOk());
+ EXPECT_THAT(val, Eq(1));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(1));
+}
+
+TEST_F(PersistentHashMapTest, GetOrPutShouldGetIfKeyExists) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+
+ ASSERT_THAT(
+ persistent_hash_map->Put("default-google.com", Serialize(1).data()),
+ IsOk());
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(1));
+
+ int val = 2;
+ EXPECT_THAT(persistent_hash_map->GetOrPut("default-google.com", &val),
+ IsOk());
+ EXPECT_THAT(val, Eq(1));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(1));
+}
+
+TEST_F(PersistentHashMapTest, ShouldFailIfKeyContainsTerminationCharacter) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+
+ const char invalid_key[] = "a\0bc";
+ std::string_view invalid_key_view(invalid_key, 4);
+
+ int val = 1;
+ EXPECT_THAT(persistent_hash_map->Put(invalid_key_view, &val),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(persistent_hash_map->GetOrPut(invalid_key_view, &val),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(persistent_hash_map->Get(invalid_key_view, &val),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing