author    Tim Barron <tjbarron@google.com> 2022-08-11 17:05:22 -0700
committer Tim Barron <tjbarron@google.com> 2022-08-11 17:05:22 -0700
commit    87267cbc5531600072a283ba0c9500c3fcac87af (patch)
tree      2ec5c19afb1f4d5ee229d0619c25b2f2819ccea0
parent    7c93c404e1fb4ed5e35326245ebc820ed774c6b2 (diff)
download  icing-87267cbc5531600072a283ba0c9500c3fcac87af.tar.gz
Sync from upstream.
Descriptions:
======================================================================
Implement new version of ResultState and ResultStateManager to 1) enforce a page byte size limit and 2) improve handling of pagination when we encounter deleted documents.
======================================================================
Fix bugs in IcingDynamicTrie::Delete.
======================================================================
Implement IcingDynamicTrie::IsBranchingTerm.
======================================================================
Change Icing default logging level to INFO
======================================================================
Refactor KeyMapper class to be an interface.
======================================================================
Improve NamespaceChecker logic to improve Suggest latency.
======================================================================
Change icing native log tag to "AppSearchIcing"
======================================================================
Implement Index Compaction rather than rebuilding index during Compaction.
======================================================================
Implement reverse iterator for IcingDynamicTrie
======================================================================
Avoid adding unnecessary branch points during index compaction
======================================================================
Invalidate expired result states when adding to/retrieving from ResultStateManager.
======================================================================
Add new methods (MutableView, MutableArrayView, Append, Allocate) to FileBackedVector
======================================================================
Create and implement PersistentHashMap class.
======================================================================
Implement RFC822 Tokenizer
======================================================================
Remove uses of StringPrintf in ICING_LOG statements
======================================================================
Properly set query latency when an error is encountered or results are empty.
======================================================================

Bug: 146903474
Bug: 152934343
Bug: 193919210
Bug: 193453081
Bug: 231368517
Bug: 235395538
Bug: 236412165

Change-Id: I8aa278cebb12b25b39deb0ef584c0f198952659d
-rw-r--r--  icing/file/file-backed-bitmap.cc  3
-rw-r--r--  icing/file/file-backed-vector.h  496
-rw-r--r--  icing/file/file-backed-vector_test.cc  579
-rw-r--r--  icing/file/filesystem.cc  97
-rw-r--r--  icing/file/persistent-hash-map.cc  534
-rw-r--r--  icing/file/persistent-hash-map.h  383
-rw-r--r--  icing/file/persistent-hash-map_test.cc  662
-rw-r--r--  icing/icing-search-engine.cc  317
-rw-r--r--  icing/icing-search-engine.h  8
-rw-r--r--  icing/icing-search-engine_test.cc  132
-rw-r--r--  icing/index/hit/hit.cc  5
-rw-r--r--  icing/index/hit/hit.h  3
-rw-r--r--  icing/index/index.cc  8
-rw-r--r--  icing/index/index.h  9
-rw-r--r--  icing/index/index_test.cc  301
-rw-r--r--  icing/index/lite/lite-index-header.h (renamed from icing/legacy/index/icing-lite-index-header.h)  14
-rw-r--r--  icing/index/lite/lite-index-options.cc (renamed from icing/legacy/index/icing-lite-index-options.cc)  20
-rw-r--r--  icing/index/lite/lite-index-options.h (renamed from icing/legacy/index/icing-lite-index-options.h)  11
-rw-r--r--  icing/index/lite/lite-index.cc  155
-rw-r--r--  icing/index/lite/lite-index.h  22
-rw-r--r--  icing/index/main/flash-index-storage.cc  35
-rw-r--r--  icing/index/main/main-index.cc  173
-rw-r--r--  icing/index/main/main-index.h  37
-rw-r--r--  icing/jni/icing-search-engine-jni.cc  5
-rw-r--r--  icing/jni/jni-cache.cc  3
-rw-r--r--  icing/jni/scoped-primitive-array-critical_test.cc  140
-rw-r--r--  icing/jni/scoped-utf-chars.h  1
-rw-r--r--  icing/jni/scoped-utf-chars_test.cc  126
-rw-r--r--  icing/legacy/index/icing-array-storage.cc  30
-rw-r--r--  icing/legacy/index/icing-common-types.h  129
-rw-r--r--  icing/legacy/index/icing-dynamic-trie.cc  130
-rw-r--r--  icing/legacy/index/icing-dynamic-trie.h  19
-rw-r--r--  icing/legacy/index/icing-dynamic-trie_test.cc  350
-rw-r--r--  icing/legacy/index/icing-filesystem.cc  87
-rw-r--r--  icing/legacy/index/icing-flash-bitmap.cc  25
-rw-r--r--  icing/legacy/index/icing-mmapper.cc  6
-rw-r--r--  icing/legacy/index/icing-storage-file.cc  12
-rw-r--r--  icing/query/query-processor_test.cc  2
-rw-r--r--  icing/result/result-retriever-v2.cc  11
-rw-r--r--  icing/result/result-retriever-v2_group-result-limiter_test.cc (renamed from icing/result/result-retriever-v2_group-result-limiter-test.cc)  136
-rw-r--r--  icing/result/result-retriever-v2_projection_test.cc (renamed from icing/result/result-retriever-v2_projection-test.cc)  0
-rw-r--r--  icing/result/result-retriever-v2_snippet_test.cc (renamed from icing/result/result-retriever-v2_snippet-test.cc)  0
-rw-r--r--  icing/result/result-retriever-v2_test.cc  174
-rw-r--r--  icing/result/result-state-manager.cc  176
-rw-r--r--  icing/result/result-state-manager.h  71
-rw-r--r--  icing/result/result-state-manager_test.cc  1927
-rw-r--r--  icing/result/result-state-v2.cc  2
-rw-r--r--  icing/result/result-state-v2.h  17
-rw-r--r--  icing/result/result-state-v2_test.cc  43
-rw-r--r--  icing/result/snippet-retriever.cc  28
-rw-r--r--  icing/scoring/bm25f-calculator.cc  27
-rw-r--r--  icing/scoring/priority-queue-scored-document-hits-ranker.cc  6
-rw-r--r--  icing/scoring/priority-queue-scored-document-hits-ranker.h  2
-rw-r--r--  icing/store/document-store.cc  26
-rw-r--r--  icing/store/document-store.h  4
-rw-r--r--  icing/store/document-store_test.cc  42
-rw-r--r--  icing/tokenization/icu/icu-language-segmenter_test.cc  2
-rw-r--r--  icing/tokenization/language-segmenter-factory.h  3
-rw-r--r--  icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc  2
-rw-r--r--  icing/tokenization/reverse_jni/reverse-jni-break-iterator.h  2
-rw-r--r--  icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc  2
-rw-r--r--  icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h  2
-rw-r--r--  icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc  2
-rw-r--r--  icing/tokenization/rfc822-tokenizer.cc  565
-rw-r--r--  icing/tokenization/rfc822-tokenizer.h (renamed from icing/absl_ports/status_imports.h)  26
-rw-r--r--  icing/tokenization/rfc822-tokenizer_test.cc  797
-rw-r--r--  icing/tokenization/token.h  13
-rw-r--r--  icing/util/clock.h  27
-rw-r--r--  icing/util/crc32.h  2
-rw-r--r--  java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java  2
-rw-r--r--  proto/icing/proto/optimize.proto  13
-rw-r--r--  proto/icing/proto/search.proto  11
-rw-r--r--  synced_AOSP_CL_number.txt  2
73 files changed, 7362 insertions, 1872 deletions
diff --git a/icing/file/file-backed-bitmap.cc b/icing/file/file-backed-bitmap.cc
index eec7668..a8231e3 100644
--- a/icing/file/file-backed-bitmap.cc
+++ b/icing/file/file-backed-bitmap.cc
@@ -269,8 +269,7 @@ libtextclassifier3::Status FileBackedBitmap::GrowTo(int new_num_bits) {
return status;
}
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Grew file %s to new size %zd", file_path_.c_str(), new_file_size);
+ ICING_VLOG(1) << "Grew file " << file_path_ << " to new size " << new_file_size;
mutable_header()->state = Header::ChecksumState::kStale;
return libtextclassifier3::Status::OK;
}
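
This hunk is one instance of the "Remove uses of StringPrintf in ICING_LOG statements" change listed in the description above. Below is a minimal sketch of the pattern; the names num_docs and dir_path are hypothetical, not from this CL, and it assumes ICING_LOG/ICING_VLOG expose the ostream-like interface used in the hunk above.

// Hypothetical example of the conversion applied throughout this CL.
// Before: printf-style formatting; the specifiers must match the arguments.
ICING_LOG(INFO) << IcingStringUtil::StringPrintf(
    "Indexed %d documents under %s", num_docs, dir_path.c_str());

// After: stream the values directly; there are no format specifiers to keep
// in sync, and a std::string streams without the c_str() conversion.
ICING_LOG(INFO) << "Indexed " << num_docs << " documents under " << dir_path;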
diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h
index 7e42e32..bcfbbdd 100644
--- a/icing/file/file-backed-vector.h
+++ b/icing/file/file-backed-vector.h
@@ -58,8 +58,12 @@
#include <sys/mman.h>
+#include <algorithm>
#include <cinttypes>
#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <limits>
#include <memory>
#include <string>
#include <utility>
@@ -83,6 +87,9 @@ namespace lib {
template <typename T>
class FileBackedVector {
public:
+ class MutableArrayView;
+ class MutableView;
+
// Header stored at the beginning of the file before the rest of the vector
// elements. Stores metadata on the vector.
struct Header {
@@ -133,15 +140,24 @@ class FileBackedVector {
kHeaderChecksumOffset,
"");
- Crc32 crc;
- std::string_view header_str(
- reinterpret_cast<const char*>(this),
- offsetof(FileBackedVector::Header, header_checksum));
- crc.Append(header_str);
- return crc.Get();
+ return Crc32(std::string_view(
+ reinterpret_cast<const char*>(this),
+ offsetof(FileBackedVector::Header, header_checksum)))
+ .Get();
}
};
+ // Absolute max file size for FileBackedVector. Note that Android has a
+ // (2^31-1)-byte single file size limit, so kMaxFileSize is 2^31-1.
+ static constexpr int32_t kMaxFileSize =
+      std::numeric_limits<int32_t>::max();  // 2^31-1 bytes, ~2.1 GB
+
+  // Size of element type T. The value is the same as sizeof(T), but we should
+  // avoid using sizeof(T) directly in our codebase to prevent unexpected
+  // unsigned integer casting.
+ static constexpr int32_t kElementTypeSize = static_cast<int32_t>(sizeof(T));
+ static_assert(sizeof(T) <= (1 << 10));
+
// Creates a new FileBackedVector to read/write content to.
//
// filesystem: Object to make system level calls
@@ -149,15 +165,20 @@ class FileBackedVector {
// within a directory that already exists.
// mmap_strategy : Strategy/optimizations to access the content in the vector,
// see MemoryMappedFile::Strategy for more details
+ // max_file_size: Maximum file size for FileBackedVector, default
+ // kMaxFileSize. See max_file_size_ and kMaxFileSize for more
+ // details.
//
// Return:
// FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
// checksum.
// INTERNAL_ERROR on I/O errors.
+ // INVALID_ARGUMENT_ERROR if max_file_size is incorrect.
// UNIMPLEMENTED_ERROR if created with strategy READ_WRITE_MANUAL_SYNC.
static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
Create(const Filesystem& filesystem, const std::string& file_path,
- MemoryMappedFile::Strategy mmap_strategy);
+ MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size = kMaxFileSize);
// Deletes the FileBackedVector
//
@@ -184,13 +205,13 @@ class FileBackedVector {
// referencing the now-invalidated region.
//
// Returns:
- // OUT_OF_RANGE_ERROR if idx < 0 or > num_elements()
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements()
libtextclassifier3::StatusOr<T> GetCopy(int32_t idx) const;
- // Gets a pointer to the element at idx.
+ // Gets an immutable pointer to the element at idx.
//
- // WARNING: Subsequent calls to Set may invalidate the pointer returned by
- // Get.
+ // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the pointer
+ // returned by Get.
//
// This is useful if you do not think the FileBackedVector will grow before
// you need to reference this value, and you want to avoid a copy. When the
@@ -198,27 +219,102 @@ class FileBackedVector {
// which will invalidate this pointer to the previously mapped region.
//
// Returns:
- // OUT_OF_RANGE_ERROR if idx < 0 or > num_elements()
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements()
libtextclassifier3::StatusOr<const T*> Get(int32_t idx) const;
+ // Gets a MutableView to the element at idx.
+ //
+ // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the
+ // reference returned by MutableView::Get().
+ //
+ // This is useful if you do not think the FileBackedVector will grow before
+ // you need to reference this value, and you want to mutate the underlying
+ // data directly. When the FileBackedVector grows, the underlying mmap will be
+ // unmapped and remapped, which will invalidate this MutableView to the
+ // previously mapped region.
+ //
+ // Returns:
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements()
+ libtextclassifier3::StatusOr<MutableView> GetMutable(int32_t idx);
+
+ // Gets a MutableArrayView to the elements at range [idx, idx + len).
+ //
+ // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the
+ // reference/pointer returned by MutableArrayView::operator[]/data().
+ //
+ // This is useful if you do not think the FileBackedVector will grow before
+ // you need to reference this value, and you want to mutate the underlying
+ // data directly. When the FileBackedVector grows, the underlying mmap will be
+ // unmapped and remapped, which will invalidate this MutableArrayView to the
+ // previously mapped region.
+ //
+ // Returns:
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx + len > num_elements()
+ libtextclassifier3::StatusOr<MutableArrayView> GetMutable(int32_t idx,
+ int32_t len);
+
// Writes the value at idx.
//
// May grow the underlying file and mmapped region as needed to fit the new
- // value. If it does grow, then any pointers to previous values returned
- // from Get() may be invalidated.
+ // value. If it does grow, then any pointers/references to previous values
+ // returned from Get/GetMutable/Allocate may be invalidated.
//
// Returns:
- // OUT_OF_RANGE_ERROR if idx < 0 or file cannot be grown idx size
+  //   OUT_OF_RANGE_ERROR if idx < 0 or idx > kMaxIndex or the file cannot be
+  //   grown to fit idx + 1 elements
libtextclassifier3::Status Set(int32_t idx, const T& value);
+ // Appends the value to the end of the vector.
+ //
+ // May grow the underlying file and mmapped region as needed to fit the new
+ // value. If it does grow, then any pointers/references to previous values
+ // returned from Get/GetMutable/Allocate may be invalidated.
+ //
+ // Returns:
+  //   OUT_OF_RANGE_ERROR if the file cannot be grown (i.e. it has reached
+  //   max_file_size_)
+ libtextclassifier3::Status Append(const T& value) {
+ return Set(header_->num_elements, value);
+ }
+
+  // Allocates space for len elements at the end of the vector and returns a
+  // MutableArrayView to the space.
+ //
+ // May grow the underlying file and mmapped region as needed to fit the new
+ // value. If it does grow, then any pointers/references to previous values
+ // returned from Get/GetMutable/Allocate may be invalidated.
+ //
+ // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the
+ // reference/pointer returned by MutableArrayView::operator[]/data().
+ //
+ // This is useful if you do not think the FileBackedVector will grow before
+ // you need to reference this value, and you want to allocate adjacent spaces
+ // for multiple elements and mutate the underlying data directly. When the
+ // FileBackedVector grows, the underlying mmap will be unmapped and remapped,
+ // which will invalidate this MutableArrayView to the previously mapped
+ // region.
+ //
+ // Returns:
+  //   OUT_OF_RANGE_ERROR if len <= 0 or the file cannot be grown (i.e. it has
+  //   reached max_file_size_)
+ libtextclassifier3::StatusOr<MutableArrayView> Allocate(int32_t len);
+
// Resizes to first len elements. The crc is cleared on truncation and will be
// updated on destruction, or once the client calls ComputeChecksum() or
// PersistToDisk().
//
// Returns:
- // OUT_OF_RANGE_ERROR if len < 0 or >= num_elements()
+ // OUT_OF_RANGE_ERROR if len < 0 or len >= num_elements()
libtextclassifier3::Status TruncateTo(int32_t new_num_elements);
+  // Marks idx as changed iff idx < changes_end_, so that a later
+  // ComputeChecksum() can update the checksum from the cached changes without
+  // rescanning [0, changes_end_).
+  //
+  // If the change buffer grows past the limit determined by
+  // kPartialCrcLimitDiv, then all change buffers are cleared and changes_end_
+  // is set to 0, indicating that the checksum should be recomputed from idx 0
+  // (starting from the beginning). Otherwise the change is cached.
+ void SetDirty(int32_t idx);
+
// Flushes content to underlying file.
//
// Returns:
@@ -248,10 +344,6 @@ class FileBackedVector {
return reinterpret_cast<const T*>(mmapped_file_->region());
}
- T* mutable_array() const {
- return reinterpret_cast<T*>(mmapped_file_->mutable_region());
- }
-
int32_t num_elements() const { return header_->num_elements; }
// Updates checksum of the vector contents and returns it.
@@ -260,6 +352,66 @@ class FileBackedVector {
// INTERNAL_ERROR if the vector's internal state is inconsistent
libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+ public:
+ class MutableArrayView {
+ public:
+ const T& operator[](int32_t idx) const { return data_[idx]; }
+ T& operator[](int32_t idx) {
+ SetDirty(idx);
+ return data_[idx];
+ }
+
+ const T* data() const { return data_; }
+
+ int32_t size() const { return len_; }
+
+    // Sets the mutable array slice (starting at idx) from the given element
+    // array. It handles SetDirty properly for the file-backed vector when
+    // modifying elements.
+ //
+ // REQUIRES: arr is valid && arr_len >= 0 && idx + arr_len <= size(),
+ // otherwise the behavior is undefined.
+ void SetArray(int32_t idx, const T* arr, int32_t arr_len) {
+ for (int32_t i = 0; i < arr_len; ++i) {
+ SetDirty(idx + i);
+ data_[idx + i] = arr[i];
+ }
+ }
+
+ private:
+ MutableArrayView(FileBackedVector<T>* vector, T* data, int32_t len)
+ : vector_(vector),
+ data_(data),
+ original_idx_(data - vector->array()),
+ len_(len) {}
+
+ void SetDirty(int32_t idx) { vector_->SetDirty(original_idx_ + idx); }
+
+ // Does not own. For SetDirty only.
+ FileBackedVector<T>* vector_;
+
+ // data_ points at vector_->mutable_array()[original_idx_]
+ T* data_;
+ int32_t original_idx_;
+ int32_t len_;
+
+ friend class FileBackedVector;
+ };
+
+ class MutableView {
+ public:
+ const T& Get() const { return mutable_array_view_[0]; }
+ T& Get() { return mutable_array_view_[0]; }
+
+ private:
+ MutableView(FileBackedVector<T>* vector, T* data)
+ : mutable_array_view_(vector, data, 1) {}
+
+ MutableArrayView mutable_array_view_;
+
+ friend class FileBackedVector;
+ };
+
private:
// We track partial updates to the array for crc updating. This
// requires extra memory to keep track of original buffers but
@@ -271,24 +423,33 @@ class FileBackedVector {
// Grow file by at least this many elements if array is growable.
static constexpr int64_t kGrowElements = 1u << 14; // 16K
- // Max number of elements that can be held by the vector.
- static constexpr int64_t kMaxNumElements = 1u << 20; // 1M
+  // Absolute max # of elements allowed. Since we are using int32_t to store
+  // num_elements, the max value is 2^31-1. Still, the actual max # of elements
+  // is determined by max_file_size, kElementTypeSize, and Header::kHeaderSize.
+ static constexpr int32_t kMaxNumElements =
+ std::numeric_limits<int32_t>::max();
+
+ // Absolute max index allowed.
+ static constexpr int32_t kMaxIndex = kMaxNumElements - 1;
// Can only be created through the factory ::Create function
FileBackedVector(const Filesystem& filesystem, const std::string& file_path,
std::unique_ptr<Header> header,
- std::unique_ptr<MemoryMappedFile> mmapped_file);
+ std::unique_ptr<MemoryMappedFile> mmapped_file,
+ int32_t max_file_size);
// Initialize a new FileBackedVector, and create the file.
static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
InitializeNewFile(const Filesystem& filesystem, const std::string& file_path,
- ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy);
+ ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size);
// Initialize a FileBackedVector from an existing file.
static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
InitializeExistingFile(const Filesystem& filesystem,
const std::string& file_path, ScopedFd fd,
- MemoryMappedFile::Strategy mmap_strategy);
+ MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size);
// Grows the underlying file to hold at least num_elements
//
@@ -296,6 +457,10 @@ class FileBackedVector {
// OUT_OF_RANGE_ERROR if we can't grow to the specified size
libtextclassifier3::Status GrowIfNecessary(int32_t num_elements);
+ T* mutable_array() const {
+ return reinterpret_cast<T*>(mmapped_file_->mutable_region());
+ }
+
// Cached constructor params.
const Filesystem* const filesystem_;
const std::string file_path_;
@@ -314,25 +479,42 @@ class FileBackedVector {
// update. Will be cleared if the size grows too big.
std::string saved_original_buffer_;
- // Keep track of all pages we touched so we can write them back to
- // disk.
- std::vector<bool> dirty_pages_;
+  // Max file size for FileBackedVector, default kMaxFileSize. Note that this
+  // value won't be written into the header, so the maximum file size is always
+  // specified at runtime and the caller should make sure its value is correct
+  // and reasonable. Note that the file size includes the header and elements.
+ //
+ // The range should be in
+ // [Header::kHeaderSize + kElementTypeSize, kMaxFileSize], and
+ // (max_file_size_ - Header::kHeaderSize) / kElementTypeSize is max # of
+ // elements that can be stored.
+ int32_t max_file_size_;
};
template <typename T>
+constexpr int32_t FileBackedVector<T>::kMaxFileSize;
+
+template <typename T>
+constexpr int32_t FileBackedVector<T>::kElementTypeSize;
+
+template <typename T>
constexpr int32_t FileBackedVector<T>::kPartialCrcLimitDiv;
template <typename T>
constexpr int64_t FileBackedVector<T>::kGrowElements;
template <typename T>
-constexpr int64_t FileBackedVector<T>::kMaxNumElements;
+constexpr int32_t FileBackedVector<T>::kMaxNumElements;
+
+template <typename T>
+constexpr int32_t FileBackedVector<T>::kMaxIndex;
template <typename T>
libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
FileBackedVector<T>::Create(const Filesystem& filesystem,
const std::string& file_path,
- MemoryMappedFile::Strategy mmap_strategy) {
+ MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size) {
if (mmap_strategy == MemoryMappedFile::Strategy::READ_WRITE_MANUAL_SYNC) {
// FileBackedVector's behavior of growing the file underneath the mmap is
// inherently broken with MAP_PRIVATE. Growing the vector requires extending
@@ -345,6 +527,14 @@ FileBackedVector<T>::Create(const Filesystem& filesystem,
"mmap strategy.");
}
+ if (max_file_size < Header::kHeaderSize + kElementTypeSize ||
+ max_file_size > kMaxFileSize) {
+ // FileBackedVector should be able to store at least 1 element, so
+ // max_file_size should be at least Header::kHeaderSize + kElementTypeSize.
+ return absl_ports::InvalidArgumentError(
+ "Invalid max file size for FileBackedVector");
+ }
+
ScopedFd fd(filesystem.OpenForWrite(file_path.c_str()));
if (!fd.is_valid()) {
return absl_ports::InternalError(
@@ -357,31 +547,38 @@ FileBackedVector<T>::Create(const Filesystem& filesystem,
absl_ports::StrCat("Bad file size for file ", file_path));
}
+ if (max_file_size < file_size) {
+ return absl_ports::InvalidArgumentError(
+ "Max file size should not be smaller than the existing file size");
+ }
+
const bool new_file = file_size == 0;
if (new_file) {
return InitializeNewFile(filesystem, file_path, std::move(fd),
- mmap_strategy);
+ mmap_strategy, max_file_size);
}
return InitializeExistingFile(filesystem, file_path, std::move(fd),
- mmap_strategy);
+ mmap_strategy, max_file_size);
}
template <typename T>
libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
-FileBackedVector<T>::InitializeNewFile(
- const Filesystem& filesystem, const std::string& file_path, ScopedFd fd,
- MemoryMappedFile::Strategy mmap_strategy) {
+FileBackedVector<T>::InitializeNewFile(const Filesystem& filesystem,
+ const std::string& file_path,
+ ScopedFd fd,
+ MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size) {
// Create header.
auto header = std::make_unique<Header>();
header->magic = FileBackedVector<T>::Header::kMagic;
- header->element_size = sizeof(T);
+ header->element_size = kElementTypeSize;
header->header_checksum = header->CalculateHeaderChecksum();
// We use Write() here, instead of writing through the mmapped region
// created below, so we can gracefully handle errors that occur when the
// disk is full. See b/77309668 for details.
if (!filesystem.PWrite(fd.get(), /*offset=*/0, header.get(),
- sizeof(Header))) {
+ Header::kHeaderSize)) {
return absl_ports::InternalError("Failed to write header");
}
@@ -393,23 +590,30 @@ FileBackedVector<T>::InitializeNewFile(
auto mmapped_file =
std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy);
- return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>(
- filesystem, file_path, std::move(header), std::move(mmapped_file)));
+ return std::unique_ptr<FileBackedVector<T>>(
+ new FileBackedVector<T>(filesystem, file_path, std::move(header),
+ std::move(mmapped_file), max_file_size));
}
template <typename T>
libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
FileBackedVector<T>::InitializeExistingFile(
const Filesystem& filesystem, const std::string& file_path,
- const ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy) {
+ const ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size) {
int64_t file_size = filesystem.GetFileSize(file_path.c_str());
- if (file_size < sizeof(FileBackedVector<T>::Header)) {
+ if (file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Bad file size for file ", file_path));
+ }
+
+ if (file_size < Header::kHeaderSize) {
return absl_ports::InternalError(
absl_ports::StrCat("File header too short for ", file_path));
}
auto header = std::make_unique<Header>();
- if (!filesystem.PRead(fd.get(), header.get(), sizeof(Header),
+ if (!filesystem.PRead(fd.get(), header.get(), Header::kHeaderSize,
/*offset=*/0)) {
return absl_ports::InternalError(
absl_ports::StrCat("Failed to read header of ", file_path));
@@ -429,13 +633,15 @@ FileBackedVector<T>::InitializeExistingFile(
absl_ports::StrCat("Invalid header crc for ", file_path));
}
- if (header->element_size != sizeof(T)) {
+ if (header->element_size != kElementTypeSize) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
- "Inconsistent element size, expected %zd, actual %d", sizeof(T),
+ "Inconsistent element size, expected %d, actual %d", kElementTypeSize,
header->element_size));
}
- int64_t min_file_size = header->num_elements * sizeof(T) + sizeof(Header);
+ int64_t min_file_size =
+ static_cast<int64_t>(header->num_elements) * kElementTypeSize +
+ Header::kHeaderSize;
if (min_file_size > file_size) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
"Inconsistent file size, expected %" PRId64 ", actual %" PRId64,
@@ -446,23 +652,22 @@ FileBackedVector<T>::InitializeExistingFile(
// access elements from the mmapped region
auto mmapped_file =
std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy);
- ICING_RETURN_IF_ERROR(
- mmapped_file->Remap(sizeof(Header), file_size - sizeof(Header)));
+ ICING_RETURN_IF_ERROR(mmapped_file->Remap(Header::kHeaderSize,
+ file_size - Header::kHeaderSize));
// Check vector contents
- Crc32 vector_checksum;
- std::string_view vector_contents(
- reinterpret_cast<const char*>(mmapped_file->region()),
- header->num_elements * sizeof(T));
- vector_checksum.Append(vector_contents);
+ Crc32 vector_checksum(
+ std::string_view(reinterpret_cast<const char*>(mmapped_file->region()),
+ header->num_elements * kElementTypeSize));
if (vector_checksum.Get() != header->vector_checksum) {
return absl_ports::FailedPreconditionError(
absl_ports::StrCat("Invalid vector contents for ", file_path));
}
- return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>(
- filesystem, file_path, std::move(header), std::move(mmapped_file)));
+ return std::unique_ptr<FileBackedVector<T>>(
+ new FileBackedVector<T>(filesystem, file_path, std::move(header),
+ std::move(mmapped_file), max_file_size));
}
template <typename T>
@@ -479,12 +684,13 @@ template <typename T>
FileBackedVector<T>::FileBackedVector(
const Filesystem& filesystem, const std::string& file_path,
std::unique_ptr<Header> header,
- std::unique_ptr<MemoryMappedFile> mmapped_file)
+ std::unique_ptr<MemoryMappedFile> mmapped_file, int32_t max_file_size)
: filesystem_(&filesystem),
file_path_(file_path),
header_(std::move(header)),
mmapped_file_(std::move(mmapped_file)),
- changes_end_(header_->num_elements) {}
+ changes_end_(header_->num_elements),
+ max_file_size_(max_file_size) {}
template <typename T>
FileBackedVector<T>::~FileBackedVector() {
@@ -523,6 +729,40 @@ libtextclassifier3::StatusOr<const T*> FileBackedVector<T>::Get(
}
template <typename T>
+libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableView>
+FileBackedVector<T>::GetMutable(int32_t idx) {
+ if (idx < 0) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
+ }
+
+ if (idx >= header_->num_elements) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Index, %d, was greater than vector size, %d", idx,
+ header_->num_elements));
+ }
+
+ return MutableView(this, &mutable_array()[idx]);
+}
+
+template <typename T>
+libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableArrayView>
+FileBackedVector<T>::GetMutable(int32_t idx, int32_t len) {
+ if (idx < 0) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
+ }
+
+ if (idx > header_->num_elements - len) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Index with len, %d %d, was greater than vector size, %d", idx, len,
+ header_->num_elements));
+ }
+
+ return MutableArrayView(this, &mutable_array()[idx], len);
+}
+
+template <typename T>
libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx,
const T& value) {
if (idx < 0) {
@@ -530,6 +770,11 @@ libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx,
IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
}
+ if (idx > kMaxIndex) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Index, %d, was greater than max index allowed, %d", idx, kMaxIndex));
+ }
+
ICING_RETURN_IF_ERROR(GrowIfNecessary(idx + 1));
if (idx + 1 > header_->num_elements) {
@@ -541,36 +786,39 @@ libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx,
return libtextclassifier3::Status::OK;
}
- // Cache original value to update crcs.
- if (idx < changes_end_) {
- // If we exceed kPartialCrcLimitDiv, clear changes_end_ to
- // revert to full CRC.
- if ((saved_original_buffer_.size() + sizeof(T)) *
- FileBackedVector<T>::kPartialCrcLimitDiv >
- changes_end_ * sizeof(T)) {
- ICING_VLOG(2) << "FileBackedVector change tracking limit exceeded";
- changes_.clear();
- saved_original_buffer_.clear();
- changes_end_ = 0;
- header_->vector_checksum = 0;
- } else {
- int32_t start_byte = idx * sizeof(T);
-
- changes_.push_back(idx);
- saved_original_buffer_.append(
- reinterpret_cast<char*>(const_cast<T*>(array())) + start_byte,
- sizeof(T));
- }
- }
+ SetDirty(idx);
mutable_array()[idx] = value;
return libtextclassifier3::Status::OK;
}
template <typename T>
+libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableArrayView>
+FileBackedVector<T>::Allocate(int32_t len) {
+ if (len <= 0) {
+ return absl_ports::OutOfRangeError("Invalid allocate length");
+ }
+
+ if (len > kMaxNumElements - header_->num_elements) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Cannot allocate %d elements", len));
+ }
+
+  // Although header_->num_elements + len doesn't exceed kMaxNumElements, the
+  // actual max # of elements is determined by max_file_size, kElementTypeSize,
+  // and kHeaderSize. Thus, it is still possible to fail to grow the file.
+ ICING_RETURN_IF_ERROR(GrowIfNecessary(header_->num_elements + len));
+
+ int32_t start_idx = header_->num_elements;
+ header_->num_elements += len;
+
+ return MutableArrayView(this, &mutable_array()[start_idx], len);
+}
+
+template <typename T>
libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
int32_t num_elements) {
- if (sizeof(T) == 0) {
+ if (kElementTypeSize == 0) {
// Growing is a no-op
return libtextclassifier3::Status::OK;
}
@@ -579,10 +827,12 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
return libtextclassifier3::Status::OK;
}
- if (num_elements > FileBackedVector<T>::kMaxNumElements) {
+ if (num_elements >
+ (max_file_size_ - Header::kHeaderSize) / kElementTypeSize) {
return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
- "%d exceeds maximum number of elements allowed, %lld", num_elements,
- static_cast<long long>(FileBackedVector<T>::kMaxNumElements)));
+ "%d elements total size exceed maximum bytes of elements allowed, "
+ "%d bytes",
+ num_elements, max_file_size_ - Header::kHeaderSize));
}
int64_t current_file_size = filesystem_->GetFileSize(file_path_.c_str());
@@ -590,7 +840,8 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
return absl_ports::InternalError("Unable to retrieve file size.");
}
- int64_t least_file_size_needed = sizeof(Header) + num_elements * sizeof(T);
+ int32_t least_file_size_needed =
+ Header::kHeaderSize + num_elements * kElementTypeSize; // Won't overflow
if (least_file_size_needed <= current_file_size) {
// Our underlying file can hold the target num_elements cause we've grown
// before
@@ -598,9 +849,13 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
}
// Otherwise, we need to grow. Grow to kGrowElements boundary.
- least_file_size_needed = math_util::RoundUpTo(
- least_file_size_needed,
- int64_t{FileBackedVector<T>::kGrowElements * sizeof(T)});
+  // Note that we need to use int64_t here, since int32_t might overflow after
+  // rounding up.
+ int64_t round_up_file_size_needed = math_util::RoundUpTo(
+ int64_t{least_file_size_needed},
+ int64_t{FileBackedVector<T>::kGrowElements} * kElementTypeSize);
+ least_file_size_needed =
+ std::min(round_up_file_size_needed, int64_t{max_file_size_});
// We use PWrite here rather than Grow because Grow doesn't actually allocate
// an underlying disk block. This can lead to problems with mmap because mmap
@@ -609,20 +864,22 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
// these blocks, which will ensure that any failure to grow will surface here.
int64_t page_size = getpagesize();
auto buf = std::make_unique<uint8_t[]>(page_size);
- int64_t size_to_write = page_size - (current_file_size % page_size);
+ int64_t size_to_write = std::min(page_size - (current_file_size % page_size),
+ max_file_size_ - current_file_size);
ScopedFd sfd(filesystem_->OpenForWrite(file_path_.c_str()));
- while (current_file_size < least_file_size_needed) {
+ while (size_to_write > 0 && current_file_size < least_file_size_needed) {
if (!filesystem_->PWrite(sfd.get(), current_file_size, buf.get(),
size_to_write)) {
return absl_ports::InternalError(
absl_ports::StrCat("Couldn't grow file ", file_path_));
}
current_file_size += size_to_write;
- size_to_write = page_size - (current_file_size % page_size);
+ size_to_write = std::min(page_size - (current_file_size % page_size),
+ max_file_size_ - current_file_size);
}
ICING_RETURN_IF_ERROR(mmapped_file_->Remap(
- sizeof(Header), least_file_size_needed - sizeof(Header)));
+ Header::kHeaderSize, least_file_size_needed - Header::kHeaderSize));
return libtextclassifier3::Status::OK;
}
@@ -653,6 +910,31 @@ libtextclassifier3::Status FileBackedVector<T>::TruncateTo(
}
template <typename T>
+void FileBackedVector<T>::SetDirty(int32_t idx) {
+ // Cache original value to update crcs.
+ if (idx >= 0 && idx < changes_end_) {
+ // If we exceed kPartialCrcLimitDiv, clear changes_end_ to
+ // revert to full CRC.
+ if ((saved_original_buffer_.size() + kElementTypeSize) *
+ FileBackedVector<T>::kPartialCrcLimitDiv >
+ changes_end_ * kElementTypeSize) {
+ ICING_VLOG(2) << "FileBackedVector change tracking limit exceeded";
+ changes_.clear();
+ saved_original_buffer_.clear();
+ changes_end_ = 0;
+ header_->vector_checksum = 0;
+ } else {
+ int32_t start_byte = idx * kElementTypeSize;
+
+ changes_.push_back(idx);
+ saved_original_buffer_.append(
+ reinterpret_cast<char*>(const_cast<T*>(array())) + start_byte,
+ kElementTypeSize);
+ }
+ }
+}
+
+template <typename T>
libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
// First apply the modified area. Keep a bitmap of already updated
// regions so we don't double-update.
@@ -663,8 +945,7 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
int num_truncated = 0;
int num_overlapped = 0;
int num_duplicate = 0;
- for (size_t i = 0; i < changes_.size(); i++) {
- const int32_t change_offset = changes_[i];
+ for (const int32_t change_offset : changes_) {
if (change_offset > changes_end_) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
"Failed to update crc, change offset %d, changes_end_ %d",
@@ -678,9 +959,10 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
}
// Turn change buffer into change^original.
- const char* buffer_end = &saved_original_buffer_[cur_offset + sizeof(T)];
- const char* cur_array =
- reinterpret_cast<const char*>(array()) + change_offset * sizeof(T);
+ const char* buffer_end =
+ &saved_original_buffer_[cur_offset + kElementTypeSize];
+ const char* cur_array = reinterpret_cast<const char*>(array()) +
+ change_offset * kElementTypeSize;
// Now xor in. SSE acceleration please?
for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end;
cur++, cur_array++) {
@@ -692,9 +974,9 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
bool overlap = false;
uint32_t cur_element = change_offset;
for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end;
- cur_element++, cur += sizeof(T)) {
+ cur_element++, cur += kElementTypeSize) {
if (updated[cur_element]) {
- memset(cur, 0, sizeof(T));
+ memset(cur, 0, kElementTypeSize);
overlap = true;
} else {
updated[cur_element] = true;
@@ -705,10 +987,11 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
// Apply update to crc.
if (new_update) {
// Explicitly create the string_view with length
- std::string_view xored_str(buffer_end - sizeof(T), sizeof(T));
+ std::string_view xored_str(buffer_end - kElementTypeSize,
+ kElementTypeSize);
if (!cur_crc
- .UpdateWithXor(xored_str, changes_end_ * sizeof(T),
- change_offset * sizeof(T))
+ .UpdateWithXor(xored_str, changes_end_ * kElementTypeSize,
+ change_offset * kElementTypeSize)
.ok()) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
"Failed to update crc, change offset %d, change "
@@ -722,7 +1005,7 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
} else {
num_duplicate++;
}
- cur_offset += sizeof(T);
+ cur_offset += kElementTypeSize;
}
if (!changes_.empty()) {
@@ -735,8 +1018,9 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
if (changes_end_ < header_->num_elements) {
// Explicitly create the string_view with length
std::string_view update_str(
- reinterpret_cast<const char*>(array()) + changes_end_ * sizeof(T),
- (header_->num_elements - changes_end_) * sizeof(T));
+ reinterpret_cast<const char*>(array()) +
+ changes_end_ * kElementTypeSize,
+ (header_->num_elements - changes_end_) * kElementTypeSize);
cur_crc.Append(update_str);
ICING_VLOG(2) << IcingStringUtil::StringPrintf(
"Array update tail crc offset %d -> %d", changes_end_,
@@ -761,7 +1045,7 @@ libtextclassifier3::Status FileBackedVector<T>::PersistToDisk() {
header_->header_checksum = header_->CalculateHeaderChecksum();
if (!filesystem_->PWrite(file_path_.c_str(), /*offset=*/0, header_.get(),
- sizeof(Header))) {
+ Header::kHeaderSize)) {
return absl_ports::InternalError("Failed to sync header");
}
@@ -795,7 +1079,11 @@ libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetElementsFileSize()
return absl_ports::InternalError(
"Failed to get file size of elements in the file-backed vector");
}
- return total_file_size - sizeof(Header);
+ if (total_file_size < Header::kHeaderSize) {
+ return absl_ports::InternalError(
+ "File size should not be smaller than header size");
+ }
+ return total_file_size - Header::kHeaderSize;
}
} // namespace lib
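
The mutation APIs added to the header above compose as follows. This is a minimal, illustrative sketch rather than code from this CL: the file path and size cap are hypothetical, and it assumes the ICING_ASSIGN_OR_RETURN/ICING_RETURN_IF_ERROR status macros used elsewhere in the codebase.

#include <cstdint>
#include <memory>

#include "icing/file/file-backed-vector.h"
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"

namespace icing {
namespace lib {

libtextclassifier3::Status ExampleUsage(const Filesystem& filesystem) {
  // Cap the backing file at the header plus 1024 elements.
  int32_t max_file_size =
      FileBackedVector<int32_t>::Header::kHeaderSize +
      1024 * FileBackedVector<int32_t>::kElementTypeSize;
  ICING_ASSIGN_OR_RETURN(
      std::unique_ptr<FileBackedVector<int32_t>> vector,
      FileBackedVector<int32_t>::Create(
          filesystem, "/tmp/example-vector",
          MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size));

  // Append grows the vector by one element at the end.
  ICING_RETURN_IF_ERROR(vector->Append(7));

  // Allocate reserves a contiguous range at the end and returns a
  // MutableArrayView; SetArray marks the touched indices dirty so that
  // ComputeChecksum() can update the checksum incrementally.
  ICING_ASSIGN_OR_RETURN(FileBackedVector<int32_t>::MutableArrayView arr,
                         vector->Allocate(3));
  int32_t values[] = {1, 2, 3};
  arr.SetArray(/*idx=*/0, values, /*arr_len=*/3);

  // GetMutable returns a MutableView; its non-const Get() sets the element
  // dirty before handing out a mutable reference.
  ICING_ASSIGN_OR_RETURN(FileBackedVector<int32_t>::MutableView elt,
                         vector->GetMutable(0));
  elt.Get() = 42;

  // Flush the contents and the updated checksums to disk.
  return vector->PersistToDisk();
}

}  // namespace lib
}  // namespace icing

Since Set/Append/Allocate may remap the underlying mmap when the file grows, MutableView/MutableArrayView handles obtained before such a call must not be dereferenced afterwards; re-fetch them via GetMutable instead.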
diff --git a/icing/file/file-backed-vector_test.cc b/icing/file/file-backed-vector_test.cc
index 2f60c6b..60ed887 100644
--- a/icing/file/file-backed-vector_test.cc
+++ b/icing/file/file-backed-vector_test.cc
@@ -19,7 +19,9 @@
#include <algorithm>
#include <cerrno>
#include <cstdint>
+#include <limits>
#include <memory>
+#include <string>
#include <string_view>
#include <vector>
@@ -34,10 +36,14 @@
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
+using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::IsTrue;
+using ::testing::Lt;
+using ::testing::Not;
using ::testing::Pointee;
using ::testing::Return;
+using ::testing::SizeIs;
namespace icing {
namespace lib {
@@ -60,20 +66,30 @@ class FileBackedVectorTest : public testing::Test {
// Helper method to loop over some data and insert into the vector at some idx
template <typename T>
- void Insert(FileBackedVector<T>* vector, int32_t idx, std::string data) {
- for (int i = 0; i < data.length(); ++i) {
+ void Insert(FileBackedVector<T>* vector, int32_t idx,
+ const std::vector<T>& data) {
+ for (int i = 0; i < data.size(); ++i) {
ICING_ASSERT_OK(vector->Set(idx + i, data.at(i)));
}
}
+ void Insert(FileBackedVector<char>* vector, int32_t idx, std::string data) {
+ Insert(vector, idx, std::vector<char>(data.begin(), data.end()));
+ }
+
// Helper method to retrieve data from the beginning of the vector
template <typename T>
- std::string_view Get(FileBackedVector<T>* vector, int32_t expected_len) {
+ std::vector<T> Get(FileBackedVector<T>* vector, int32_t idx,
+ int32_t expected_len) {
+ return std::vector<T>(vector->array() + idx,
+ vector->array() + idx + expected_len);
+ }
+
+ std::string_view Get(FileBackedVector<char>* vector, int32_t expected_len) {
return Get(vector, 0, expected_len);
}
- template <typename T>
- std::string_view Get(FileBackedVector<T>* vector, int32_t idx,
+ std::string_view Get(FileBackedVector<char>* vector, int32_t idx,
int32_t expected_len) {
return std::string_view(vector->array() + idx, expected_len);
}
@@ -103,6 +119,79 @@ TEST_F(FileBackedVectorTest, Create) {
}
}
+TEST_F(FileBackedVectorTest, CreateWithInvalidStrategy) {
+ // Create a vector with unimplemented strategy
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_MANUAL_SYNC),
+ StatusIs(libtextclassifier3::StatusCode::UNIMPLEMENTED));
+}
+
+TEST_F(FileBackedVectorTest, CreateWithCustomMaxFileSize) {
+ int32_t header_size = FileBackedVector<char>::Header::kHeaderSize;
+
+ // Create a vector with invalid max_file_size
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/header_size - 1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/header_size + sizeof(char) - 1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ {
+ // Create a vector with max_file_size that allows only 1 element.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto vector, FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/header_size + sizeof(char) * 1));
+ ICING_ASSERT_OK(vector->Set(0, 'a'));
+ }
+
+ {
+ // We can create it again with larger max_file_size, as long as it is not
+ // greater than kMaxFileSize.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto vector, FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/header_size + sizeof(char) * 2));
+ EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(Eq('a'))));
+ ICING_ASSERT_OK(vector->Set(1, 'b'));
+ }
+
+ // We cannot create it again with max_file_size < current_file_size, even if
+ // it is a valid value.
+ int64_t current_file_size = filesystem_.GetFileSize(file_path_.c_str());
+ ASSERT_THAT(current_file_size, Eq(header_size + sizeof(char) * 2));
+ ASSERT_THAT(current_file_size - 1, Not(Lt(header_size + sizeof(char))));
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/current_file_size - 1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ {
+ // We can create it again with max_file_size == current_file_size.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto vector, FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/current_file_size));
+ EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(Eq('a'))));
+ EXPECT_THAT(vector->Get(1), IsOkAndHolds(Pointee(Eq('b'))));
+ }
+}
+
TEST_F(FileBackedVectorTest, SimpleShared) {
// Create a vector and add some data.
ICING_ASSERT_OK_AND_ASSIGN(
@@ -195,6 +284,373 @@ TEST_F(FileBackedVectorTest, Get) {
StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
}
+TEST_F(FileBackedVectorTest, MutableView) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::string(1000, 'a'));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2620640643U)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FileBackedVector<char>::MutableView mutable_elt,
+ vector->GetMutable(3));
+
+ mutable_elt.Get() = 'b';
+ EXPECT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('b'))));
+
+ mutable_elt.Get() = 'c';
+ EXPECT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('c'))));
+}
+
+TEST_F(FileBackedVectorTest, MutableViewShouldSetDirty) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::string(1000, 'a'));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2620640643U)));
+
+ std::string_view reconstructed_view =
+ std::string_view(vector->array(), vector->num_elements());
+
+ ICING_ASSERT_OK_AND_ASSIGN(FileBackedVector<char>::MutableView mutable_elt,
+ vector->GetMutable(3));
+
+  // Mutate the element via MutableView.
+  // If non-const Get() is called, MutableView should set the element index
+  // dirty so that ComputeChecksum() can pick up the change and compute the
+  // checksum correctly. Validate by mapping another array on top.
+ mutable_elt.Get() = 'b';
+ ASSERT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('b'))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc1, vector->ComputeChecksum());
+ Crc32 full_crc1;
+ full_crc1.Append(reconstructed_view);
+ EXPECT_THAT(crc1, Eq(full_crc1));
+
+ // Mutate and test again.
+ mutable_elt.Get() = 'c';
+ ASSERT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('c'))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc2, vector->ComputeChecksum());
+ Crc32 full_crc2;
+ full_crc2.Append(reconstructed_view);
+ EXPECT_THAT(crc2, Eq(full_crc2));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayView) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ constexpr int kArrayViewOffset = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, /*len=*/3));
+ EXPECT_THAT(mutable_arr, SizeIs(3));
+
+ mutable_arr[0] = 2;
+ mutable_arr[1] = 3;
+ mutable_arr[2] = 4;
+
+ EXPECT_THAT(vector->Get(kArrayViewOffset + 0), IsOkAndHolds(Pointee(Eq(2))));
+ EXPECT_THAT(mutable_arr.data()[0], Eq(2));
+
+ EXPECT_THAT(vector->Get(kArrayViewOffset + 1), IsOkAndHolds(Pointee(Eq(3))));
+ EXPECT_THAT(mutable_arr.data()[1], Eq(3));
+
+ EXPECT_THAT(vector->Get(kArrayViewOffset + 2), IsOkAndHolds(Pointee(Eq(4))));
+ EXPECT_THAT(mutable_arr.data()[2], Eq(4));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayViewSetArray) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ constexpr int kArrayViewOffset = 3;
+ constexpr int kArrayViewLen = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, kArrayViewLen));
+
+ std::vector<int> change1{2, 3, 4};
+ mutable_arr.SetArray(/*idx=*/0, change1.data(), change1.size());
+ EXPECT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen),
+ ElementsAre(2, 3, 4, 1, 1));
+
+ std::vector<int> change2{5, 6};
+ mutable_arr.SetArray(/*idx=*/2, change2.data(), change2.size());
+ EXPECT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen),
+ ElementsAre(2, 3, 5, 6, 1));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayViewSetArrayWithZeroLength) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ constexpr int kArrayViewOffset = 3;
+ constexpr int kArrayViewLen = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, kArrayViewLen));
+
+ // Zero arr_len should work and change nothing
+ std::vector<int> change{2, 3};
+ mutable_arr.SetArray(/*idx=*/0, change.data(), /*arr_len=*/0);
+ EXPECT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen),
+ ElementsAre(1, 1, 1, 1, 1));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayViewIndexOperatorShouldSetDirty) {
+ // Create an array with some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ std::string_view reconstructed_view(
+ reinterpret_cast<const char*>(vector->array()),
+ vector->num_elements() * sizeof(int));
+
+ constexpr int kArrayViewOffset = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, /*len=*/3));
+
+  // Use operator[] to mutate elements.
+  // If the non-const operator[] is called, MutableArrayView should set the
+  // element index dirty so that ComputeChecksum() can pick up the change and
+  // compute the checksum correctly. Validate by mapping another array on top.
+ mutable_arr[0] = 2;
+ ASSERT_THAT(vector->Get(kArrayViewOffset + 0), IsOkAndHolds(Pointee(Eq(2))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc1, vector->ComputeChecksum());
+ EXPECT_THAT(crc1, Eq(Crc32(reconstructed_view)));
+
+ mutable_arr[1] = 3;
+ ASSERT_THAT(vector->Get(kArrayViewOffset + 1), IsOkAndHolds(Pointee(Eq(3))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc2, vector->ComputeChecksum());
+ EXPECT_THAT(crc2, Eq(Crc32(reconstructed_view)));
+
+ mutable_arr[2] = 4;
+ ASSERT_THAT(vector->Get(kArrayViewOffset + 2), IsOkAndHolds(Pointee(Eq(4))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc3, vector->ComputeChecksum());
+ EXPECT_THAT(crc3, Eq(Crc32(reconstructed_view)));
+
+ // Change the same position. It should set dirty again.
+ mutable_arr[0] = 5;
+ ASSERT_THAT(vector->Get(kArrayViewOffset + 0), IsOkAndHolds(Pointee(Eq(5))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc4, vector->ComputeChecksum());
+ EXPECT_THAT(crc4, Eq(Crc32(reconstructed_view)));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayViewSetArrayShouldSetDirty) {
+ // Create an array with some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ std::string_view reconstructed_view(
+ reinterpret_cast<const char*>(vector->array()),
+ vector->num_elements() * sizeof(int));
+
+ constexpr int kArrayViewOffset = 3;
+ constexpr int kArrayViewLen = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, kArrayViewLen));
+
+ std::vector<int> change{2, 3, 4};
+ mutable_arr.SetArray(/*idx=*/0, change.data(), change.size());
+ ASSERT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen),
+ ElementsAre(2, 3, 4, 1, 1));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc, vector->ComputeChecksum());
+ EXPECT_THAT(crc, Eq(Crc32(reconstructed_view)));
+}
+
+TEST_F(FileBackedVectorTest, Append) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ ICING_EXPECT_OK(vector->Append('a'));
+ EXPECT_THAT(vector->num_elements(), Eq(1));
+ EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(Eq('a'))));
+
+ ICING_EXPECT_OK(vector->Append('b'));
+ EXPECT_THAT(vector->num_elements(), Eq(2));
+ EXPECT_THAT(vector->Get(1), IsOkAndHolds(Pointee(Eq('b'))));
+}
+
+TEST_F(FileBackedVectorTest, AppendAfterSet) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ ICING_ASSERT_OK(vector->Set(9, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(10));
+ ICING_EXPECT_OK(vector->Append('a'));
+ EXPECT_THAT(vector->num_elements(), Eq(11));
+ EXPECT_THAT(vector->Get(10), IsOkAndHolds(Pointee(Eq('a'))));
+}
+
+TEST_F(FileBackedVectorTest, AppendAfterTruncate) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::string(1000, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(1000));
+
+ ICING_ASSERT_OK(vector->TruncateTo(5));
+ ICING_EXPECT_OK(vector->Append('a'));
+ EXPECT_THAT(vector->num_elements(), Eq(6));
+ EXPECT_THAT(vector->Get(5), IsOkAndHolds(Pointee(Eq('a'))));
+}
+
+TEST_F(FileBackedVectorTest, AppendShouldFailIfExceedingMaxFileSize) {
+ int32_t max_file_size = (1 << 10) - 1;
+ int32_t max_num_elements =
+ (max_file_size - FileBackedVector<char>::Header::kHeaderSize) /
+ sizeof(char);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size));
+ ICING_ASSERT_OK(vector->Set(max_num_elements - 1, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(max_num_elements));
+
+ EXPECT_THAT(vector->Append('a'),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_F(FileBackedVectorTest, Allocate) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ typename FileBackedVector<char>::MutableArrayView mutable_arr,
+ vector->Allocate(3));
+ EXPECT_THAT(vector->num_elements(), Eq(3));
+ EXPECT_THAT(mutable_arr, SizeIs(3));
+ std::string change = "abc";
+ mutable_arr.SetArray(/*idx=*/0, /*arr=*/change.data(), /*arr_len=*/3);
+ EXPECT_THAT(Get(vector.get(), /*idx=*/0, /*expected_len=*/3), Eq(change));
+}
+
+TEST_F(FileBackedVectorTest, AllocateAfterSet) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ ICING_ASSERT_OK(vector->Set(9, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(10));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ typename FileBackedVector<char>::MutableArrayView mutable_arr,
+ vector->Allocate(3));
+ EXPECT_THAT(vector->num_elements(), Eq(13));
+ EXPECT_THAT(mutable_arr, SizeIs(3));
+ std::string change = "abc";
+ mutable_arr.SetArray(/*idx=*/0, /*arr=*/change.data(), /*arr_len=*/3);
+ EXPECT_THAT(Get(vector.get(), /*idx=*/10, /*expected_len=*/3), Eq(change));
+}
+
+TEST_F(FileBackedVectorTest, AllocateAfterTruncate) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::string(1000, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(1000));
+
+ ICING_ASSERT_OK(vector->TruncateTo(5));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ typename FileBackedVector<char>::MutableArrayView mutable_arr,
+ vector->Allocate(3));
+ EXPECT_THAT(vector->num_elements(), Eq(8));
+ std::string change = "abc";
+ mutable_arr.SetArray(/*idx=*/0, /*arr=*/change.data(), /*arr_len=*/3);
+ EXPECT_THAT(Get(vector.get(), /*idx=*/5, /*expected_len=*/3), Eq(change));
+}
+
+TEST_F(FileBackedVectorTest, AllocateInvalidLengthShouldFail) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ EXPECT_THAT(vector->Allocate(-1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->num_elements(), Eq(0));
+
+ EXPECT_THAT(vector->Allocate(0),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->num_elements(), Eq(0));
+}
+
+TEST_F(FileBackedVectorTest, AllocateShouldFailIfExceedingMaxFileSize) {
+ int32_t max_file_size = (1 << 10) - 1;
+ int32_t max_num_elements =
+ (max_file_size - FileBackedVector<char>::Header::kHeaderSize) /
+ sizeof(char);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size));
+ ICING_ASSERT_OK(vector->Set(max_num_elements - 3, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(max_num_elements - 2));
+
+ EXPECT_THAT(vector->Allocate(3),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->Allocate(2), IsOk());
+}
+
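For intuition, the capacity math these ...ShouldFailIfExceedingMaxFileSize tests rely on is that the file holds a fixed-size header followed by the raw elements. A minimal sketch, assuming a hypothetical 12-byte header (the real constant is FileBackedVector<char>::Header::kHeaderSize):

    // Sketch: how many 1-byte elements fit in a file capped at 1023 bytes.
    int32_t max_file_size = (1 << 10) - 1;  // 1023
    int32_t header_size = 12;               // hypothetical stand-in value
    int32_t max_num_elements =
        (max_file_size - header_size) / static_cast<int32_t>(sizeof(char));
    // Any Append() or Allocate() that would push num_elements() past
    // max_num_elements must fail with OUT_OF_RANGE.
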
TEST_F(FileBackedVectorTest, IncrementalCrc_NonOverlappingChanges) {
int num_elements = 1000;
int incremental_size = 3;
@@ -272,29 +728,58 @@ TEST_F(FileBackedVectorTest, IncrementalCrc_OverlappingChanges) {
}
}
+TEST_F(FileBackedVectorTest, SetIntMaxShouldReturnOutOfRangeError) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int32_t>> vector,
+ FileBackedVector<int32_t>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
+
+  // This is an edge case: Set() calls GrowIfNecessary(idx + 1), so when idx
+  // is INT32_MAX the addition would overflow. Make sure Set() rejects it with
+  // OUT_OF_RANGE instead.
+ EXPECT_THAT(vector->Set(std::numeric_limits<int32_t>::max(), 1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
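The hazard this test guards against is signed overflow: naively computing idx + 1 when idx == INT32_MAX is undefined behavior for int32_t. A minimal sketch of the kind of bound check GrowIfNecessary() needs (names here are illustrative, not the actual implementation):

    libtextclassifier3::Status CheckIndexSketch(int32_t idx,
                                                int32_t max_num_elements) {
      // Compare against max_num_elements - 1 instead of computing idx + 1,
      // which would overflow when idx == INT32_MAX.
      if (idx < 0 || idx > max_num_elements - 1) {
        return absl_ports::OutOfRangeError("Index out of range");
      }
      return libtextclassifier3::Status::OK;
    }
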
+
TEST_F(FileBackedVectorTest, Grow) {
- // This is the same value as FileBackedVector::kMaxNumElts
- constexpr int32_t kMaxNumElts = 1U << 20;
+ int32_t max_file_size = (1 << 20) - 1;
+ int32_t header_size = FileBackedVector<int32_t>::Header::kHeaderSize;
+ int32_t element_type_size = static_cast<int32_t>(sizeof(int32_t));
+
+ // Max file size includes size of the header and elements, so max # of
+ // elements will be (max_file_size - header_size) / element_type_size.
+ //
+  // Also ensure that (max_file_size - header_size) is not a multiple of
+  // element_type_size, in order to verify that the max # of elements is
+  // computed with floor division rather than ceiling.
+ ASSERT_THAT((max_file_size - header_size) % element_type_size, Not(Eq(0)));
+ int32_t max_num_elements = (max_file_size - header_size) / element_type_size;
ASSERT_TRUE(filesystem_.Truncate(fd_, 0));
- // Create an array and add some data.
+ // Create a vector and add some data.
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<FileBackedVector<char>> vector,
- FileBackedVector<char>::Create(
+ std::unique_ptr<FileBackedVector<int32_t>> vector,
+ FileBackedVector<int32_t>::Create(
filesystem_, file_path_,
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size));
EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
- EXPECT_THAT(vector->Set(kMaxNumElts + 11, 'a'),
+  // max_num_elements is the allowed max # of elements, so valid indices are
+  // 0 to max_num_elements - 1.
+ EXPECT_THAT(vector->Set(max_num_elements, 1),
StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
- EXPECT_THAT(vector->Set(-1, 'a'),
+ EXPECT_THAT(vector->Set(-1, 1),
StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->Set(max_num_elements - 1, 1), IsOk());
- uint32_t start = kMaxNumElts - 13;
- Insert(vector.get(), start, "abcde");
+ int32_t start = max_num_elements - 5;
+ std::vector<int32_t> data{1, 2, 3, 4, 5};
+ Insert(vector.get(), start, data);
// Crc works?
- const Crc32 good_crc(1134899064U);
+ const Crc32 good_crc(650981917U);
EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(good_crc));
// PersistToDisk does nothing bad, and ensures the content is still there
@@ -306,12 +791,12 @@ TEST_F(FileBackedVectorTest, Grow) {
vector.reset();
ICING_ASSERT_OK_AND_ASSIGN(
- vector, FileBackedVector<char>::Create(
- filesystem_, file_path_,
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ vector,
+ FileBackedVector<int32_t>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size));
- std::string expected = "abcde";
- EXPECT_EQ(expected, Get(vector.get(), start, expected.length()));
+ EXPECT_THAT(Get(vector.get(), start, data.size()), Eq(data));
}
TEST_F(FileBackedVectorTest, GrowsInChunks) {
@@ -334,20 +819,20 @@ TEST_F(FileBackedVectorTest, GrowsInChunks) {
// Once we add something though, we'll grow to be kGrowElements big. From this
// point on, file size and disk usage should be the same because Growing will
 // explicitly allocate the number of blocks needed to accommodate the file.
- Insert(vector.get(), 0, "a");
- int file_size = kGrowElements * sizeof(int);
+ Insert(vector.get(), 0, {1});
+ int file_size = 1 * kGrowElements * sizeof(int);
EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size));
EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size));
// Should still be the same size, don't need to grow underlying file
- Insert(vector.get(), 1, "b");
+ Insert(vector.get(), 1, {2});
EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size));
EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size));
// Now we grow by a kGrowElements chunk, so the underlying file is 2
// kGrowElements big
- file_size *= 2;
- Insert(vector.get(), 2, std::string(kGrowElements, 'c'));
+ file_size = 2 * kGrowElements * sizeof(int);
+ Insert(vector.get(), 2, std::vector<int>(kGrowElements, 3));
EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size));
EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size));
@@ -476,6 +961,48 @@ TEST_F(FileBackedVectorTest, TruncateAndReReadFile) {
}
}
+TEST_F(FileBackedVectorTest, SetDirty) {
+ // 1. Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), 0, "abcd");
+
+ std::string_view reconstructed_view =
+ std::string_view(vector->array(), vector->num_elements());
+
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc1, vector->ComputeChecksum());
+ Crc32 full_crc_before_overwrite;
+ full_crc_before_overwrite.Append(reconstructed_view);
+ EXPECT_THAT(crc1, Eq(full_crc_before_overwrite));
+
+ // 2. Manually overwrite the values of the first two elements.
+ std::string corrupted_content = "ef";
+ ASSERT_THAT(
+ filesystem_.PWrite(fd_, /*offset=*/sizeof(FileBackedVector<char>::Header),
+ corrupted_content.c_str(), corrupted_content.length()),
+ IsTrue());
+ ASSERT_THAT(Get(vector.get(), 0, 4), Eq("efcd"));
+ Crc32 full_crc_after_overwrite;
+ full_crc_after_overwrite.Append(reconstructed_view);
+ ASSERT_THAT(full_crc_before_overwrite, Not(Eq(full_crc_after_overwrite)));
+
+  // 3. Without calling SetDirty(), ComputeChecksum() will not see the
+  //    out-of-band change and will return an incorrect checksum.
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc2, vector->ComputeChecksum());
+ EXPECT_THAT(crc2, Not(Eq(full_crc_after_overwrite)));
+
+ // 4. Call SetDirty()
+ vector->SetDirty(0);
+ vector->SetDirty(1);
+
+  // 5. After calling SetDirty() with the correct indices, the checksum should
+  //    be computed correctly.
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc3, vector->ComputeChecksum());
+ EXPECT_THAT(crc3, Eq(full_crc_after_overwrite));
+}
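The takeaway from this test: the vector's checksum is computed incrementally, re-reading only the ranges it knows have changed, so any write that bypasses the vector's own mutators (like the raw PWrite above) must be flagged explicitly. A minimal usage sketch:

    // After any out-of-band modification of the mapped region, flag every
    // touched element index so the next ComputeChecksum() re-reads it.
    vector->SetDirty(0);
    vector->SetDirty(1);
    ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc, vector->ComputeChecksum());
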
+
TEST_F(FileBackedVectorTest, InitFileTooSmallForHeaderFails) {
{
// 1. Create a vector with a few elements.
diff --git a/icing/file/filesystem.cc b/icing/file/filesystem.cc
index 82b8d98..10b77db 100644
--- a/icing/file/filesystem.cc
+++ b/icing/file/filesystem.cc
@@ -63,18 +63,16 @@ void LogOpenFileDescriptors() {
constexpr int kMaxFileDescriptorsToStat = 4096;
struct rlimit rlim = {0, 0};
if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "getrlimit() failed (errno=%d)", errno);
+ ICING_LOG(ERROR) << "getrlimit() failed (errno=" << errno << ")";
return;
}
int fd_lim = rlim.rlim_cur;
if (fd_lim > kMaxFileDescriptorsToStat) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Maximum number of file descriptors (%d) too large.", fd_lim);
+ ICING_LOG(ERROR) << "Maximum number of file descriptors (" << fd_lim
+ << ") too large.";
fd_lim = kMaxFileDescriptorsToStat;
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Listing up to %d file descriptors.", fd_lim);
+ ICING_LOG(ERROR) << "Listing up to " << fd_lim << " file descriptors.";
// Verify that /proc/self/fd is a directory. If not, procfs is not mounted or
// inaccessible for some other reason. In that case, there's no point trying
@@ -96,15 +94,12 @@ void LogOpenFileDescriptors() {
if (len >= 0) {
// Zero-terminate the buffer, because readlink() won't.
target[len < target_size ? len : target_size - 1] = '\0';
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> \"%s\"", fd,
- target);
+ ICING_LOG(ERROR) << "fd " << fd << " -> \"" << target << "\"";
} else if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> ? (errno=%d)",
- fd, errno);
+ ICING_LOG(ERROR) << "fd " << fd << " -> ? (errno=" << errno << ")";
}
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "File descriptor list complete.");
+ ICING_LOG(ERROR) << "File descriptor list complete.";
}
// Logs an error formatted as: desc1 + file_name + desc2 + strerror(errnum).
@@ -113,8 +108,7 @@ void LogOpenFileDescriptors() {
// file descriptors (see LogOpenFileDescriptors() above).
void LogOpenError(const char* desc1, const char* file_name, const char* desc2,
int errnum) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "%s%s%s%s", desc1, file_name, desc2, strerror(errnum));
+ ICING_LOG(ERROR) << desc1 << file_name << desc2 << strerror(errnum);
if (errnum == EMFILE) {
LogOpenFileDescriptors();
}
@@ -155,8 +149,7 @@ bool ListDirectoryInternal(const char* dir_name,
}
}
if (closedir(dir) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Error closing %s: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Error closing " << dir_name << " " << strerror(errno);
}
return true;
}
@@ -179,11 +172,10 @@ void ScopedFd::reset(int fd) {
const int64_t Filesystem::kBadFileSize;
bool Filesystem::DeleteFile(const char* file_name) const {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf("Deleting file %s", file_name);
+ ICING_VLOG(1) << "Deleting file " << file_name;
int ret = unlink(file_name);
if (ret != 0 && errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Deleting file %s failed: %s", file_name, strerror(errno));
+ ICING_LOG(ERROR) << "Deleting file " << file_name << " failed: " << strerror(errno);
return false;
}
return true;
@@ -192,8 +184,7 @@ bool Filesystem::DeleteFile(const char* file_name) const {
bool Filesystem::DeleteDirectory(const char* dir_name) const {
int ret = rmdir(dir_name);
if (ret != 0 && errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Deleting directory %s failed: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Deleting directory " << dir_name << " failed: " << strerror(errno);
return false;
}
return true;
@@ -206,8 +197,7 @@ bool Filesystem::DeleteDirectoryRecursively(const char* dir_name) const {
if (errno == ENOENT) {
return true; // If directory didn't exist, this was successful.
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Stat %s failed: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Stat " << dir_name << " failed: " << strerror(errno);
return false;
}
vector<std::string> entries;
@@ -220,8 +210,7 @@ bool Filesystem::DeleteDirectoryRecursively(const char* dir_name) const {
++i) {
std::string filename = std::string(dir_name) + '/' + *i;
if (stat(filename.c_str(), &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Stat %s failed: %s", filename.c_str(), strerror(errno));
+ ICING_LOG(ERROR) << "Stat " << filename << " failed: " << strerror(errno);
success = false;
} else if (S_ISDIR(st.st_mode)) {
success = DeleteDirectoryRecursively(filename.c_str()) && success;
@@ -244,8 +233,7 @@ bool Filesystem::FileExists(const char* file_name) const {
exists = S_ISREG(st.st_mode) != 0;
} else {
if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to stat file %s: %s", file_name, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file " << file_name << ": " << strerror(errno);
}
exists = false;
}
@@ -259,8 +247,7 @@ bool Filesystem::DirectoryExists(const char* dir_name) const {
exists = S_ISDIR(st.st_mode) != 0;
} else {
if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to stat directory %s: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat directory " << dir_name << ": " << strerror(errno);
}
exists = false;
}
@@ -316,8 +303,7 @@ bool Filesystem::GetMatchingFiles(const char* glob,
int basename_idx = GetBasenameIndex(glob);
if (basename_idx == 0) {
// We need a directory.
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Expected directory, no matching files for: %s", glob);
+ ICING_VLOG(1) << "Expected directory, no matching files for: " << glob;
return true;
}
const char* basename_glob = glob + basename_idx;
@@ -372,8 +358,7 @@ int Filesystem::OpenForRead(const char* file_name) const {
int64_t Filesystem::GetFileSize(int fd) const {
struct stat st;
if (fstat(fd, &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file: " << strerror(errno);
return kBadFileSize;
}
return st.st_size;
@@ -383,11 +368,9 @@ int64_t Filesystem::GetFileSize(const char* filename) const {
struct stat st;
if (stat(filename, &st) < 0) {
if (errno == ENOENT) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Unable to stat file %s: %s", filename, strerror(errno));
+ ICING_VLOG(1) << "Unable to stat file " << filename << ": " << strerror(errno);
} else {
- ICING_LOG(WARNING) << IcingStringUtil::StringPrintf(
- "Unable to stat file %s: %s", filename, strerror(errno));
+ ICING_LOG(WARNING) << "Unable to stat file " << filename << ": " << strerror(errno);
}
return kBadFileSize;
}
@@ -396,8 +379,7 @@ int64_t Filesystem::GetFileSize(const char* filename) const {
bool Filesystem::Truncate(int fd, int64_t new_size) const {
if (ftruncate(fd, new_size) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to truncate file: %s", strerror(errno));
+ ICING_LOG(ERROR) << "Unable to truncate file: " << strerror(errno);
return false;
}
lseek(fd, new_size, SEEK_SET);
@@ -416,8 +398,7 @@ bool Filesystem::Truncate(const char* filename, int64_t new_size) const {
bool Filesystem::Grow(int fd, int64_t new_size) const {
if (ftruncate(fd, new_size) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to grow file: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to grow file: " << strerror(errno);
return false;
}
@@ -442,8 +423,7 @@ bool Filesystem::Write(int fd, const void* data, size_t data_size) const {
size_t chunk_size = std::min<size_t>(write_len, 64u * 1024);
ssize_t wrote = write(fd, data, chunk_size);
if (wrote < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad write: " << strerror(errno);
return false;
}
data = static_cast<const uint8_t*>(data) + wrote;
@@ -521,8 +501,7 @@ bool Filesystem::CopyDirectory(const char* src_dir, const char* dst_dir,
}
}
if (closedir(dir) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Error closing %s: %s",
- src_dir, strerror(errno));
+ ICING_LOG(ERROR) << "Error closing " << src_dir << ": " << strerror(errno);
}
return true;
}
@@ -535,8 +514,7 @@ bool Filesystem::PWrite(int fd, off_t offset, const void* data,
size_t chunk_size = std::min<size_t>(write_len, 64u * 1024);
ssize_t wrote = pwrite(fd, data, chunk_size, offset);
if (wrote < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad write: " << strerror(errno);
return false;
}
data = static_cast<const uint8_t*>(data) + wrote;
@@ -561,8 +539,7 @@ bool Filesystem::PWrite(const char* filename, off_t offset, const void* data,
bool Filesystem::Read(int fd, void* buf, size_t buf_size) const {
ssize_t read_status = read(fd, buf, buf_size);
if (read_status < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad read: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad read: " << strerror(errno);
return false;
}
return true;
@@ -582,8 +559,7 @@ bool Filesystem::Read(const char* filename, void* buf, size_t buf_size) const {
bool Filesystem::PRead(int fd, void* buf, size_t buf_size, off_t offset) const {
ssize_t read_status = pread(fd, buf, buf_size, offset);
if (read_status < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad read: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad read: " << strerror(errno);
return false;
}
return true;
@@ -609,8 +585,7 @@ bool Filesystem::DataSync(int fd) const {
#endif
if (result < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to sync data: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to sync data: " << strerror(errno);
return false;
}
return true;
@@ -618,9 +593,7 @@ bool Filesystem::DataSync(int fd) const {
bool Filesystem::RenameFile(const char* old_name, const char* new_name) const {
if (rename(old_name, new_name) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to rename file %s to %s: %s", old_name, new_name,
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to rename file " << old_name << " to " << new_name << ": " << strerror(errno);
return false;
}
return true;
@@ -658,8 +631,7 @@ bool Filesystem::CreateDirectory(const char* dir_name) const {
if (mkdir(dir_name, S_IRUSR | S_IWUSR | S_IXUSR) == 0) {
success = true;
} else {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Creating directory %s failed: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Creating directory " << dir_name << " failed: " << strerror(errno);
}
}
return success;
@@ -679,8 +651,7 @@ bool Filesystem::CreateDirectoryRecursively(const char* dir_name) const {
int64_t Filesystem::GetDiskUsage(int fd) const {
struct stat st;
if (fstat(fd, &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file: " << strerror(errno);
return kBadFileSize;
}
return st.st_blocks * kStatBlockSize;
@@ -689,8 +660,7 @@ int64_t Filesystem::GetDiskUsage(int fd) const {
int64_t Filesystem::GetFileDiskUsage(const char* path) const {
struct stat st;
if (stat(path, &st) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s",
- path, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat " << path << ": " << strerror(errno);
return kBadFileSize;
}
return st.st_blocks * kStatBlockSize;
@@ -699,8 +669,7 @@ int64_t Filesystem::GetFileDiskUsage(const char* path) const {
int64_t Filesystem::GetDiskUsage(const char* path) const {
struct stat st;
if (stat(path, &st) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s",
- path, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat " << path << ": " << strerror(errno);
return kBadFileSize;
}
int64_t result = st.st_blocks * kStatBlockSize;
diff --git a/icing/file/persistent-hash-map.cc b/icing/file/persistent-hash-map.cc
new file mode 100644
index 0000000..d20285a
--- /dev/null
+++ b/icing/file/persistent-hash-map.cc
@@ -0,0 +1,534 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/persistent-hash-map.h"
+
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/util/crc32.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Helper function to check that the key does not contain the termination
+// character '\0'.
+libtextclassifier3::Status ValidateKey(std::string_view key) {
+ if (key.find('\0') != std::string_view::npos) { // NOLINT
+ return absl_ports::InvalidArgumentError(
+ "Key cannot contain termination character '\\0'");
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+// Helper function to convert the key to bucket index by hash.
+//
+// Returns:
+// int32_t: A valid bucket index with range [0, num_buckets - 1].
+// INTERNAL_ERROR if num_buckets == 0
+libtextclassifier3::StatusOr<int32_t> HashKeyToBucketIndex(
+ std::string_view key, int32_t num_buckets) {
+ if (num_buckets == 0) {
+ return absl_ports::InternalError("Should not have empty bucket");
+ }
+ return static_cast<int32_t>(std::hash<std::string_view>()(key) % num_buckets);
+}
+
+// Helper function to PWrite crcs and info into the file at
+// metadata_file_path, which is either the normal metadata file path or a
+// temporary one (used for branching when rehashing).
+libtextclassifier3::Status WriteMetadata(const Filesystem& filesystem,
+ const char* metadata_file_path,
+ const PersistentHashMap::Crcs* crcs,
+ const PersistentHashMap::Info* info) {
+ ScopedFd sfd(filesystem.OpenForWrite(metadata_file_path));
+ if (!sfd.is_valid()) {
+ return absl_ports::InternalError("Failed to create metadata file");
+ }
+
+ // Write crcs and info. File layout: <Crcs><Info>
+ if (!filesystem.PWrite(sfd.get(), PersistentHashMap::Crcs::kFileOffset, crcs,
+ sizeof(PersistentHashMap::Crcs))) {
+ return absl_ports::InternalError("Failed to write crcs into metadata file");
+ }
+ // Note that PWrite won't change the file offset, so we need to specify
+ // the correct offset when writing Info.
+ if (!filesystem.PWrite(sfd.get(), PersistentHashMap::Info::kFileOffset, info,
+ sizeof(PersistentHashMap::Info))) {
+ return absl_ports::InternalError("Failed to write info into metadata file");
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+// Helper function to update a Crcs instance with checksums computed from info
+// and the storages. The storages are either the normal instances used by
+// PersistentHashMap or the temporary instances (used for branching when
+// rehashing).
+libtextclassifier3::Status UpdateChecksums(
+ PersistentHashMap::Crcs* crcs, PersistentHashMap::Info* info,
+ FileBackedVector<PersistentHashMap::Bucket>* bucket_storage,
+ FileBackedVector<PersistentHashMap::Entry>* entry_storage,
+ FileBackedVector<char>* kv_storage) {
+ // Compute crcs
+ ICING_ASSIGN_OR_RETURN(Crc32 bucket_storage_crc,
+ bucket_storage->ComputeChecksum());
+ ICING_ASSIGN_OR_RETURN(Crc32 entry_storage_crc,
+ entry_storage->ComputeChecksum());
+ ICING_ASSIGN_OR_RETURN(Crc32 kv_storage_crc, kv_storage->ComputeChecksum());
+
+ crcs->component_crcs.info_crc = info->ComputeChecksum().Get();
+ crcs->component_crcs.bucket_storage_crc = bucket_storage_crc.Get();
+ crcs->component_crcs.entry_storage_crc = entry_storage_crc.Get();
+ crcs->component_crcs.kv_storage_crc = kv_storage_crc.Get();
+ crcs->all_crc = crcs->component_crcs.ComputeChecksum().Get();
+
+ return libtextclassifier3::Status::OK;
+}
+
+// Helper function to validate checksums.
+libtextclassifier3::Status ValidateChecksums(
+ const PersistentHashMap::Crcs* crcs, const PersistentHashMap::Info* info,
+ FileBackedVector<PersistentHashMap::Bucket>* bucket_storage,
+ FileBackedVector<PersistentHashMap::Entry>* entry_storage,
+ FileBackedVector<char>* kv_storage) {
+ if (crcs->all_crc != crcs->component_crcs.ComputeChecksum().Get()) {
+ return absl_ports::FailedPreconditionError(
+ "Invalid all crc for PersistentHashMap");
+ }
+
+ if (crcs->component_crcs.info_crc != info->ComputeChecksum().Get()) {
+ return absl_ports::FailedPreconditionError(
+ "Invalid info crc for PersistentHashMap");
+ }
+
+ ICING_ASSIGN_OR_RETURN(Crc32 bucket_storage_crc,
+ bucket_storage->ComputeChecksum());
+ if (crcs->component_crcs.bucket_storage_crc != bucket_storage_crc.Get()) {
+ return absl_ports::FailedPreconditionError(
+ "Mismatch crc with PersistentHashMap bucket storage");
+ }
+
+ ICING_ASSIGN_OR_RETURN(Crc32 entry_storage_crc,
+ entry_storage->ComputeChecksum());
+ if (crcs->component_crcs.entry_storage_crc != entry_storage_crc.Get()) {
+ return absl_ports::FailedPreconditionError(
+ "Mismatch crc with PersistentHashMap entry storage");
+ }
+
+ ICING_ASSIGN_OR_RETURN(Crc32 kv_storage_crc, kv_storage->ComputeChecksum());
+ if (crcs->component_crcs.kv_storage_crc != kv_storage_crc.Get()) {
+ return absl_ports::FailedPreconditionError(
+ "Mismatch crc with PersistentHashMap key value storage");
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+// Since the metadata/bucket/entry storages are branched when rehashing, they
+// have to be stored together under the same sub-directory
+// ("<base_dir>/<sub_dir>"). The key-value storage, on the other hand, is
+// never branched and is stored directly under <base_dir>.
+//
+// The following 4 functions are helpers to get the correct paths of the
+// metadata/bucket/entry/key-value storages, according to the given base
+// directory and sub-directory. See the layout sketch after these helpers.
+std::string GetMetadataFilePath(std::string_view base_dir,
+ std::string_view sub_dir) {
+ return absl_ports::StrCat(base_dir, "/", sub_dir, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+}
+
+std::string GetBucketStorageFilePath(std::string_view base_dir,
+ std::string_view sub_dir) {
+ return absl_ports::StrCat(base_dir, "/", sub_dir, "/",
+ PersistentHashMap::kFilePrefix, ".b");
+}
+
+std::string GetEntryStorageFilePath(std::string_view base_dir,
+ std::string_view sub_dir) {
+ return absl_ports::StrCat(base_dir, "/", sub_dir, "/",
+ PersistentHashMap::kFilePrefix, ".e");
+}
+
+std::string GetKeyValueStorageFilePath(std::string_view base_dir) {
+ return absl_ports::StrCat(base_dir, "/", PersistentHashMap::kFilePrefix,
+ ".k");
+}
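Putting the four helpers together, the resulting on-disk layout looks like this (kFilePrefix is "persistent_hash_map" and kSubDirectory is "dynamic", per the constants in the header):

    <base_dir>/persistent_hash_map.k          // key-value storage (never branched)
    <base_dir>/dynamic/persistent_hash_map.m  // metadata (crcs + info)
    <base_dir>/dynamic/persistent_hash_map.b  // bucket storage
    <base_dir>/dynamic/persistent_hash_map.e  // entry storage
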
+
+} // namespace
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+PersistentHashMap::Create(const Filesystem& filesystem,
+ std::string_view base_dir, int32_t value_type_size,
+ int32_t max_load_factor_percent) {
+ if (!filesystem.FileExists(
+ GetMetadataFilePath(base_dir, kSubDirectory).c_str()) ||
+ !filesystem.FileExists(
+ GetBucketStorageFilePath(base_dir, kSubDirectory).c_str()) ||
+ !filesystem.FileExists(
+ GetEntryStorageFilePath(base_dir, kSubDirectory).c_str()) ||
+ !filesystem.FileExists(GetKeyValueStorageFilePath(base_dir).c_str())) {
+ // TODO: erase all files if missing any.
+ return InitializeNewFiles(filesystem, base_dir, value_type_size,
+ max_load_factor_percent);
+ }
+ return InitializeExistingFiles(filesystem, base_dir, value_type_size,
+ max_load_factor_percent);
+}
+
+PersistentHashMap::~PersistentHashMap() {
+ if (!PersistToDisk().ok()) {
+ ICING_LOG(WARNING)
+ << "Failed to persist hash map to disk while destructing " << base_dir_;
+ }
+}
+
+libtextclassifier3::Status PersistentHashMap::Put(std::string_view key,
+ const void* value) {
+ ICING_RETURN_IF_ERROR(ValidateKey(key));
+ ICING_ASSIGN_OR_RETURN(
+ int32_t bucket_idx,
+ HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
+
+ ICING_ASSIGN_OR_RETURN(int32_t target_entry_idx,
+ FindEntryIndexByKey(bucket_idx, key));
+ if (target_entry_idx == Entry::kInvalidIndex) {
+ // If not found, then insert new key value pair.
+ return Insert(bucket_idx, key, value);
+ }
+
+ // Otherwise, overwrite the value.
+ ICING_ASSIGN_OR_RETURN(const Entry* entry,
+ entry_storage_->Get(target_entry_idx));
+
+ int32_t kv_len = key.length() + 1 + info()->value_type_size;
+ int32_t value_offset = key.length() + 1;
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<char>::MutableArrayView mutable_kv_arr,
+ kv_storage_->GetMutable(entry->key_value_index(), kv_len));
+  // The key is the same and value_type_size is fixed, so we can directly
+  // overwrite the serialized value.
+ mutable_kv_arr.SetArray(value_offset, reinterpret_cast<const char*>(value),
+ info()->value_type_size);
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status PersistentHashMap::GetOrPut(std::string_view key,
+ void* next_value) {
+ ICING_RETURN_IF_ERROR(ValidateKey(key));
+ ICING_ASSIGN_OR_RETURN(
+ int32_t bucket_idx,
+ HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
+
+ ICING_ASSIGN_OR_RETURN(int32_t target_entry_idx,
+ FindEntryIndexByKey(bucket_idx, key));
+ if (target_entry_idx == Entry::kInvalidIndex) {
+ // If not found, then insert new key value pair.
+ return Insert(bucket_idx, key, next_value);
+ }
+
+ // Otherwise, copy the hash map value into next_value.
+ return CopyEntryValue(target_entry_idx, next_value);
+}
+
+libtextclassifier3::Status PersistentHashMap::Get(std::string_view key,
+ void* value) const {
+ ICING_RETURN_IF_ERROR(ValidateKey(key));
+ ICING_ASSIGN_OR_RETURN(
+ int32_t bucket_idx,
+ HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
+
+ ICING_ASSIGN_OR_RETURN(int32_t target_entry_idx,
+ FindEntryIndexByKey(bucket_idx, key));
+ if (target_entry_idx == Entry::kInvalidIndex) {
+ return absl_ports::NotFoundError(
+ absl_ports::StrCat("Key not found in PersistentHashMap ", base_dir_));
+ }
+
+ return CopyEntryValue(target_entry_idx, value);
+}
+
+libtextclassifier3::Status PersistentHashMap::PersistToDisk() {
+ ICING_RETURN_IF_ERROR(bucket_storage_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(entry_storage_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(kv_storage_->PersistToDisk());
+
+ ICING_RETURN_IF_ERROR(UpdateChecksums(crcs(), info(), bucket_storage_.get(),
+ entry_storage_.get(),
+ kv_storage_.get()));
+ // Changes should have been applied to the underlying file when using
+ // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an
+ // extra safety step to ensure they are written out.
+ ICING_RETURN_IF_ERROR(metadata_mmapped_file_->PersistToDisk());
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetDiskUsage() const {
+ ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_disk_usage,
+ bucket_storage_->GetDiskUsage());
+ ICING_ASSIGN_OR_RETURN(int64_t entry_storage_disk_usage,
+ entry_storage_->GetDiskUsage());
+ ICING_ASSIGN_OR_RETURN(int64_t kv_storage_disk_usage,
+ kv_storage_->GetDiskUsage());
+
+ int64_t total = bucket_storage_disk_usage + entry_storage_disk_usage +
+ kv_storage_disk_usage;
+ Filesystem::IncrementByOrSetInvalid(
+ filesystem_->GetDiskUsage(
+ GetMetadataFilePath(base_dir_, kSubDirectory).c_str()),
+ &total);
+
+ if (total < 0 || total == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ "Failed to get disk usage of PersistentHashMap");
+ }
+ return total;
+}
+
+libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetElementsSize()
+ const {
+ ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_elements_size,
+ bucket_storage_->GetElementsFileSize());
+ ICING_ASSIGN_OR_RETURN(int64_t entry_storage_elements_size,
+ entry_storage_->GetElementsFileSize());
+ ICING_ASSIGN_OR_RETURN(int64_t kv_storage_elements_size,
+ kv_storage_->GetElementsFileSize());
+ return bucket_storage_elements_size + entry_storage_elements_size +
+ kv_storage_elements_size;
+}
+
+libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeChecksum() {
+ Crcs* crcs_ptr = crcs();
+ ICING_RETURN_IF_ERROR(UpdateChecksums(crcs_ptr, info(), bucket_storage_.get(),
+ entry_storage_.get(),
+ kv_storage_.get()));
+ return Crc32(crcs_ptr->all_crc);
+}
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+PersistentHashMap::InitializeNewFiles(const Filesystem& filesystem,
+ std::string_view base_dir,
+ int32_t value_type_size,
+ int32_t max_load_factor_percent) {
+ // Create directory.
+ const std::string dir_path = absl_ports::StrCat(base_dir, "/", kSubDirectory);
+ if (!filesystem.CreateDirectoryRecursively(dir_path.c_str())) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to create directory: ", dir_path));
+ }
+
+ // Initialize 3 storages
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
+ FileBackedVector<Bucket>::Create(
+ filesystem, GetBucketStorageFilePath(base_dir, kSubDirectory),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Entry>> entry_storage,
+ FileBackedVector<Entry>::Create(
+ filesystem, GetEntryStorageFilePath(base_dir, kSubDirectory),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<FileBackedVector<char>> kv_storage,
+ FileBackedVector<char>::Create(
+ filesystem, GetKeyValueStorageFilePath(base_dir),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ // Initialize one bucket.
+ ICING_RETURN_IF_ERROR(bucket_storage->Append(Bucket()));
+ ICING_RETURN_IF_ERROR(bucket_storage->PersistToDisk());
+
+ // Create and initialize new info
+ Info new_info;
+ new_info.version = kVersion;
+ new_info.value_type_size = value_type_size;
+ new_info.max_load_factor_percent = max_load_factor_percent;
+ new_info.num_deleted_entries = 0;
+ new_info.num_deleted_key_value_bytes = 0;
+
+ // Compute checksums
+ Crcs new_crcs;
+ ICING_RETURN_IF_ERROR(UpdateChecksums(&new_crcs, &new_info,
+ bucket_storage.get(),
+ entry_storage.get(), kv_storage.get()));
+
+ const std::string metadata_file_path =
+ GetMetadataFilePath(base_dir, kSubDirectory);
+ // Write new metadata file
+ ICING_RETURN_IF_ERROR(WriteMetadata(filesystem, metadata_file_path.c_str(),
+ &new_crcs, &new_info));
+
+ // Mmap the content of the crcs and info.
+ auto metadata_mmapped_file = std::make_unique<MemoryMappedFile>(
+ filesystem, metadata_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
+ ICING_RETURN_IF_ERROR(metadata_mmapped_file->Remap(
+ /*file_offset=*/0, /*mmap_size=*/sizeof(Crcs) + sizeof(Info)));
+
+ return std::unique_ptr<PersistentHashMap>(new PersistentHashMap(
+ filesystem, base_dir, std::move(metadata_mmapped_file),
+ std::move(bucket_storage), std::move(entry_storage),
+ std::move(kv_storage)));
+}
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+PersistentHashMap::InitializeExistingFiles(const Filesystem& filesystem,
+ std::string_view base_dir,
+ int32_t value_type_size,
+ int32_t max_load_factor_percent) {
+ // Mmap the content of the crcs and info.
+ auto metadata_mmapped_file = std::make_unique<MemoryMappedFile>(
+ filesystem, GetMetadataFilePath(base_dir, kSubDirectory),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
+ ICING_RETURN_IF_ERROR(metadata_mmapped_file->Remap(
+ /*file_offset=*/0, /*mmap_size=*/sizeof(Crcs) + sizeof(Info)));
+
+ // Initialize 3 storages
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
+ FileBackedVector<Bucket>::Create(
+ filesystem, GetBucketStorageFilePath(base_dir, kSubDirectory),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Entry>> entry_storage,
+ FileBackedVector<Entry>::Create(
+ filesystem, GetEntryStorageFilePath(base_dir, kSubDirectory),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<FileBackedVector<char>> kv_storage,
+ FileBackedVector<char>::Create(
+ filesystem, GetKeyValueStorageFilePath(base_dir),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ Crcs* crcs_ptr = reinterpret_cast<Crcs*>(
+ metadata_mmapped_file->mutable_region() + Crcs::kFileOffset);
+ Info* info_ptr = reinterpret_cast<Info*>(
+ metadata_mmapped_file->mutable_region() + Info::kFileOffset);
+
+ // Value type size should be consistent.
+ if (value_type_size != info_ptr->value_type_size) {
+ return absl_ports::FailedPreconditionError("Incorrect value type size");
+ }
+
+ // Validate checksums of info and 3 storages.
+ ICING_RETURN_IF_ERROR(
+ ValidateChecksums(crcs_ptr, info_ptr, bucket_storage.get(),
+ entry_storage.get(), kv_storage.get()));
+
+ // Allow max_load_factor_percent_ change.
+ if (max_load_factor_percent != info_ptr->max_load_factor_percent) {
+ ICING_VLOG(2) << "Changing max_load_factor_percent from " << info_ptr->max_load_factor_percent << " to " << max_load_factor_percent;
+
+ info_ptr->max_load_factor_percent = max_load_factor_percent;
+ crcs_ptr->component_crcs.info_crc = info_ptr->ComputeChecksum().Get();
+ crcs_ptr->all_crc = crcs_ptr->component_crcs.ComputeChecksum().Get();
+ ICING_RETURN_IF_ERROR(metadata_mmapped_file->PersistToDisk());
+ // TODO(b/193919210): rehash if needed
+ }
+
+ return std::unique_ptr<PersistentHashMap>(new PersistentHashMap(
+ filesystem, base_dir, std::move(metadata_mmapped_file),
+ std::move(bucket_storage), std::move(entry_storage),
+ std::move(kv_storage)));
+}
+
+libtextclassifier3::StatusOr<int32_t> PersistentHashMap::FindEntryIndexByKey(
+ int32_t bucket_idx, std::string_view key) const {
+  // Iterate over all entries in the bucket, compare each entry's key with
+  // key, and return the entry index if found.
+ ICING_ASSIGN_OR_RETURN(const Bucket* bucket,
+ bucket_storage_->Get(bucket_idx));
+ int32_t curr_entry_idx = bucket->head_entry_index();
+ while (curr_entry_idx != Entry::kInvalidIndex) {
+ ICING_ASSIGN_OR_RETURN(const Entry* entry,
+ entry_storage_->Get(curr_entry_idx));
+ if (entry->key_value_index() == kInvalidKVIndex) {
+ ICING_LOG(ERROR) << "Got an invalid key value index in the persistent "
+ "hash map bucket. This shouldn't happen";
+ return absl_ports::InternalError("Unexpected invalid key value index");
+ }
+ ICING_ASSIGN_OR_RETURN(const char* kv_arr,
+ kv_storage_->Get(entry->key_value_index()));
+ if (key.compare(kv_arr) == 0) {
+ return curr_entry_idx;
+ }
+
+ curr_entry_idx = entry->next_entry_index();
+ }
+
+ return curr_entry_idx;
+}
+
+libtextclassifier3::Status PersistentHashMap::CopyEntryValue(
+ int32_t entry_idx, void* value) const {
+ ICING_ASSIGN_OR_RETURN(const Entry* entry, entry_storage_->Get(entry_idx));
+
+ ICING_ASSIGN_OR_RETURN(const char* kv_arr,
+ kv_storage_->Get(entry->key_value_index()));
+ int32_t value_offset = strlen(kv_arr) + 1;
+ memcpy(value, kv_arr + value_offset, info()->value_type_size);
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status PersistentHashMap::Insert(int32_t bucket_idx,
+ std::string_view key,
+ const void* value) {
+ // If size() + 1 exceeds Entry::kMaxNumEntries, then return error.
+ if (size() > Entry::kMaxNumEntries - 1) {
+ return absl_ports::ResourceExhaustedError("Cannot insert new entry");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<Bucket>::MutableView mutable_bucket,
+ bucket_storage_->GetMutable(bucket_idx));
+
+ // Append new key value.
+ int32_t new_kv_idx = kv_storage_->num_elements();
+ int32_t kv_len = key.size() + 1 + info()->value_type_size;
+ int32_t value_offset = key.size() + 1;
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<char>::MutableArrayView mutable_new_kv_arr,
+ kv_storage_->Allocate(kv_len));
+ mutable_new_kv_arr.SetArray(/*idx=*/0, key.data(), key.size());
+ mutable_new_kv_arr.SetArray(/*idx=*/key.size(), "\0", 1);
+ mutable_new_kv_arr.SetArray(/*idx=*/value_offset,
+ reinterpret_cast<const char*>(value),
+ info()->value_type_size);
+
+ // Append new entry.
+ int32_t new_entry_idx = entry_storage_->num_elements();
+ ICING_RETURN_IF_ERROR(entry_storage_->Append(
+ Entry(new_kv_idx, mutable_bucket.Get().head_entry_index())));
+ mutable_bucket.Get().set_head_entry_index(new_entry_idx);
+
+ // TODO: rehash if needed
+
+ return libtextclassifier3::Status::OK;
+}
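For reference, the record that Insert() appends to the key-value storage is laid out as follows (derived from the kv_len and value_offset computations above; value_type_size is fixed per map):

    [ key bytes: key.size() ][ '\0': 1 byte ][ value bytes: value_type_size ]

So kv_len = key.size() + 1 + value_type_size, and the value starts at value_offset = key.size() + 1. This is also why ValidateKey() rejects keys containing '\0': the terminator doubles as the key/value boundary that FindEntryIndexByKey() relies on when comparing keys.
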
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/persistent-hash-map.h b/icing/file/persistent-hash-map.h
new file mode 100644
index 0000000..24a47ea
--- /dev/null
+++ b/icing/file/persistent-hash-map.h
@@ -0,0 +1,383 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_PERSISTENT_HASH_MAP_H_
+#define ICING_FILE_PERSISTENT_HASH_MAP_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+// Low level persistent hash map.
+// It supports variable-length serialized keys + fixed-length serialized
+// values. Key and value can be any type, but callers should serialize the
+// key/value by themselves and pass raw bytes into the hash map; the
+// serialized key must not contain the termination character '\0'. (See the
+// usage sketch following this header.)
+class PersistentHashMap {
+ public:
+ // Crcs and Info will be written into the metadata file.
+ // File layout: <Crcs><Info>
+ // Crcs
+ struct Crcs {
+ static constexpr int32_t kFileOffset = 0;
+
+ struct ComponentCrcs {
+ uint32_t info_crc;
+ uint32_t bucket_storage_crc;
+ uint32_t entry_storage_crc;
+ uint32_t kv_storage_crc;
+
+ bool operator==(const ComponentCrcs& other) const {
+ return info_crc == other.info_crc &&
+ bucket_storage_crc == other.bucket_storage_crc &&
+ entry_storage_crc == other.entry_storage_crc &&
+ kv_storage_crc == other.kv_storage_crc;
+ }
+
+ Crc32 ComputeChecksum() const {
+ return Crc32(std::string_view(reinterpret_cast<const char*>(this),
+ sizeof(ComponentCrcs)));
+ }
+ } __attribute__((packed));
+
+ bool operator==(const Crcs& other) const {
+ return all_crc == other.all_crc && component_crcs == other.component_crcs;
+ }
+
+ uint32_t all_crc;
+ ComponentCrcs component_crcs;
+ } __attribute__((packed));
+ static_assert(sizeof(Crcs) == 20, "");
+
+ // Info
+ struct Info {
+ static constexpr int32_t kFileOffset = static_cast<int32_t>(sizeof(Crcs));
+
+ int32_t version;
+ int32_t value_type_size;
+ int32_t max_load_factor_percent;
+ int32_t num_deleted_entries;
+ int32_t num_deleted_key_value_bytes;
+
+ Crc32 ComputeChecksum() const {
+ return Crc32(
+ std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
+ }
+ } __attribute__((packed));
+ static_assert(sizeof(Info) == 20, "");
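
The two static_asserts pin the metadata file down to exactly 40 bytes; byte-for-byte it is:

    offset  0..3    Crcs::all_crc (crc over component_crcs)
    offset  4..19   Crcs::component_crcs (info/bucket/entry/kv crcs)
    offset 20..23   Info::version
    offset 24..27   Info::value_type_size
    offset 28..31   Info::max_load_factor_percent
    offset 32..35   Info::num_deleted_entries
    offset 36..39   Info::num_deleted_key_value_bytes
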
+
+ // Bucket
+ class Bucket {
+ public:
+    // Absolute max # of buckets allowed. Since the max file size on Android
+    // is 2^31-1, we can have at most ~2^29 buckets. To make it a power of 2,
+    // round down to 2^28. Also, since we're using FileBackedVector to store
+    // buckets, add some static_asserts to ensure the numbers here are
+    // compatible with FileBackedVector.
+ static constexpr int32_t kMaxNumBuckets = 1 << 28;
+
+ explicit Bucket(int32_t head_entry_index = Entry::kInvalidIndex)
+ : head_entry_index_(head_entry_index) {}
+
+ // For FileBackedVector
+ bool operator==(const Bucket& other) const {
+ return head_entry_index_ == other.head_entry_index_;
+ }
+
+ int32_t head_entry_index() const { return head_entry_index_; }
+ void set_head_entry_index(int32_t head_entry_index) {
+ head_entry_index_ = head_entry_index;
+ }
+
+ private:
+ int32_t head_entry_index_;
+ } __attribute__((packed));
+ static_assert(sizeof(Bucket) == 4, "");
+ static_assert(sizeof(Bucket) == FileBackedVector<Bucket>::kElementTypeSize,
+ "Bucket type size is inconsistent with FileBackedVector "
+ "element type size");
+ static_assert(Bucket::kMaxNumBuckets <=
+ (FileBackedVector<Bucket>::kMaxFileSize -
+ FileBackedVector<Bucket>::Header::kHeaderSize) /
+ FileBackedVector<Bucket>::kElementTypeSize,
+ "Max # of buckets cannot fit into FileBackedVector");
+
+ // Entry
+ class Entry {
+ public:
+    // Absolute max # of entries allowed. Since the max file size on Android
+    // is 2^31-1, we can have at most ~2^28 entries. To make it a power of 2,
+    // round down to 2^27. Also, since we're using FileBackedVector to store
+    // entries, add some static_asserts to ensure the numbers here are
+    // compatible with FileBackedVector.
+    //
+    // Still, the actual max # of entries is determined by the key-value
+    // storage, since key lengths vary and affect the # of key-value pairs
+    // that can actually be stored.
+ static constexpr int32_t kMaxNumEntries = 1 << 27;
+ static constexpr int32_t kMaxIndex = kMaxNumEntries - 1;
+ static constexpr int32_t kInvalidIndex = -1;
+
+ explicit Entry(int32_t key_value_index, int32_t next_entry_index)
+ : key_value_index_(key_value_index),
+ next_entry_index_(next_entry_index) {}
+
+ bool operator==(const Entry& other) const {
+ return key_value_index_ == other.key_value_index_ &&
+ next_entry_index_ == other.next_entry_index_;
+ }
+
+ int32_t key_value_index() const { return key_value_index_; }
+ void set_key_value_index(int32_t key_value_index) {
+ key_value_index_ = key_value_index;
+ }
+
+ int32_t next_entry_index() const { return next_entry_index_; }
+ void set_next_entry_index(int32_t next_entry_index) {
+ next_entry_index_ = next_entry_index;
+ }
+
+ private:
+ int32_t key_value_index_;
+ int32_t next_entry_index_;
+ } __attribute__((packed));
+ static_assert(sizeof(Entry) == 8, "");
+ static_assert(sizeof(Entry) == FileBackedVector<Entry>::kElementTypeSize,
+ "Entry type size is inconsistent with FileBackedVector "
+ "element type size");
+ static_assert(Entry::kMaxNumEntries <=
+ (FileBackedVector<Entry>::kMaxFileSize -
+ FileBackedVector<Entry>::Header::kHeaderSize) /
+ FileBackedVector<Entry>::kElementTypeSize,
+ "Max # of entries cannot fit into FileBackedVector");
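
Together, Bucket and Entry form per-bucket singly-linked lists over the entry storage; each entry also points at its serialized key-value record. Since Insert() prepends (the bucket head is set to the newest entry), a chain reads most-recent-first:

    bucket_storage[b].head_entry_index
            |
            v
    entry_storage[e1] --next_entry_index--> entry_storage[e0] --> kInvalidIndex
            |                                       |
       key_value_index                         key_value_index
            v                                       v
    kv_storage: "key1\0<value>"             kv_storage: "key0\0<value>"
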
+
+ // Key-value serialized type
+ static constexpr int32_t kMaxKVTotalByteSize =
+ (FileBackedVector<char>::kMaxFileSize -
+ FileBackedVector<char>::Header::kHeaderSize) /
+ FileBackedVector<char>::kElementTypeSize;
+ static constexpr int32_t kMaxKVIndex = kMaxKVTotalByteSize - 1;
+ static constexpr int32_t kInvalidKVIndex = -1;
+ static_assert(sizeof(char) == FileBackedVector<char>::kElementTypeSize,
+ "Char type size is inconsistent with FileBackedVector element "
+ "type size");
+
+ static constexpr int32_t kVersion = 1;
+ static constexpr int32_t kDefaultMaxLoadFactorPercent = 75;
+
+ static constexpr std::string_view kFilePrefix = "persistent_hash_map";
+  // Only the metadata, bucket, and entry files are stored under this
+  // sub-directory, for branching use when rehashing.
+ static constexpr std::string_view kSubDirectory = "dynamic";
+
+ // Creates a new PersistentHashMap to read/write/delete key value pairs.
+ //
+ // filesystem: Object to make system level calls
+  // base_dir: Specifies the directory in which all persistent hash map
+  //           related sub-directories and files are stored. If base_dir
+  //           doesn't exist, then PersistentHashMap will automatically create
+  //           it. If files exist, then it will initialize the hash map from
+  //           the existing files.
+ // value_type_size: (fixed) size of the serialized value type for hash map.
+ // max_load_factor_percent: percentage of the max loading for the hash map.
+ // load_factor_percent = 100 * num_keys / num_buckets
+ // If load_factor_percent exceeds
+ // max_load_factor_percent, then rehash will be
+ // invoked (and # of buckets will be doubled).
+ // Note that load_factor_percent exceeding 100 is
+ // considered valid.
+ //
+ // Returns:
+ // FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
+ // checksum.
+ // INTERNAL_ERROR on I/O errors.
+ // Any FileBackedVector errors.
+ static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ Create(const Filesystem& filesystem, std::string_view base_dir,
+ int32_t value_type_size,
+ int32_t max_load_factor_percent = kDefaultMaxLoadFactorPercent);
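
A quick worked example of the load-factor contract, using the default limit of 75:

    // load_factor_percent = 100 * num_keys / num_buckets
    //   num_keys = 6, num_buckets = 8  ->  100 * 6 / 8 = 75  (at the limit)
    //   num_keys = 7, num_buckets = 8  ->  100 * 7 / 8 = 87  (> 75: rehash
    //       doubles num_buckets to 16, once rehashing is implemented; it is
    //       still a TODO in this change)
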
+
+ ~PersistentHashMap();
+
+  // Updates a key value pair. If the key does not exist, inserts (key, value)
+  // into the storage. Otherwise, overwrites the existing value.
+ //
+ // REQUIRES: the buffer pointed to by value must be of value_size()
+ //
+ // Returns:
+ // OK on success
+ // RESOURCE_EXHAUSTED_ERROR if # of entries reach kMaxNumEntries
+ // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0')
+ // INTERNAL_ERROR on I/O error or any data inconsistency
+ // Any FileBackedVector errors
+ libtextclassifier3::Status Put(std::string_view key, const void* value);
+
+ // If key does not exist, then insert (key, next_value) into the storage.
+ // Otherwise, copy the hash map value into next_value.
+ //
+ // REQUIRES: the buffer pointed to by next_value must be of value_size()
+ //
+ // Returns:
+ // OK on success
+ // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0')
+ // INTERNAL_ERROR on I/O error or any data inconsistency
+ // Any FileBackedVector errors
+ libtextclassifier3::Status GetOrPut(std::string_view key, void* next_value);
+
+  // Gets the value by key from the storage. If the key exists, copies the
+  // hash map value into the value buffer. Otherwise, returns NOT_FOUND_ERROR.
+ //
+ // REQUIRES: the buffer pointed to by value must be of value_size()
+ //
+ // Returns:
+ // OK on success
+ // NOT_FOUND_ERROR if the key doesn't exist
+ // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0')
+ // INTERNAL_ERROR on I/O error or any data inconsistency
+ // Any FileBackedVector errors
+ libtextclassifier3::Status Get(std::string_view key, void* value) const;
+
+ // Flushes content to underlying files.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status PersistToDisk();
+
+ // Calculates and returns the disk usage (metadata + 3 storages total file
+ // size) in bytes.
+ //
+ // Returns:
+ // Disk usage on success
+ // INTERNAL_ERROR on I/O error
+ libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+
+  // Returns the total file size, in bytes, of all the elements held in the
+  // persistent hash map. This excludes the size of any internal metadata,
+  // i.e. the crcs/info of the persistent hash map and the file backed
+  // vectors' headers.
+ //
+ // Returns:
+ // File size on success
+ // INTERNAL_ERROR on I/O error
+ libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;
+
+ // Updates all checksums of the persistent hash map components and returns
+ // all_crc.
+ //
+ // Returns:
+ // Crc of all components (all_crc) on success
+ // INTERNAL_ERROR if any data inconsistency
+ libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+
+ int32_t size() const {
+ return entry_storage_->num_elements() - info()->num_deleted_entries;
+ }
+
+ bool empty() const { return size() == 0; }
+
+ private:
+ explicit PersistentHashMap(
+ const Filesystem& filesystem, std::string_view base_dir,
+ std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,
+ std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
+ std::unique_ptr<FileBackedVector<Entry>> entry_storage,
+ std::unique_ptr<FileBackedVector<char>> kv_storage)
+ : filesystem_(&filesystem),
+ base_dir_(base_dir),
+ metadata_mmapped_file_(std::move(metadata_mmapped_file)),
+ bucket_storage_(std::move(bucket_storage)),
+ entry_storage_(std::move(entry_storage)),
+ kv_storage_(std::move(kv_storage)) {}
+
+ static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ InitializeNewFiles(const Filesystem& filesystem, std::string_view base_dir,
+ int32_t value_type_size, int32_t max_load_factor_percent);
+
+ static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ InitializeExistingFiles(const Filesystem& filesystem,
+ std::string_view base_dir, int32_t value_type_size,
+ int32_t max_load_factor_percent);
+
+  // Finds the index of the entry for the given key within the bucket
+  // specified by bucket_idx.
+ //
+ // Returns:
+ // int32_t: on success, the index of the entry, or Entry::kInvalidIndex if
+ // not found
+ // INTERNAL_ERROR if any content inconsistency
+ // Any FileBackedVector errors
+ libtextclassifier3::StatusOr<int32_t> FindEntryIndexByKey(
+ int32_t bucket_idx, std::string_view key) const;
+
+ // Copy the hash map value of the entry into value buffer.
+ //
+ // REQUIRES: entry_idx should be valid.
+ // REQUIRES: the buffer pointed to by value must be of value_size()
+ //
+ // Returns:
+ // OK on success
+ // Any FileBackedVector errors
+ libtextclassifier3::Status CopyEntryValue(int32_t entry_idx,
+ void* value) const;
+
+  // Inserts a new key value pair into the bucket specified by bucket_idx. The
+  // caller must make sure that the key is not already present in the hash map
+  // before calling.
+ //
+ // Returns:
+ // OK on success
+ // Any FileBackedVector errors
+ libtextclassifier3::Status Insert(int32_t bucket_idx, std::string_view key,
+ const void* value);
+
+ Crcs* crcs() {
+ return reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() +
+ Crcs::kFileOffset);
+ }
+
+ Info* info() {
+ return reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() +
+ Info::kFileOffset);
+ }
+
+ const Info* info() const {
+ return reinterpret_cast<const Info*>(metadata_mmapped_file_->region() +
+ Info::kFileOffset);
+ }
+
+ const Filesystem* filesystem_;
+ std::string base_dir_;
+
+ std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_;
+
+ // Storages
+ std::unique_ptr<FileBackedVector<Bucket>> bucket_storage_;
+ std::unique_ptr<FileBackedVector<Entry>> entry_storage_;
+ std::unique_ptr<FileBackedVector<char>> kv_storage_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_PERSISTENT_HASH_MAP_H_
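
To make the contract above concrete, here is a minimal usage sketch for an int-valued map. The directory path is illustrative, and error handling is reduced to the status macros the implementation itself uses:

    libtextclassifier3::Status UsageSketch(const Filesystem& filesystem) {
      ICING_ASSIGN_OR_RETURN(
          std::unique_ptr<PersistentHashMap> map,
          PersistentHashMap::Create(filesystem, "/tmp/phm_demo",
                                    /*value_type_size=*/sizeof(int)));
      int val = 42;
      ICING_RETURN_IF_ERROR(map->Put("my-key", &val));  // insert or overwrite
      int out = 0;
      ICING_RETURN_IF_ERROR(map->Get("my-key", &out));  // out == 42
      int next = 100;
      // Key already exists, so GetOrPut() copies 42 into next instead of
      // inserting 100.
      ICING_RETURN_IF_ERROR(map->GetOrPut("my-key", &next));
      return map->PersistToDisk();
    }
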
diff --git a/icing/file/persistent-hash-map_test.cc b/icing/file/persistent-hash-map_test.cc
new file mode 100644
index 0000000..fb15175
--- /dev/null
+++ b/icing/file/persistent-hash-map_test.cc
@@ -0,0 +1,662 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/persistent-hash-map.h"
+
+#include <cstring>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+static constexpr int32_t kCorruptedValueOffset = 3;
+
+using ::testing::Eq;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::Not;
+using ::testing::Pointee;
+using ::testing::SizeIs;
+
+using Bucket = PersistentHashMap::Bucket;
+using Crcs = PersistentHashMap::Crcs;
+using Entry = PersistentHashMap::Entry;
+using Info = PersistentHashMap::Info;
+
+class PersistentHashMapTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ base_dir_ = GetTestTempDir() + "/persistent_hash_map_test";
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
+ std::vector<char> Serialize(int val) {
+ std::vector<char> ret(sizeof(val));
+ memcpy(ret.data(), &val, sizeof(val));
+ return ret;
+ }
+
+ libtextclassifier3::StatusOr<int> GetValueByKey(
+ PersistentHashMap* persistent_hash_map, std::string_view key) {
+ int val;
+ ICING_RETURN_IF_ERROR(persistent_hash_map->Get(key, &val));
+ return val;
+ }
+
+ Filesystem filesystem_;
+ std::string base_dir_;
+};
+
+TEST_F(PersistentHashMapTest, InvalidBaseDir) {
+ EXPECT_THAT(PersistentHashMap::Create(filesystem_, "/dev/null",
+ /*value_type_size=*/sizeof(int)),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(PersistentHashMapTest, InitializeNewFiles) {
+ {
+ ASSERT_FALSE(filesystem_.DirectoryExists(base_dir_.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ EXPECT_THAT(persistent_hash_map, Pointee(IsEmpty()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ // Metadata file should be initialized correctly for both info and crcs
+ // sections.
+ const std::string metadata_file_path =
+ absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ // Check info section
+ Info info;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info),
+ Info::kFileOffset));
+ EXPECT_THAT(info.version, Eq(PersistentHashMap::kVersion));
+ EXPECT_THAT(info.value_type_size, Eq(sizeof(int)));
+ EXPECT_THAT(info.max_load_factor_percent,
+ Eq(PersistentHashMap::kDefaultMaxLoadFactorPercent));
+ EXPECT_THAT(info.num_deleted_entries, Eq(0));
+ EXPECT_THAT(info.num_deleted_key_value_bytes, Eq(0));
+
+ // Check crcs section
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ Crcs::kFileOffset));
+  // The # of elements in bucket_storage should be 1, so it should have a
+  // non-zero crc value.
+ EXPECT_THAT(crcs.component_crcs.bucket_storage_crc, Not(Eq(0)));
+  // The other file backed vectors are empty and should have a crc value of 0.
+ EXPECT_THAT(crcs.component_crcs.entry_storage_crc, Eq(0));
+ EXPECT_THAT(crcs.component_crcs.kv_storage_crc, Eq(0));
+ EXPECT_THAT(crcs.component_crcs.info_crc,
+ Eq(Crc32(std::string_view(reinterpret_cast<const char*>(&info),
+ sizeof(Info)))
+ .Get()));
+ EXPECT_THAT(crcs.all_crc,
+ Eq(Crc32(std::string_view(
+ reinterpret_cast<const char*>(&crcs.component_crcs),
+ sizeof(Crcs::ComponentCrcs)))
+ .Get()));
+}
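// The corruption tests below repeatedly recompute all_crc over the component
// crcs after tampering with one of them. A small helper capturing that exact
// computation (a sketch; the tests inline the expression instead):
uint32_t ComputeAllCrc(const Crcs& crcs) {
  // all_crc covers the ComponentCrcs block, matching the assertion above.
  return Crc32(std::string_view(
                   reinterpret_cast<const char*>(&crcs.component_crcs),
                   sizeof(Crcs::ComponentCrcs)))
      .Get();
}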
+
+TEST_F(PersistentHashMapTest,
+ TestInitializationFailsWithoutPersistToDiskOrDestruction) {
+ // Create new persistent hash map
+ // Set max_load_factor_percent as 1000. Load factor percent is calculated as
+ // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of
+ // buckets in an empty PersistentHashMap) and a max_load_factor_percent of
+ // 1000, we would allow the insertion of up to 10 keys before rehashing, to
+ // avoid PersistToDisk being called implicitly by rehashing.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ /*max_load_factor_percent=*/1000));
+
+ // Put some key value pairs.
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+ ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data()));
+ // TODO(b/193919210): call Delete() to change PersistentHashMap header
+
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+
+ // Without calling PersistToDisk, checksums will not be recomputed or synced
+ // to disk, so initializing another instance on the same files should fail.
+ EXPECT_THAT(PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ /*max_load_factor_percent=*/1000),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
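// To make the 10-key bound in the comment above concrete: rehashing
// presumably triggers once the load factor percent exceeds the configured
// maximum (a sketch of the condition, assuming integer arithmetic):
//
//   bool WouldRehash(int32_t num_keys, int32_t num_buckets,
//                    int32_t max_load_factor_percent) {
//     return 100 * num_keys / num_buckets > max_load_factor_percent;
//   }
//
// With num_buckets = 1 and max_load_factor_percent = 1000, the 10th key gives
// 100 * 10 / 1 = 1000 (no rehash), while an 11th key would reach 1100 and
// trigger rehashing.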
+
+TEST_F(PersistentHashMapTest, TestInitializationSucceedsWithPersistToDisk) {
+ // Create new persistent hash map
+ // Set max_load_factor_percent as 1000. Load factor percent is calculated as
+ // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of
+ // buckets in an empty PersistentHashMap) and a max_load_factor_percent of
+ // 1000, we would allow the insertion of up to 10 keys before rehashing, to
+ // avoid PersistToDisk being called implicitly by rehashing.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map1,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ /*max_load_factor_percent=*/1000));
+
+ // Put some key value pairs.
+ ICING_ASSERT_OK(persistent_hash_map1->Put("a", Serialize(1).data()));
+ ICING_ASSERT_OK(persistent_hash_map1->Put("b", Serialize(2).data()));
+ // TODO(b/193919210): call Delete() to change PersistentHashMap header
+
+ ASSERT_THAT(persistent_hash_map1, Pointee(SizeIs(2)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map1.get(), "a"), IsOkAndHolds(1));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map1.get(), "b"), IsOkAndHolds(2));
+
+ // After calling PersistToDisk, all checksums should be recomputed and synced
+ // correctly to disk, so initializing another instance on the same files
+ // should succeed, and we should be able to get the same contents.
+ ICING_EXPECT_OK(persistent_hash_map1->PersistToDisk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map2,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ /*max_load_factor_percent=*/1000));
+ EXPECT_THAT(persistent_hash_map2, Pointee(SizeIs(2)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map2.get(), "a"), IsOkAndHolds(1));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map2.get(), "b"), IsOkAndHolds(2));
+}
+
+TEST_F(PersistentHashMapTest, TestInitializationSucceedsAfterDestruction) {
+ {
+ // Create new persistent hash map
+ // Set max_load_factor_percent as 1000. Load factor percent is calculated as
+ // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of
+ // buckets in an empty PersistentHashMap) and a max_load_factor_percent of
+ // 1000, we would allow the insertion of up to 10 keys before rehashing, to
+ // avoid PersistToDisk being called implicitly by rehashing.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ /*max_load_factor_percent=*/1000));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+ ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data()));
+ // TODO(b/193919210): call Delete() to change PersistentHashMap header
+
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+ }
+
+ {
+    // The previous instance went out of scope and was destroyed. Although we
+    // didn't call PersistToDisk explicitly, the destructor should invoke it,
+    // so initializing another instance on the same files should succeed and
+    // we should be able to get the same contents.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ /*max_load_factor_percent=*/1000));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+ }
+}
+
+TEST_F(PersistentHashMapTest,
+ InitializeExistingFilesWithDifferentValueTypeSizeShouldFail) {
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ {
+    // Attempt to create the persistent hash map with a different value type
+    // size. This should fail.
+ ASSERT_THAT(sizeof(char), Not(Eq(sizeof(int))));
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or = PersistentHashMap::Create(
+ filesystem_, base_dir_, /*value_type_size=*/sizeof(char));
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("Incorrect value type size"));
+ }
+}
+
+TEST_F(PersistentHashMapTest, InitializeExistingFilesWithWrongAllCrc) {
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path =
+ absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ Crcs::kFileOffset));
+
+ // Manually corrupt all_crc
+ crcs.all_crc += kCorruptedValueOffset;
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs,
+ sizeof(Crcs)));
+ metadata_sfd.reset();
+
+ {
+ // Attempt to create the persistent hash map with metadata containing
+ // corrupted all_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or = PersistentHashMap::Create(
+ filesystem_, base_dir_, /*value_type_size=*/sizeof(int));
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("Invalid all crc for PersistentHashMap"));
+ }
+}
+
+TEST_F(PersistentHashMapTest,
+ InitializeExistingFilesWithCorruptedInfoShouldFail) {
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path =
+ absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Info info;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info),
+ Info::kFileOffset));
+
+  // Modify info, but don't update the checksum. This simulates corruption of
+  // the info section.
+ info.num_deleted_entries += kCorruptedValueOffset;
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Info::kFileOffset, &info,
+ sizeof(Info)));
+ {
+ // Attempt to create the persistent hash map with info that doesn't match
+ // its checksum and confirm that it fails.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or = PersistentHashMap::Create(
+ filesystem_, base_dir_, /*value_type_size=*/sizeof(int));
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("Invalid info crc for PersistentHashMap"));
+ }
+}
+
+TEST_F(PersistentHashMapTest,
+ InitializeExistingFilesWithWrongBucketStorageCrc) {
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path =
+ absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ Crcs::kFileOffset));
+
+ // Manually corrupt bucket_storage_crc
+ crcs.component_crcs.bucket_storage_crc += kCorruptedValueOffset;
+ crcs.all_crc = Crc32(std::string_view(
+ reinterpret_cast<const char*>(&crcs.component_crcs),
+ sizeof(Crcs::ComponentCrcs)))
+ .Get();
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs,
+ sizeof(Crcs)));
+ {
+ // Attempt to create the persistent hash map with metadata containing
+ // corrupted bucket_storage_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or = PersistentHashMap::Create(
+ filesystem_, base_dir_, /*value_type_size=*/sizeof(int));
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ persistent_hash_map_or.status().error_message(),
+ HasSubstr("Mismatch crc with PersistentHashMap bucket storage"));
+ }
+}
+
+TEST_F(PersistentHashMapTest, InitializeExistingFilesWithWrongEntryStorageCrc) {
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path =
+ absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ Crcs::kFileOffset));
+
+ // Manually corrupt entry_storage_crc
+ crcs.component_crcs.entry_storage_crc += kCorruptedValueOffset;
+ crcs.all_crc = Crc32(std::string_view(
+ reinterpret_cast<const char*>(&crcs.component_crcs),
+ sizeof(Crcs::ComponentCrcs)))
+ .Get();
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs,
+ sizeof(Crcs)));
+ {
+ // Attempt to create the persistent hash map with metadata containing
+ // corrupted entry_storage_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or = PersistentHashMap::Create(
+ filesystem_, base_dir_, /*value_type_size=*/sizeof(int));
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("Mismatch crc with PersistentHashMap entry storage"));
+ }
+}
+
+TEST_F(PersistentHashMapTest,
+ InitializeExistingFilesWithWrongKeyValueStorageCrc) {
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path =
+ absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ Crcs::kFileOffset));
+
+ // Manually corrupt kv_storage_crc
+ crcs.component_crcs.kv_storage_crc += kCorruptedValueOffset;
+ crcs.all_crc = Crc32(std::string_view(
+ reinterpret_cast<const char*>(&crcs.component_crcs),
+ sizeof(Crcs::ComponentCrcs)))
+ .Get();
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(), Crcs::kFileOffset, &crcs,
+ sizeof(Crcs)));
+ {
+ // Attempt to create the persistent hash map with metadata containing
+ // corrupted kv_storage_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or = PersistentHashMap::Create(
+ filesystem_, base_dir_, /*value_type_size=*/sizeof(int));
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ persistent_hash_map_or.status().error_message(),
+ HasSubstr("Mismatch crc with PersistentHashMap key value storage"));
+ }
+}
+
+TEST_F(PersistentHashMapTest,
+ InitializeExistingFilesAllowDifferentMaxLoadFactorPercent) {
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+ ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data()));
+
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ int32_t new_max_load_factor_percent = 100;
+ {
+ ASSERT_THAT(new_max_load_factor_percent,
+ Not(Eq(PersistentHashMap::kDefaultMaxLoadFactorPercent)));
+    // Attempt to create the persistent hash map with a different max load
+    // factor percent. This should succeed, the metadata should be updated
+    // accordingly, and all entries should remain unchanged.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ new_max_load_factor_percent));
+
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path =
+ absl_ports::StrCat(base_dir_, "/", PersistentHashMap::kSubDirectory, "/",
+ PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Info info;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info),
+ Info::kFileOffset));
+ EXPECT_THAT(info.max_load_factor_percent, Eq(new_max_load_factor_percent));
+
+  // The crcs should also be updated correctly. We test this by creating the
+  // instance again and making sure it doesn't hit corrupted crcs/info errors.
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int),
+ new_max_load_factor_percent));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+}
+
+TEST_F(PersistentHashMapTest, PutAndGet) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+
+ EXPECT_THAT(persistent_hash_map, Pointee(IsEmpty()));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-youtube.com"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_EXPECT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(100).data()));
+ ICING_EXPECT_OK(
+ persistent_hash_map->Put("default-youtube.com", Serialize(50).data()));
+
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(100));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-youtube.com"),
+ IsOkAndHolds(50));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "key-not-exist"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+}
+
+TEST_F(PersistentHashMapTest, PutShouldOverwriteValueIfKeyExists) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(100).data()));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(100));
+
+ ICING_EXPECT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(200).data()));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(200));
+
+ ICING_EXPECT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(300).data()));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(300));
+}
+
+TEST_F(PersistentHashMapTest, GetOrPutShouldPutIfKeyDoesNotExist) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ int val = 1;
+ EXPECT_THAT(persistent_hash_map->GetOrPut("default-google.com", &val),
+ IsOk());
+ EXPECT_THAT(val, Eq(1));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(1));
+}
+
+TEST_F(PersistentHashMapTest, GetOrPutShouldGetIfKeyExists) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+
+ ASSERT_THAT(
+ persistent_hash_map->Put("default-google.com", Serialize(1).data()),
+ IsOk());
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(1));
+
+ int val = 2;
+ EXPECT_THAT(persistent_hash_map->GetOrPut("default-google.com", &val),
+ IsOk());
+ EXPECT_THAT(val, Eq(1));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(1));
+}
+
+TEST_F(PersistentHashMapTest, ShouldFailIfKeyContainsTerminationCharacter) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, base_dir_,
+ /*value_type_size=*/sizeof(int)));
+
+ const char invalid_key[] = "a\0bc";
+ std::string_view invalid_key_view(invalid_key, 4);
+
+ int val = 1;
+ EXPECT_THAT(persistent_hash_map->Put(invalid_key_view, &val),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(persistent_hash_map->GetOrPut(invalid_key_view, &val),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(persistent_hash_map->Get(invalid_key_view, &val),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index e390f0f..4089ec9 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -49,14 +49,16 @@
#include "icing/proto/status.pb.h"
#include "icing/query/query-processor.h"
#include "icing/query/suggestion-processor.h"
+#include "icing/result/page-result.h"
#include "icing/result/projection-tree.h"
#include "icing/result/projector.h"
-#include "icing/result/result-retriever.h"
+#include "icing/result/result-retriever-v2.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/schema-util.h"
#include "icing/schema/section.h"
-#include "icing/scoring/ranker.h"
+#include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
#include "icing/scoring/scored-document-hit.h"
+#include "icing/scoring/scored-document-hits-ranker.h"
#include "icing/scoring/scoring-processor.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
@@ -112,6 +114,11 @@ libtextclassifier3::Status ValidateResultSpec(
return absl_ports::InvalidArgumentError(
"ResultSpecProto.num_per_page cannot be negative.");
}
+ if (result_spec.num_total_bytes_per_page_threshold() <= 0) {
+ return absl_ports::InvalidArgumentError(
+ "ResultSpecProto.num_total_bytes_per_page_threshold cannot be "
+ "non-positive.");
+ }
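// (Illustrative usage: a client opting into a tighter page byte budget would
// set, e.g., result_spec.set_num_total_bytes_per_page_threshold(64 * 1024).)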
std::unordered_set<std::string> unique_namespaces;
for (const ResultSpecProto::ResultGrouping& result_grouping :
result_spec.result_groupings()) {
@@ -263,9 +270,9 @@ void TransformStatus(const libtextclassifier3::Status& internal_status,
case libtextclassifier3::StatusCode::UNAUTHENTICATED:
// Other internal status codes aren't supported externally yet. If it
// should be supported, add another switch-case above.
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Internal status code %d not supported in the external API",
- internal_status.error_code());
+ ICING_LOG(ERROR) << "Internal status code "
+ << internal_status.error_code()
+ << " not supported in the external API";
code = StatusProto::UNKNOWN;
break;
}
@@ -295,6 +302,17 @@ libtextclassifier3::Status RetrieveAndAddDocumentInfo(
return libtextclassifier3::Status::OK;
}
+bool ShouldRebuildIndex(const OptimizeStatsProto& optimize_stats) {
+ int num_invalid_documents = optimize_stats.num_deleted_documents() +
+ optimize_stats.num_expired_documents();
+ // Rebuilding the index could be faster than optimizing the index if we have
+ // removed most of the documents.
+ // Based on benchmarks, 85%~95% seems to be a good threshold for most cases.
+ // TODO(b/238236206): Try using the number of remaining hits in this
+ // condition, and allow clients to configure the threshold.
+ return num_invalid_documents >= optimize_stats.num_original_documents() * 0.9;
+}
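// Worked example with illustrative numbers: 100 original documents with 92
// deleted or expired gives 92 >= 100 * 0.9 = 90, so a full rebuild would be
// chosen over index translation.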
+
} // namespace
IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options,
@@ -634,18 +652,18 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
- std::unique_ptr<Timer> timer = clock_->GetNewTimer();
+ ScopedTimer timer(clock_->GetNewTimer(), [&result_proto](int64_t t) {
+ result_proto.set_latency_ms(t);
+ });
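// The ScopedTimer introduced above is what lets the explicit set_latency_ms
// calls on each early-return path below be deleted: its callback fires with
// the elapsed time whenever the timer leaves scope. A minimal sketch of such
// a helper, assuming only the Timer::GetElapsedMilliseconds() API already
// used in this file (hypothetical name; the real class is defined elsewhere
// in this change):
//
//   class ScopedTimerSketch {
//    public:
//     ScopedTimerSketch(std::unique_ptr<Timer> timer,
//                       std::function<void(int64_t)> on_done)
//         : timer_(std::move(timer)), on_done_(std::move(on_done)) {}
//     // Report the total elapsed time exactly once, on any exit path.
//     ~ScopedTimerSketch() { on_done_(timer_->GetElapsedMilliseconds()); }
//
//    private:
//     std::unique_ptr<Timer> timer_;
//     std::function<void(int64_t)> on_done_;
//   };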
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
auto lost_previous_schema_or = LostPreviousSchema();
if (!lost_previous_schema_or.ok()) {
TransformStatus(lost_previous_schema_or.status(), result_status);
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
bool lost_previous_schema = lost_previous_schema_or.ValueOrDie();
@@ -663,7 +681,6 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
std::move(new_schema), ignore_errors_and_delete_documents);
if (!set_schema_result_or.ok()) {
TransformStatus(set_schema_result_or.status(), result_status);
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
SchemaStore::SetSchemaResult set_schema_result =
@@ -706,7 +723,6 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
status = document_store_->UpdateSchemaStore(schema_store_.get());
if (!status.ok()) {
TransformStatus(status, result_status);
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
} else if (!set_schema_result.old_schema_type_ids_changed.empty() ||
@@ -716,7 +732,6 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
set_schema_result);
if (!status.ok()) {
TransformStatus(status, result_status);
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
}
@@ -726,7 +741,6 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
status = index_->Reset();
if (!status.ok()) {
TransformStatus(status, result_status);
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
@@ -737,7 +751,6 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
if (!restore_result.status.ok() &&
!absl_ports::IsDataLoss(restore_result.status)) {
TransformStatus(status, result_status);
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
}
@@ -748,7 +761,6 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
result_status->set_message("Schema is incompatible.");
}
- result_proto.set_latency_ms(timer->GetElapsedMilliseconds());
return result_proto;
}
@@ -804,12 +816,13 @@ PutResultProto IcingSearchEngine::Put(const DocumentProto& document) {
PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
ICING_VLOG(1) << "Writing document to document store";
- std::unique_ptr<Timer> put_timer = clock_->GetNewTimer();
-
PutResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
PutDocumentStatsProto* put_document_stats =
result_proto.mutable_put_document_stats();
+ ScopedTimer put_timer(clock_->GetNewTimer(), [put_document_stats](int64_t t) {
+ put_document_stats->set_latency_ms(t);
+ });
// Lock must be acquired before validation because the DocumentStore uses
// the schema file to validate, and the schema could be changed in
@@ -818,7 +831,6 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
- put_document_stats->set_latency_ms(put_timer->GetElapsedMilliseconds());
return result_proto;
}
@@ -826,7 +838,6 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
schema_store_.get(), language_segmenter_.get(), std::move(document));
if (!tokenized_document_or.ok()) {
TransformStatus(tokenized_document_or.status(), result_status);
- put_document_stats->set_latency_ms(put_timer->GetElapsedMilliseconds());
return result_proto;
}
TokenizedDocument tokenized_document(
@@ -837,7 +848,6 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
tokenized_document.num_tokens(), put_document_stats);
if (!document_id_or.ok()) {
TransformStatus(document_id_or.status(), result_status);
- put_document_stats->set_latency_ms(put_timer->GetElapsedMilliseconds());
return result_proto;
}
DocumentId document_id = document_id_or.ValueOrDie();
@@ -846,7 +856,6 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
IndexProcessor::Create(normalizer_.get(), index_.get(), clock_.get());
if (!index_processor_or.ok()) {
TransformStatus(index_processor_or.status(), result_status);
- put_document_stats->set_latency_ms(put_timer->GetElapsedMilliseconds());
return result_proto;
}
std::unique_ptr<IndexProcessor> index_processor =
@@ -867,7 +876,6 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
}
TransformStatus(status, result_status);
- put_document_stats->set_latency_ms(put_timer->GetElapsedMilliseconds());
return result_proto;
}
@@ -1081,7 +1089,9 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
delete_stats->set_num_schema_types_filtered(
search_spec.schema_type_filters_size());
- std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
+ ScopedTimer delete_timer(clock_->GetNewTimer(), [delete_stats](int64_t t) {
+ delete_stats->set_latency_ms(t);
+ });
libtextclassifier3::Status status =
ValidateSearchSpec(search_spec, performance_configuration_);
if (!status.ok()) {
@@ -1096,6 +1106,8 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
document_store_.get(), schema_store_.get());
if (!query_processor_or.ok()) {
TransformStatus(query_processor_or.status(), result_status);
+ delete_stats->set_parse_query_latency_ms(
+ component_timer->GetElapsedMilliseconds());
return result_proto;
}
std::unique_ptr<QueryProcessor> query_processor =
@@ -1104,6 +1116,8 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
auto query_results_or = query_processor->ParseSearch(search_spec);
if (!query_results_or.ok()) {
TransformStatus(query_results_or.status(), result_status);
+ delete_stats->set_parse_query_latency_ms(
+ component_timer->GetElapsedMilliseconds());
return result_proto;
}
QueryProcessor::QueryResults query_results =
@@ -1131,6 +1145,8 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
query_results.root_iterator->doc_hit_info().document_id());
if (!status.ok()) {
TransformStatus(status, result_status);
+ delete_stats->set_document_removal_latency_ms(
+ component_timer->GetElapsedMilliseconds());
return result_proto;
}
}
@@ -1138,6 +1154,8 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
query_results.root_iterator->doc_hit_info().document_id());
if (!status.ok()) {
TransformStatus(status, result_status);
+ delete_stats->set_document_removal_latency_ms(
+ component_timer->GetElapsedMilliseconds());
return result_proto;
}
}
@@ -1156,7 +1174,6 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
result_proto.mutable_status()->set_message(
"No documents matched the query to delete by!");
}
- delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
delete_stats->set_num_documents_deleted(num_deleted);
return result_proto;
}
@@ -1199,11 +1216,10 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
return result_proto;
}
- std::unique_ptr<Timer> optimize_timer = clock_->GetNewTimer();
OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats();
- int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
- optimize_stats->set_storage_size_before(
- Filesystem::SanitizeFileSize(before_size));
+ ScopedTimer optimize_timer(
+ clock_->GetNewTimer(),
+ [optimize_stats](int64_t t) { optimize_stats->set_latency_ms(t); });
// Flushes data to disk before doing optimization
auto status = InternalPersistToDisk(PersistType::FULL);
@@ -1212,52 +1228,85 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
return result_proto;
}
+ int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
+ optimize_stats->set_storage_size_before(
+ Filesystem::SanitizeFileSize(before_size));
+
// TODO(b/143646633): figure out if we need to optimize index and doc store
// at the same time.
std::unique_ptr<Timer> optimize_doc_store_timer = clock_->GetNewTimer();
- libtextclassifier3::Status optimization_status =
- OptimizeDocumentStore(optimize_stats);
+ libtextclassifier3::StatusOr<std::vector<DocumentId>>
+ document_id_old_to_new_or = OptimizeDocumentStore(optimize_stats);
optimize_stats->set_document_store_optimize_latency_ms(
optimize_doc_store_timer->GetElapsedMilliseconds());
- if (!optimization_status.ok() &&
- !absl_ports::IsDataLoss(optimization_status)) {
+ if (!document_id_old_to_new_or.ok() &&
+ !absl_ports::IsDataLoss(document_id_old_to_new_or.status())) {
// The status now is either ABORTED_ERROR or INTERNAL_ERROR.
// If ABORTED_ERROR, Icing should still be working.
// If INTERNAL_ERROR, we're having IO errors or other errors that we can't
// recover from.
- TransformStatus(optimization_status, result_status);
+ TransformStatus(document_id_old_to_new_or.status(), result_status);
return result_proto;
}
// The status is either OK or DATA_LOSS. The optimized document store is
// guaranteed to work, so we update index according to the new document store.
std::unique_ptr<Timer> optimize_index_timer = clock_->GetNewTimer();
- libtextclassifier3::Status index_reset_status = index_->Reset();
- if (!index_reset_status.ok()) {
- status = absl_ports::Annotate(
- absl_ports::InternalError("Failed to reset index after optimization."),
- index_reset_status.error_message());
- TransformStatus(status, result_status);
- return result_proto;
+ bool should_rebuild_index =
+ !document_id_old_to_new_or.ok() || ShouldRebuildIndex(*optimize_stats);
+ if (!should_rebuild_index) {
+ optimize_stats->set_index_restoration_mode(
+ OptimizeStatsProto::INDEX_TRANSLATION);
+ libtextclassifier3::Status index_optimize_status =
+ index_->Optimize(document_id_old_to_new_or.ValueOrDie());
+ if (!index_optimize_status.ok()) {
+ ICING_LOG(WARNING) << "Failed to optimize index. Error: "
+ << index_optimize_status.error_message();
+ should_rebuild_index = true;
+ }
}
+ // If we received a DATA_LOSS error from OptimizeDocumentStore, we have a
+ // valid document store, but it might be the old one or the new one. So throw
+ // out the index and rebuild from scratch.
+ // Likewise, if Index::Optimize failed, then attempt to recover the index by
+ // rebuilding from scratch.
+ // If ShouldRebuildIndex() returns true, we will also rebuild the index for
+ // better performance.
+ if (should_rebuild_index) {
+ optimize_stats->set_index_restoration_mode(
+ OptimizeStatsProto::FULL_INDEX_REBUILD);
+ ICING_LOG(WARNING) << "Resetting the entire index!";
+ libtextclassifier3::Status index_reset_status = index_->Reset();
+ if (!index_reset_status.ok()) {
+ status = absl_ports::Annotate(
+ absl_ports::InternalError("Failed to reset index."),
+ index_reset_status.error_message());
+ TransformStatus(status, result_status);
+ optimize_stats->set_index_restoration_latency_ms(
+ optimize_index_timer->GetElapsedMilliseconds());
+ return result_proto;
+ }
- IndexRestorationResult index_restoration_status = RestoreIndexIfNeeded();
- optimize_stats->set_index_restoration_latency_ms(
- optimize_index_timer->GetElapsedMilliseconds());
- // DATA_LOSS means that we have successfully re-added content to the index.
- // Some indexed content was lost, but otherwise the index is in a valid state
- // and can be queried.
- if (!index_restoration_status.status.ok() &&
- !absl_ports::IsDataLoss(index_restoration_status.status)) {
- status = absl_ports::Annotate(
- absl_ports::InternalError(
- "Failed to reindex documents after optimization."),
- index_restoration_status.status.error_message());
+ IndexRestorationResult index_restoration_status = RestoreIndexIfNeeded();
+ // DATA_LOSS means that we have successfully re-added content to the index.
+ // Some indexed content was lost, but otherwise the index is in a valid
+ // state and can be queried.
+ if (!index_restoration_status.status.ok() &&
+ !absl_ports::IsDataLoss(index_restoration_status.status)) {
+ status = absl_ports::Annotate(
+ absl_ports::InternalError(
+ "Failed to reindex documents after optimization."),
+ index_restoration_status.status.error_message());
- TransformStatus(status, result_status);
- return result_proto;
+ TransformStatus(status, result_status);
+ optimize_stats->set_index_restoration_latency_ms(
+ optimize_index_timer->GetElapsedMilliseconds());
+ return result_proto;
+ }
}
+ optimize_stats->set_index_restoration_latency_ms(
+ optimize_index_timer->GetElapsedMilliseconds());
// Read the optimize status to get the time that we last ran.
std::string optimize_status_filename =
@@ -1279,12 +1328,18 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
optimize_status->set_last_successful_optimize_run_time_ms(current_time);
optimize_status_file.Write(std::move(optimize_status));
+ // Flushes data to disk after doing optimization
+ status = InternalPersistToDisk(PersistType::FULL);
+ if (!status.ok()) {
+ TransformStatus(status, result_status);
+ return result_proto;
+ }
+
int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
optimize_stats->set_storage_size_after(
Filesystem::SanitizeFileSize(after_size));
- optimize_stats->set_latency_ms(optimize_timer->GetElapsedMilliseconds());
- TransformStatus(optimization_status, result_status);
+ TransformStatus(document_id_old_to_new_or.status(), result_status);
return result_proto;
}
@@ -1442,7 +1497,9 @@ SearchResultProto IcingSearchEngine::Search(
QueryStatsProto* query_stats = result_proto.mutable_query_stats();
query_stats->set_query_length(search_spec.query().length());
- std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
+ ScopedTimer overall_timer(clock_->GetNewTimer(), [query_stats](int64_t t) {
+ query_stats->set_latency_ms(t);
+ });
libtextclassifier3::Status status = ValidateResultSpec(result_spec);
if (!status.ok()) {
@@ -1470,6 +1527,8 @@ SearchResultProto IcingSearchEngine::Search(
document_store_.get(), schema_store_.get());
if (!query_processor_or.ok()) {
TransformStatus(query_processor_or.status(), result_status);
+ query_stats->set_parse_query_latency_ms(
+ component_timer->GetElapsedMilliseconds());
return result_proto;
}
std::unique_ptr<QueryProcessor> query_processor =
@@ -1478,6 +1537,8 @@ SearchResultProto IcingSearchEngine::Search(
auto query_results_or = query_processor->ParseSearch(search_spec);
if (!query_results_or.ok()) {
TransformStatus(query_results_or.status(), result_status);
+ query_stats->set_parse_query_latency_ms(
+ component_timer->GetElapsedMilliseconds());
return result_proto;
}
QueryProcessor::QueryResults query_results =
@@ -1498,6 +1559,8 @@ SearchResultProto IcingSearchEngine::Search(
scoring_spec, document_store_.get(), schema_store_.get());
if (!scoring_processor_or.ok()) {
TransformStatus(scoring_processor_or.status(), result_status);
+ query_stats->set_scoring_latency_ms(
+ component_timer->GetElapsedMilliseconds());
return result_proto;
}
std::unique_ptr<ScoringProcessor> scoring_processor =
@@ -1517,62 +1580,62 @@ SearchResultProto IcingSearchEngine::Search(
}
component_timer = clock_->GetNewTimer();
- // Ranks and paginates results
- libtextclassifier3::StatusOr<PageResultState> page_result_state_or =
- result_state_manager_->RankAndPaginate(ResultState(
- std::move(result_document_hits), std::move(query_results.query_terms),
- search_spec, scoring_spec, result_spec, *document_store_));
- if (!page_result_state_or.ok()) {
- TransformStatus(page_result_state_or.status(), result_status);
- return result_proto;
- }
- PageResultState page_result_state =
- std::move(page_result_state_or).ValueOrDie();
+ // Ranks results
+ std::unique_ptr<ScoredDocumentHitsRanker> ranker =
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(result_document_hits),
+ /*is_descending=*/scoring_spec.order_by() ==
+ ScoringSpecProto::Order::DESC);
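// Unlike the removed eager RankAndPaginate flow, a priority-queue ranker can
// presumably defer sorting work, popping only as many hits as each retrieved
// page actually needs.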
query_stats->set_ranking_latency_ms(
component_timer->GetElapsedMilliseconds());
component_timer = clock_->GetNewTimer();
- // Retrieves the document protos and snippets if requested
+  // Caches the ranked results and retrieves the first page of document protos
+  // and snippets if requested
auto result_retriever_or =
- ResultRetriever::Create(document_store_.get(), schema_store_.get(),
- language_segmenter_.get(), normalizer_.get());
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get());
if (!result_retriever_or.ok()) {
- result_state_manager_->InvalidateResultState(
- page_result_state.next_page_token);
TransformStatus(result_retriever_or.status(), result_status);
+ query_stats->set_document_retrieval_latency_ms(
+ component_timer->GetElapsedMilliseconds());
return result_proto;
}
- std::unique_ptr<ResultRetriever> result_retriever =
+ std::unique_ptr<ResultRetrieverV2> result_retriever =
std::move(result_retriever_or).ValueOrDie();
- libtextclassifier3::StatusOr<std::vector<SearchResultProto::ResultProto>>
- results_or = result_retriever->RetrieveResults(page_result_state);
- if (!results_or.ok()) {
- result_state_manager_->InvalidateResultState(
- page_result_state.next_page_token);
- TransformStatus(results_or.status(), result_status);
+ libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
+ page_result_info_or = result_state_manager_->CacheAndRetrieveFirstPage(
+ std::move(ranker), std::move(query_results.query_terms), search_spec,
+ scoring_spec, result_spec, *document_store_, *result_retriever);
+ if (!page_result_info_or.ok()) {
+ TransformStatus(page_result_info_or.status(), result_status);
+ query_stats->set_document_retrieval_latency_ms(
+ component_timer->GetElapsedMilliseconds());
return result_proto;
}
- std::vector<SearchResultProto::ResultProto> results =
- std::move(results_or).ValueOrDie();
+ std::pair<uint64_t, PageResult> page_result_info =
+ std::move(page_result_info_or).ValueOrDie();
// Assembles the final search result proto
- result_proto.mutable_results()->Reserve(results.size());
- for (SearchResultProto::ResultProto& result : results) {
+ result_proto.mutable_results()->Reserve(
+ page_result_info.second.results.size());
+ for (SearchResultProto::ResultProto& result :
+ page_result_info.second.results) {
result_proto.mutable_results()->Add(std::move(result));
}
+
result_status->set_code(StatusProto::OK);
- if (page_result_state.next_page_token != kInvalidNextPageToken) {
- result_proto.set_next_page_token(page_result_state.next_page_token);
+ if (page_result_info.first != kInvalidNextPageToken) {
+ result_proto.set_next_page_token(page_result_info.first);
}
+
query_stats->set_document_retrieval_latency_ms(
component_timer->GetElapsedMilliseconds());
- query_stats->set_latency_ms(overall_timer->GetElapsedMilliseconds());
query_stats->set_num_results_returned_current_page(
result_proto.results_size());
query_stats->set_num_results_with_snippets(
- std::min(result_proto.results_size(),
- result_spec.snippet_spec().num_to_snippet()));
+ page_result_info.second.num_results_with_snippets);
return result_proto;
}
@@ -1593,53 +1656,46 @@ SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) {
query_stats->set_is_first_page(false);
std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
- libtextclassifier3::StatusOr<PageResultState> page_result_state_or =
- result_state_manager_->GetNextPage(next_page_token);
-
- if (!page_result_state_or.ok()) {
- if (absl_ports::IsNotFound(page_result_state_or.status())) {
- // NOT_FOUND means an empty result.
- result_status->set_code(StatusProto::OK);
- } else {
- // Real error, pass up.
- TransformStatus(page_result_state_or.status(), result_status);
- }
- return result_proto;
- }
-
- PageResultState page_result_state =
- std::move(page_result_state_or).ValueOrDie();
- query_stats->set_requested_page_size(page_result_state.requested_page_size);
-
- // Retrieves the document protos.
auto result_retriever_or =
- ResultRetriever::Create(document_store_.get(), schema_store_.get(),
- language_segmenter_.get(), normalizer_.get());
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get());
if (!result_retriever_or.ok()) {
TransformStatus(result_retriever_or.status(), result_status);
return result_proto;
}
- std::unique_ptr<ResultRetriever> result_retriever =
+ std::unique_ptr<ResultRetrieverV2> result_retriever =
std::move(result_retriever_or).ValueOrDie();
- libtextclassifier3::StatusOr<std::vector<SearchResultProto::ResultProto>>
- results_or = result_retriever->RetrieveResults(page_result_state);
- if (!results_or.ok()) {
- TransformStatus(results_or.status(), result_status);
+ libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
+ page_result_info_or = result_state_manager_->GetNextPage(
+ next_page_token, *result_retriever);
+ if (!page_result_info_or.ok()) {
+ if (absl_ports::IsNotFound(page_result_info_or.status())) {
+ // NOT_FOUND means an empty result.
+ result_status->set_code(StatusProto::OK);
+ } else {
+ // Real error, pass up.
+ TransformStatus(page_result_info_or.status(), result_status);
+ }
return result_proto;
}
- std::vector<SearchResultProto::ResultProto> results =
- std::move(results_or).ValueOrDie();
+
+ std::pair<uint64_t, PageResult> page_result_info =
+ std::move(page_result_info_or).ValueOrDie();
+ query_stats->set_requested_page_size(
+ page_result_info.second.requested_page_size);
// Assembles the final search result proto
- result_proto.mutable_results()->Reserve(results.size());
- for (SearchResultProto::ResultProto& result : results) {
+ result_proto.mutable_results()->Reserve(
+ page_result_info.second.results.size());
+ for (SearchResultProto::ResultProto& result :
+ page_result_info.second.results) {
result_proto.mutable_results()->Add(std::move(result));
}
result_status->set_code(StatusProto::OK);
- if (page_result_state.next_page_token != kInvalidNextPageToken) {
- result_proto.set_next_page_token(page_result_state.next_page_token);
+ if (page_result_info.first != kInvalidNextPageToken) {
+ result_proto.set_next_page_token(page_result_info.first);
}
// The only thing that we're doing is document retrieval. So document
@@ -1650,12 +1706,8 @@ SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) {
query_stats->set_latency_ms(overall_timer->GetElapsedMilliseconds());
query_stats->set_num_results_returned_current_page(
result_proto.results_size());
- int num_left_to_snippet =
- std::max(page_result_state.snippet_context.snippet_spec.num_to_snippet() -
- page_result_state.num_previously_returned,
- 0);
query_stats->set_num_results_with_snippets(
- std::min(result_proto.results_size(), num_left_to_snippet));
+ page_result_info.second.num_results_with_snippets);
return result_proto;
}
@@ -1668,8 +1720,8 @@ void IcingSearchEngine::InvalidateNextPageToken(uint64_t next_page_token) {
result_state_manager_->InvalidateResultState(next_page_token);
}
-libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore(
- OptimizeStatsProto* optimize_stats) {
+libtextclassifier3::StatusOr<std::vector<DocumentId>>
+IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) {
// Gets the current directory path and an empty tmp directory path for
// document store optimization.
const std::string current_document_dir =
@@ -1685,15 +1737,16 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore(
}
// Copies valid document data to tmp directory
- auto optimize_status = document_store_->OptimizeInto(
- temporary_document_dir, language_segmenter_.get(), optimize_stats);
+ libtextclassifier3::StatusOr<std::vector<DocumentId>>
+ document_id_old_to_new_or = document_store_->OptimizeInto(
+ temporary_document_dir, language_segmenter_.get(), optimize_stats);
// Handles error if any
- if (!optimize_status.ok()) {
+ if (!document_id_old_to_new_or.ok()) {
filesystem_->DeleteDirectoryRecursively(temporary_document_dir.c_str());
return absl_ports::Annotate(
absl_ports::AbortedError("Failed to optimize document store"),
- optimize_status.error_message());
+ document_id_old_to_new_or.status().error_message());
}
// result_state_manager_ depends on document_store_. So we need to reset it at
@@ -1768,7 +1821,7 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore(
ICING_LOG(ERROR) << "Document store has been optimized, but it failed to "
"delete temporary file directory";
}
- return libtextclassifier3::Status::OK;
+ return document_id_old_to_new_or;
}
IcingSearchEngine::IndexRestorationResult
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index 6a06fb9..2eda803 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -20,13 +20,13 @@
#include <string>
#include <string_view>
-#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/mutex.h"
#include "icing/absl_ports/thread_annotations.h"
#include "icing/file/filesystem.h"
#include "icing/index/index.h"
+#include "icing/jni/jni-cache.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/performance-configuration.h"
#include "icing/proto/document.pb.h"
@@ -582,14 +582,16 @@ class IcingSearchEngine {
// would need call Initialize() to reinitialize everything into a valid state.
//
// Returns:
- // OK on success
+ // On success, a vector that maps from old document id to new document id. A
+ // value of kInvalidDocumentId indicates that the old document id has been
+ // deleted.
// ABORTED_ERROR if any error happens before the actual optimization, the
// original document store should be still available
// DATA_LOSS_ERROR on errors that could potentially cause data loss,
// document store is still available
// INTERNAL_ERROR on any IO errors or other errors that we can't recover
// from
- libtextclassifier3::Status OptimizeDocumentStore(
+ libtextclassifier3::StatusOr<std::vector<DocumentId>> OptimizeDocumentStore(
OptimizeStatsProto* optimize_stats)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
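// Illustrative example of the returned mapping (hypothetical ids): if
// documents 0..3 existed and document 1 was deleted during optimization, the
// result would be {0, kInvalidDocumentId, 1, 2}, i.e. old ids 2 and 3 are
// compacted down to new ids 1 and 2.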
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index f922b98..2ac456e 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -20,13 +20,13 @@
#include <string>
#include <utility>
-#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/file/mock-filesystem.h"
+#include "icing/jni/jni-cache.h"
#include "icing/legacy/index/icing-mock-filesystem.h"
#include "icing/portable/endian.h"
#include "icing/portable/equals-proto.h"
@@ -2274,7 +2274,12 @@ TEST_F(IcingSearchEngineTest, SearchReturnsScoresCreationTimestamp) {
}
TEST_F(IcingSearchEngineTest, SearchReturnsOneResult) {
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(1000);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
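// With SetTimerElapsedMilliseconds(1000), every timer handed out by the fake
// clock reports an elapsed time of 1000ms, which is what the latency
// assertions below rely on.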
ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
@@ -2299,6 +2304,15 @@ TEST_F(IcingSearchEngineTest, SearchReturnsOneResult) {
SearchResultProto search_result_proto =
icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
EXPECT_THAT(search_result_proto.status(), ProtoIsOk());
+
+ EXPECT_THAT(search_result_proto.query_stats().latency_ms(), Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().parse_query_latency_ms(),
+ Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().scoring_latency_ms(), Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().ranking_latency_ms(), Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().document_retrieval_latency_ms(),
+ Eq(1000));
+
// The token is a random number so we don't verify it.
expected_search_result_proto.set_next_page_token(
search_result_proto.next_page_token());
@@ -2347,6 +2361,30 @@ TEST_F(IcingSearchEngineTest, SearchNegativeResultLimitReturnsInvalidArgument) {
expected_search_result_proto));
}
+TEST_F(IcingSearchEngineTest,
+ SearchNonPositivePageTotalBytesLimitReturnsInvalidArgument) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("");
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_total_bytes_per_page_threshold(-1);
+
+ SearchResultProto actual_results1 =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results1.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+
+ result_spec.set_num_total_bytes_per_page_threshold(0);
+ SearchResultProto actual_results2 =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results2.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
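For contrast, a positive threshold is presumably accepted (the setter matches
the test above; the value here is arbitrary):

    ResultSpecProto result_spec;
    result_spec.set_num_total_bytes_per_page_threshold(64 * 1024);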
TEST_F(IcingSearchEngineTest, SearchWithPersistenceReturnsValidResults) {
IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
@@ -2403,7 +2441,12 @@ TEST_F(IcingSearchEngineTest, SearchWithPersistenceReturnsValidResults) {
}
TEST_F(IcingSearchEngineTest, SearchShouldReturnEmpty) {
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(1000);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
@@ -2418,6 +2461,15 @@ TEST_F(IcingSearchEngineTest, SearchShouldReturnEmpty) {
SearchResultProto search_result_proto =
icing.Search(search_spec, GetDefaultScoringSpec(),
ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto.status(), ProtoIsOk());
+
+ EXPECT_THAT(search_result_proto.query_stats().latency_ms(), Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().parse_query_latency_ms(),
+ Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().scoring_latency_ms(), Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().ranking_latency_ms(), Eq(0));
+ EXPECT_THAT(search_result_proto.query_stats().document_retrieval_latency_ms(),
+ Eq(0));
EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
expected_search_result_proto));
@@ -2894,10 +2946,11 @@ TEST_F(IcingSearchEngineTest, GetAndPutShouldWorkAfterOptimization) {
DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
+ DocumentProto document4 = CreateMessageDocument("namespace", "uri4");
+ DocumentProto document5 = CreateMessageDocument("namespace", "uri5");
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = document1;
{
IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
@@ -2905,27 +2958,49 @@ TEST_F(IcingSearchEngineTest, GetAndPutShouldWorkAfterOptimization) {
ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Delete("namespace", "uri2").status(), ProtoIsOk());
ASSERT_THAT(icing.Optimize().status(), ProtoIsOk());
// Validates that Get() and Put() are good right after Optimize()
+ *expected_get_result_proto.mutable_document() = document1;
EXPECT_THAT(
icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()),
EqualsProto(expected_get_result_proto));
- EXPECT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ EXPECT_THAT(
+ icing.Get("namespace", "uri2", GetResultSpecProto::default_instance())
+ .status()
+ .code(),
+ Eq(StatusProto::NOT_FOUND));
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(
+ icing.Get("namespace", "uri3", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+ EXPECT_THAT(icing.Put(document4).status(), ProtoIsOk());
} // Destroys IcingSearchEngine to make sure nothing is cached.
IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ *expected_get_result_proto.mutable_document() = document1;
EXPECT_THAT(
icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()),
EqualsProto(expected_get_result_proto));
-
- *expected_get_result_proto.mutable_document() = document2;
EXPECT_THAT(
- icing.Get("namespace", "uri2", GetResultSpecProto::default_instance()),
+ icing.Get("namespace", "uri2", GetResultSpecProto::default_instance())
+ .status()
+ .code(),
+ Eq(StatusProto::NOT_FOUND));
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(
+ icing.Get("namespace", "uri3", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+ *expected_get_result_proto.mutable_document() = document4;
+ EXPECT_THAT(
+ icing.Get("namespace", "uri4", GetResultSpecProto::default_instance()),
EqualsProto(expected_get_result_proto));
- EXPECT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document5).status(), ProtoIsOk());
}
TEST_F(IcingSearchEngineTest, DeleteShouldWorkAfterOptimization) {
@@ -3821,8 +3896,11 @@ TEST_F(IcingSearchEngineTest,
ProtoIsOk());
// Optimize() fails due to filesystem error
- EXPECT_THAT(icing.Optimize().status(),
- ProtoStatusIs(StatusProto::WARNING_DATA_LOSS));
+ OptimizeResultProto result = icing.Optimize();
+ EXPECT_THAT(result.status(), ProtoStatusIs(StatusProto::WARNING_DATA_LOSS));
+  // Should rebuild the index because of data loss.
+ EXPECT_THAT(result.optimize_stats().index_restoration_mode(),
+ Eq(OptimizeStatsProto::FULL_INDEX_REBUILD));
// Document is not found because original file directory is missing
GetResultProto expected_get_result_proto;
@@ -3895,8 +3973,11 @@ TEST_F(IcingSearchEngineTest, OptimizationShouldRecoverIfDataFilesAreMissing) {
ProtoIsOk());
// Optimize() fails due to filesystem error
- EXPECT_THAT(icing.Optimize().status(),
- ProtoStatusIs(StatusProto::WARNING_DATA_LOSS));
+ OptimizeResultProto result = icing.Optimize();
+ EXPECT_THAT(result.status(), ProtoStatusIs(StatusProto::WARNING_DATA_LOSS));
+  // Should rebuild the index because of data loss.
+ EXPECT_THAT(result.optimize_stats().index_restoration_mode(),
+ Eq(OptimizeStatsProto::FULL_INDEX_REBUILD));
// Document is not found because original files are missing
GetResultProto expected_get_result_proto;
@@ -7867,6 +7948,7 @@ TEST_F(IcingSearchEngineTest, OptimizeStatsProtoTest) {
expected.set_num_original_documents(3);
expected.set_num_deleted_documents(1);
expected.set_num_expired_documents(1);
+ expected.set_index_restoration_mode(OptimizeStatsProto::INDEX_TRANSLATION);
// Run Optimize
OptimizeResultProto result = icing->Optimize();
@@ -7899,6 +7981,7 @@ TEST_F(IcingSearchEngineTest, OptimizeStatsProtoTest) {
expected.set_num_deleted_documents(0);
expected.set_num_expired_documents(0);
expected.set_time_since_last_optimize_ms(10000);
+ expected.set_index_restoration_mode(OptimizeStatsProto::INDEX_TRANSLATION);
// Run Optimize
result = icing->Optimize();
@@ -7907,6 +7990,29 @@ TEST_F(IcingSearchEngineTest, OptimizeStatsProtoTest) {
result.mutable_optimize_stats()->clear_storage_size_before();
result.mutable_optimize_stats()->clear_storage_size_after();
EXPECT_THAT(result.optimize_stats(), EqualsProto(expected));
+
+ // Delete the last document.
+ ASSERT_THAT(icing->Delete(document3.namespace_(), document3.uri()).status(),
+ ProtoIsOk());
+
+ expected = OptimizeStatsProto();
+ expected.set_latency_ms(5);
+ expected.set_document_store_optimize_latency_ms(5);
+ expected.set_index_restoration_latency_ms(5);
+ expected.set_num_original_documents(1);
+ expected.set_num_deleted_documents(1);
+ expected.set_num_expired_documents(0);
+ expected.set_time_since_last_optimize_ms(0);
+ // Should rebuild the index since all documents are removed.
+ expected.set_index_restoration_mode(OptimizeStatsProto::FULL_INDEX_REBUILD);
+
+ // Run Optimize
+ result = icing->Optimize();
+ EXPECT_THAT(result.optimize_stats().storage_size_before(),
+ Ge(result.optimize_stats().storage_size_after()));
+ result.mutable_optimize_stats()->clear_storage_size_before();
+ result.mutable_optimize_stats()->clear_storage_size_after();
+ EXPECT_THAT(result.optimize_stats(), EqualsProto(expected));
}
TEST_F(IcingSearchEngineTest, StorageInfoTest) {
diff --git a/icing/index/hit/hit.cc b/icing/index/hit/hit.cc
index 887e6e4..ce1c366 100644
--- a/icing/index/hit/hit.cc
+++ b/icing/index/hit/hit.cc
@@ -97,6 +97,11 @@ bool Hit::is_in_prefix_section() const {
return bit_util::BitfieldGet(value(), kInPrefixSection, 1);
}
+Hit Hit::TranslateHit(Hit old_hit, DocumentId new_document_id) {
+ return Hit(old_hit.section_id(), new_document_id, old_hit.term_frequency(),
+ old_hit.is_in_prefix_section(), old_hit.is_prefix_hit());
+}
+
bool Hit::EqualsDocumentIdAndSectionId::operator()(const Hit& hit1,
const Hit& hit2) const {
return (hit1.value() >> kNumFlags) == (hit2.value() >> kNumFlags);
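TranslateHit only swaps the document id. A hedged round-trip sketch, assuming
the five-argument Hit constructor used in the implementation above:

    Hit old_hit(/*section_id=*/2, /*document_id=*/7, /*term_frequency=*/3,
                /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
    Hit new_hit = Hit::TranslateHit(old_hit, /*new_document_id=*/1);
    // new_hit.document_id() == 1; the section id, term frequency, and flags
    // carry over unchanged.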
diff --git a/icing/index/hit/hit.h b/icing/index/hit/hit.h
index ee1f64b..f8cbd78 100644
--- a/icing/index/hit/hit.h
+++ b/icing/index/hit/hit.h
@@ -77,6 +77,9 @@ class Hit {
bool is_prefix_hit() const;
bool is_in_prefix_section() const;
+ // Creates a new hit based on old_hit but with new_document_id set.
+ static Hit TranslateHit(Hit old_hit, DocumentId new_document_id);
+
bool operator<(const Hit& h2) const { return value() < h2.value(); }
bool operator==(const Hit& h2) const { return value() == h2.value(); }
diff --git a/icing/index/index.cc b/icing/index/index.cc
index 02ba699..1d863cc 100644
--- a/icing/index/index.cc
+++ b/icing/index/index.cc
@@ -264,6 +264,14 @@ IndexStorageInfoProto Index::GetStorageInfo() const {
return main_index_->GetStorageInfo(std::move(storage_info));
}
+libtextclassifier3::Status Index::Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new) {
+ if (main_index_->last_added_document_id() != kInvalidDocumentId) {
+ ICING_RETURN_IF_ERROR(main_index_->Optimize(document_id_old_to_new));
+ }
+ return lite_index_->Optimize(document_id_old_to_new, term_id_codec_.get());
+}
+
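The mapping contract, mirrored from the tests below: for an index whose last
added document id is 2, the vector {0, kInvalidDocumentId, 1} keeps document 0
in place, deletes document 1, and renumbers document 2 to 1 (index stands in
for an Index instance):

    std::vector<DocumentId> document_id_old_to_new = {0, kInvalidDocumentId, 1};
    ICING_RETURN_IF_ERROR(index->Optimize(document_id_old_to_new));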
libtextclassifier3::Status Index::Editor::BufferTerm(const char* term) {
// Step 1: See if this term is already in the lexicon
uint32_t tvi;
diff --git a/icing/index/index.h b/icing/index/index.h
index f101a91..748acb0 100644
--- a/icing/index/index.h
+++ b/icing/index/index.h
@@ -263,6 +263,15 @@ class Index {
return lite_index_->Reset();
}
+ // Reduces internal file sizes by reclaiming space of deleted documents.
+ //
+ // Returns:
+ // OK on success
+  //   INTERNAL_ERROR on IO error; this indicates that the index may be in an
+ // invalid state and should be cleared.
+ libtextclassifier3::Status Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new);
+
private:
Index(const Options& options, std::unique_ptr<TermIdCodec> term_id_codec,
std::unique_ptr<LiteIndex> lite_index,
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index 2eb3b59..7323603 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -14,6 +14,7 @@
#include "icing/index/index.h"
+#include <algorithm>
#include <cstdint>
#include <limits>
#include <memory>
@@ -48,6 +49,7 @@ namespace lib {
namespace {
+using ::testing::ContainerEq;
using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::Ge;
@@ -79,6 +81,23 @@ class IndexTest : public Test {
icing_filesystem_.DeleteDirectoryRecursively(index_dir_.c_str());
}
+ std::vector<DocHitInfo> GetHits(
+ std::unique_ptr<DocHitInfoIterator> iterator) {
+ std::vector<DocHitInfo> infos;
+ while (iterator->Advance().ok()) {
+ infos.push_back(iterator->doc_hit_info());
+ }
+ return infos;
+ }
+
+ libtextclassifier3::StatusOr<std::vector<DocHitInfo>> GetHits(
+ std::string term, TermMatchType::Code match_type) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator(term, kSectionIdMaskAll, match_type));
+ return GetHits(std::move(itr));
+ }
+
Filesystem filesystem_;
IcingFilesystem icing_filesystem_;
std::string index_dir_;
@@ -97,14 +116,6 @@ constexpr DocumentId kDocumentId8 = 8;
constexpr SectionId kSectionId2 = 2;
constexpr SectionId kSectionId3 = 3;
-std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
- std::vector<DocHitInfo> infos;
- while (iterator->Advance().ok()) {
- infos.push_back(iterator->doc_hit_info());
- }
- return infos;
-}
-
MATCHER_P2(EqualsDocHitInfo, document_id, sections, "") {
const DocHitInfo& actual = arg;
SectionIdMask section_mask = kSectionIdMaskNone;
@@ -249,6 +260,66 @@ TEST_F(IndexTest, SingleHitSingleTermIndexAfterMerge) {
kDocumentId0, std::vector<SectionId>{kSectionId2})));
}
+TEST_F(IndexTest, SingleHitSingleTermIndexAfterOptimize) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId2);
+
+ ICING_ASSERT_OK(index_->Optimize(/*document_id_old_to_new=*/{0, 1, 2}));
+ EXPECT_THAT(GetHits("foo", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ kDocumentId2, std::vector<SectionId>{kSectionId2}))));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId2);
+
+  // Mapping to a different docid will translate the hit.
+ ICING_ASSERT_OK(index_->Optimize(
+ /*document_id_old_to_new=*/{0, kInvalidDocumentId, kDocumentId1}));
+ EXPECT_THAT(GetHits("foo", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ kDocumentId1, std::vector<SectionId>{kSectionId2}))));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId1);
+
+ // Mapping to kInvalidDocumentId will remove the hit.
+ ICING_ASSERT_OK(
+ index_->Optimize(/*document_id_old_to_new=*/{0, kInvalidDocumentId}));
+ EXPECT_THAT(GetHits("foo", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_EQ(index_->last_added_document_id(), kInvalidDocumentId);
+}
+
+TEST_F(IndexTest, SingleHitSingleTermIndexAfterMergeAndOptimize) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId2);
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK(index_->Optimize(/*document_id_old_to_new=*/{0, 1, 2}));
+ EXPECT_THAT(GetHits("foo", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ kDocumentId2, std::vector<SectionId>{kSectionId2}))));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId2);
+
+  // Mapping to a different docid will translate the hit.
+ ICING_ASSERT_OK(index_->Optimize(
+ /*document_id_old_to_new=*/{0, kInvalidDocumentId, kDocumentId1}));
+ EXPECT_THAT(GetHits("foo", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ kDocumentId1, std::vector<SectionId>{kSectionId2}))));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId1);
+
+ // Mapping to kInvalidDocumentId will remove the hit.
+ ICING_ASSERT_OK(
+ index_->Optimize(/*document_id_old_to_new=*/{0, kInvalidDocumentId}));
+ EXPECT_THAT(GetHits("foo", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_EQ(index_->last_added_document_id(), kInvalidDocumentId);
+}
+
TEST_F(IndexTest, SingleHitMultiTermIndex) {
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
@@ -281,6 +352,112 @@ TEST_F(IndexTest, SingleHitMultiTermIndexAfterMerge) {
kDocumentId0, std::vector<SectionId>{kSectionId2})));
}
+TEST_F(IndexTest, MultiHitMultiTermIndexAfterOptimize) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId2);
+
+ ICING_ASSERT_OK(index_->Optimize(/*document_id_old_to_new=*/{0, 1, 2}));
+ EXPECT_THAT(
+ GetHits("foo", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0,
+ std::vector<SectionId>{kSectionId2}))));
+ EXPECT_THAT(GetHits("bar", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ kDocumentId1, std::vector<SectionId>{kSectionId2}))));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId2);
+
+ // Delete document id 1, and document id 2 is translated to 1.
+ ICING_ASSERT_OK(
+ index_->Optimize(/*document_id_old_to_new=*/{0, kInvalidDocumentId, 1}));
+ EXPECT_THAT(
+ GetHits("foo", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0,
+ std::vector<SectionId>{kSectionId2}))));
+ EXPECT_THAT(GetHits("bar", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId1);
+
+  // Delete all the remaining documents.
+ ICING_ASSERT_OK(index_->Optimize(
+ /*document_id_old_to_new=*/{kInvalidDocumentId, kInvalidDocumentId}));
+ EXPECT_THAT(GetHits("foo", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(GetHits("bar", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_EQ(index_->last_added_document_id(), kInvalidDocumentId);
+}
+
+TEST_F(IndexTest, MultiHitMultiTermIndexAfterMergeAndOptimize) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId2);
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK(index_->Optimize(/*document_id_old_to_new=*/{0, 1, 2}));
+ EXPECT_THAT(
+ GetHits("foo", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0,
+ std::vector<SectionId>{kSectionId2}))));
+ EXPECT_THAT(GetHits("bar", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ kDocumentId1, std::vector<SectionId>{kSectionId2}))));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId2);
+
+ // Delete document id 1, and document id 2 is translated to 1.
+ ICING_ASSERT_OK(
+ index_->Optimize(/*document_id_old_to_new=*/{0, kInvalidDocumentId, 1}));
+ EXPECT_THAT(
+ GetHits("foo", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0,
+ std::vector<SectionId>{kSectionId2}))));
+ EXPECT_THAT(GetHits("bar", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId1);
+
+  // Delete all the remaining documents.
+ ICING_ASSERT_OK(index_->Optimize(
+ /*document_id_old_to_new=*/{kInvalidDocumentId, kInvalidDocumentId}));
+ EXPECT_THAT(GetHits("foo", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(GetHits("bar", TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_EQ(index_->last_added_document_id(), kInvalidDocumentId);
+}
+
TEST_F(IndexTest, NoHitMultiTermIndex) {
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
@@ -807,6 +984,114 @@ TEST_F(IndexTest, FullIndexMerge) {
EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id + 1));
}
+TEST_F(IndexTest, OptimizeShouldWorkForEmptyIndex) {
+  // Optimizing an empty index should succeed, but have no effect.
+ ICING_ASSERT_OK(index_->Optimize(std::vector<DocumentId>()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("", kSectionIdMaskAll, TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+}
+
+TEST_F(IndexTest, IndexOptimize) {
+ std::string prefix = "prefix";
+ std::default_random_engine random;
+ std::vector<std::string> query_terms;
+ // Add 1024 hits to main index, and 1024 hits to lite index.
+ for (int i = 0; i < 2048; ++i) {
+ if (i == 1024) {
+ ICING_ASSERT_OK(index_->Merge());
+ }
+ // Generate a unique term for document i.
+ query_terms.push_back(prefix + RandomString("abcdefg", 5, &random) +
+ std::to_string(i));
+ TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+ SectionId section_id = i % 5;
+ if (section_id == 2) {
+ // Make section 2 an exact section.
+ term_match_type = TermMatchType::EXACT_ONLY;
+ }
+ Index::Editor edit = index_->Edit(/*document_id=*/i, section_id,
+ term_match_type, /*namespace_id=*/0);
+ ICING_ASSERT_OK(edit.BufferTerm(query_terms.at(i).c_str()));
+ ICING_ASSERT_OK(edit.IndexAllBufferedTerms());
+ index_->set_last_added_document_id(i);
+ }
+
+ // Delete one document for every three documents.
+ DocumentId document_id = 0;
+ DocumentId new_last_added_document_id = kInvalidDocumentId;
+ std::vector<DocumentId> document_id_old_to_new;
+ for (int i = 0; i < 2048; ++i) {
+ if (i % 3 == 0) {
+ document_id_old_to_new.push_back(kInvalidDocumentId);
+ } else {
+ new_last_added_document_id = document_id++;
+ document_id_old_to_new.push_back(new_last_added_document_id);
+ }
+ }
+
+ std::vector<DocHitInfo> exp_prefix_hits;
+ for (int i = 0; i < 2048; ++i) {
+ if (document_id_old_to_new[i] == kInvalidDocumentId) {
+ continue;
+ }
+ if (i % 5 == 2) {
+ // Section 2 is an exact section, so we should not see any hits in
+ // prefix search.
+ continue;
+ }
+ exp_prefix_hits.push_back(DocHitInfo(document_id_old_to_new[i]));
+ exp_prefix_hits.back().UpdateSection(/*section_id=*/i % 5,
+ /*hit_term_frequency=*/1);
+ }
+ std::reverse(exp_prefix_hits.begin(), exp_prefix_hits.end());
+
+  // Check that optimize is correct.
+ ICING_ASSERT_OK(index_->Optimize(document_id_old_to_new));
+ EXPECT_EQ(index_->last_added_document_id(), new_last_added_document_id);
+ // Check prefix search.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<DocHitInfo> hits,
+ GetHits(prefix, TermMatchType::PREFIX));
+ EXPECT_THAT(hits, ContainerEq(exp_prefix_hits));
+ // Check exact search.
+ for (int i = 0; i < 2048; ++i) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ hits, GetHits(query_terms[i], TermMatchType::EXACT_ONLY));
+ if (document_id_old_to_new[i] == kInvalidDocumentId) {
+ EXPECT_THAT(hits, IsEmpty());
+ } else {
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfo(
+ document_id_old_to_new[i],
+ std::vector<SectionId>{(SectionId)(i % 5)})));
+ }
+ }
+
+ // Check that optimize does not block merge.
+ ICING_ASSERT_OK(index_->Merge());
+ EXPECT_EQ(index_->last_added_document_id(), new_last_added_document_id);
+ // Check prefix search.
+ ICING_ASSERT_OK_AND_ASSIGN(hits, GetHits(prefix, TermMatchType::PREFIX));
+ EXPECT_THAT(hits, ContainerEq(exp_prefix_hits));
+ // Check exact search.
+ for (int i = 0; i < 2048; ++i) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ hits, GetHits(query_terms[i], TermMatchType::EXACT_ONLY));
+ if (document_id_old_to_new[i] == kInvalidDocumentId) {
+ EXPECT_THAT(hits, IsEmpty());
+ } else {
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfo(
+ document_id_old_to_new[i],
+ std::vector<SectionId>{(SectionId)(i % 5)})));
+ }
+ }
+}
+
TEST_F(IndexTest, IndexCreateIOFailure) {
// Create the index with mock filesystem. By default, Mock will return false,
// so the first attempted file operation will fail.
diff --git a/icing/legacy/index/icing-lite-index-header.h b/icing/index/lite/lite-index-header.h
index ac2d3c0..dd6a0a8 100644
--- a/icing/legacy/index/icing-lite-index-header.h
+++ b/icing/index/lite/lite-index-header.h
@@ -16,15 +16,15 @@
#define ICING_LEGACY_INDEX_ICING_LITE_INDEX_HEADER_H_
#include "icing/legacy/core/icing-string-util.h"
-#include "icing/legacy/index/icing-common-types.h"
+#include "icing/store/document-id.h"
namespace icing {
namespace lib {
// A wrapper around the actual mmapped header data.
-class IcingLiteIndex_Header {
+class LiteIndex_Header {
public:
- virtual ~IcingLiteIndex_Header() = default;
+ virtual ~LiteIndex_Header() = default;
// Returns true if the magic of the header matches the hard-coded magic
// value associated with this header format.
@@ -47,7 +47,7 @@ class IcingLiteIndex_Header {
virtual void Reset() = 0;
};
-class IcingLiteIndex_HeaderImpl : public IcingLiteIndex_Header {
+class LiteIndex_HeaderImpl : public LiteIndex_Header {
public:
struct HeaderData {
static const uint32_t kMagic = 0x6dfba6a0;
@@ -66,7 +66,7 @@ class IcingLiteIndex_HeaderImpl : public IcingLiteIndex_Header {
uint32_t searchable_end;
};
- explicit IcingLiteIndex_HeaderImpl(HeaderData *hdr) : hdr_(hdr) {}
+ explicit LiteIndex_HeaderImpl(HeaderData *hdr) : hdr_(hdr) {}
bool check_magic() const override {
return hdr_->magic == HeaderData::kMagic;
@@ -97,7 +97,7 @@ class IcingLiteIndex_HeaderImpl : public IcingLiteIndex_Header {
void Reset() override {
hdr_->lite_index_crc = 0;
hdr_->magic = HeaderData::kMagic;
- hdr_->last_added_docid = kIcingInvalidDocId;
+ hdr_->last_added_docid = kInvalidDocumentId;
hdr_->cur_size = 0;
hdr_->searchable_end = 0;
}
@@ -105,7 +105,7 @@ class IcingLiteIndex_HeaderImpl : public IcingLiteIndex_Header {
private:
HeaderData *hdr_;
};
-static_assert(24 == sizeof(IcingLiteIndex_HeaderImpl::HeaderData),
+static_assert(24 == sizeof(LiteIndex_HeaderImpl::HeaderData),
"sizeof(HeaderData) != 24");
} // namespace lib
diff --git a/icing/legacy/index/icing-lite-index-options.cc b/icing/index/lite/lite-index-options.cc
index 4bf0d38..29075f8 100644
--- a/icing/legacy/index/icing-lite-index-options.cc
+++ b/icing/index/lite/lite-index-options.cc
@@ -12,13 +12,25 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/legacy/index/icing-lite-index-options.h"
+#include "icing/index/lite/lite-index-options.h"
+
+#include "icing/index/lite/term-id-hit-pair.h"
namespace icing {
namespace lib {
namespace {
+constexpr int kIcingMaxVariantsPerToken = 10;  // Max variants per token.
+
+constexpr size_t kIcingMaxSearchableDocumentSize = (1u << 16) - 1; // 64K
+// Max num tokens per document. 64KB is our original maximum (searchable)
+// document size. We clip if a document exceeds this.
+constexpr uint32_t kIcingMaxNumTokensPerDoc =
+ kIcingMaxSearchableDocumentSize / 5;
+constexpr uint32_t kIcingMaxNumHitsPerDocument =
+ kIcingMaxNumTokensPerDoc * kIcingMaxVariantsPerToken;
+
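Spelled out, the arithmetic these constants encode: 65535 / 5 = 13107 tokens
and 13107 * 10 = 131070 hits. Equivalent compile-time checks (illustrative,
not part of the change):

    static_assert(kIcingMaxNumTokensPerDoc == 13107, "65535 / 5");
    static_assert(kIcingMaxNumHitsPerDocument == 131070, "13107 * 10");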
uint32_t CalculateHitBufferSize(uint32_t hit_buffer_want_merge_bytes) {
constexpr uint32_t kHitBufferSlopMult = 2;
@@ -27,7 +39,7 @@ uint32_t CalculateHitBufferSize(uint32_t hit_buffer_want_merge_bytes) {
// TODO(b/111690435) Move LiteIndex::Element to a separate file so that this
// can use sizeof(LiteIndex::Element)
uint32_t hit_capacity_elts_with_slop =
- hit_buffer_want_merge_bytes / sizeof(uint64_t);
+ hit_buffer_want_merge_bytes / sizeof(TermIdHitPair);
// Add some slop for index variants on top of max num tokens.
hit_capacity_elts_with_slop += kIcingMaxNumHitsPerDocument;
hit_capacity_elts_with_slop *= kHitBufferSlopMult;
@@ -51,8 +63,8 @@ IcingDynamicTrie::Options CalculateTrieOptions(uint32_t hit_buffer_size) {
} // namespace
-IcingLiteIndexOptions::IcingLiteIndexOptions(
- const std::string& filename_base, uint32_t hit_buffer_want_merge_bytes)
+LiteIndexOptions::LiteIndexOptions(const std::string& filename_base,
+ uint32_t hit_buffer_want_merge_bytes)
: filename_base(filename_base),
hit_buffer_want_merge_bytes(hit_buffer_want_merge_bytes) {
hit_buffer_size = CalculateHitBufferSize(hit_buffer_want_merge_bytes);
diff --git a/icing/legacy/index/icing-lite-index-options.h b/icing/index/lite/lite-index-options.h
index 2922621..ae58802 100644
--- a/icing/legacy/index/icing-lite-index-options.h
+++ b/icing/index/lite/lite-index-options.h
@@ -15,20 +15,19 @@
#ifndef ICING_LEGACY_INDEX_ICING_LITE_INDEX_OPTIONS_H_
#define ICING_LEGACY_INDEX_ICING_LITE_INDEX_OPTIONS_H_
-#include "icing/legacy/index/icing-common-types.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
namespace icing {
namespace lib {
-struct IcingLiteIndexOptions {
- IcingLiteIndexOptions() = default;
- // Creates IcingLiteIndexOptions based off of the specified parameters. All
+struct LiteIndexOptions {
+ LiteIndexOptions() = default;
+  // Creates LiteIndexOptions based on the specified parameters. All
// other fields are calculated based on the value of
// hit_buffer_want_merge_bytes and the logic in CalculateHitBufferSize and
// CalculateTrieOptions.
- IcingLiteIndexOptions(const std::string& filename_base,
- uint32_t hit_buffer_want_merge_bytes);
+ LiteIndexOptions(const std::string& filename_base,
+ uint32_t hit_buffer_want_merge_bytes);
IcingDynamicTrie::Options lexicon_options;
IcingDynamicTrie::Options display_mappings_options;
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
index fc40225..9622ff4 100644
--- a/icing/index/lite/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -23,6 +23,7 @@
#include <memory>
#include <string>
#include <string_view>
+#include <unordered_set>
#include <utility>
#include <vector>
@@ -33,13 +34,13 @@
#include "icing/file/filesystem.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/hit/hit.h"
+#include "icing/index/lite/lite-index-header.h"
#include "icing/index/term-property-id.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/core/icing-timer.h"
#include "icing/legacy/index/icing-array-storage.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
-#include "icing/legacy/index/icing-lite-index-header.h"
#include "icing/legacy/index/icing-mmapper.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
@@ -60,7 +61,7 @@ std::string MakeHitBufferFilename(const std::string& filename_base) {
return filename_base + "hb";
}
-size_t header_size() { return sizeof(IcingLiteIndex_HeaderImpl::HeaderData); }
+size_t header_size() { return sizeof(LiteIndex_HeaderImpl::HeaderData); }
} // namespace
@@ -156,8 +157,8 @@ libtextclassifier3::Status LiteIndex::Initialize() {
// Set up header.
header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
- header_ = std::make_unique<IcingLiteIndex_HeaderImpl>(
- reinterpret_cast<IcingLiteIndex_HeaderImpl::HeaderData*>(
+ header_ = std::make_unique<LiteIndex_HeaderImpl>(
+ reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
header_mmap_.address()));
header_->Reset();
@@ -171,8 +172,8 @@ libtextclassifier3::Status LiteIndex::Initialize() {
UpdateChecksum();
} else {
header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
- header_ = std::make_unique<IcingLiteIndex_HeaderImpl>(
- reinterpret_cast<IcingLiteIndex_HeaderImpl::HeaderData*>(
+ header_ = std::make_unique<LiteIndex_HeaderImpl>(
+ reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
header_mmap_.address()));
if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
@@ -197,8 +198,7 @@ libtextclassifier3::Status LiteIndex::Initialize() {
}
}
- ICING_VLOG(2) << IcingStringUtil::StringPrintf("Lite index init ok in %.3fms",
- timer.Elapsed() * 1000);
+ ICING_VLOG(2) << "Lite index init ok in " << timer.Elapsed() * 1000 << "ms";
return status;
error:
@@ -230,8 +230,7 @@ Crc32 LiteIndex::ComputeChecksum() {
Crc32 all_crc(header_->CalculateHeaderCrc());
all_crc.Append(std::string_view(reinterpret_cast<const char*>(dependent_crcs),
sizeof(dependent_crcs)));
- ICING_VLOG(2) << IcingStringUtil::StringPrintf(
- "Lite index crc computed in %.3fms", timer.Elapsed() * 1000);
+ ICING_VLOG(2) << "Lite index crc computed in " << timer.Elapsed() * 1000 << "ms";
return all_crc;
}
@@ -246,8 +245,7 @@ libtextclassifier3::Status LiteIndex::Reset() {
header_->Reset();
UpdateChecksum();
- ICING_VLOG(2) << IcingStringUtil::StringPrintf("Lite index clear in %.3fms",
- timer.Elapsed() * 1000);
+ ICING_VLOG(2) << "Lite index clear in " << timer.Elapsed() * 1000 << "ms";
return libtextclassifier3::Status::OK;
}
@@ -439,34 +437,38 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo(
return storage_info;
}
-uint32_t LiteIndex::Seek(uint32_t term_id) {
+void LiteIndex::SortHits() {
  // Make the hit buffer searchable by sorting it.
uint32_t sort_len = header_->cur_size() - header_->searchable_end();
- if (sort_len > 0) {
- IcingTimer timer;
-
- auto* array_start =
- hit_buffer_.GetMutableMem<TermIdHitPair::Value>(0, header_->cur_size());
- TermIdHitPair::Value* sort_start = array_start + header_->searchable_end();
- std::sort(sort_start, array_start + header_->cur_size());
-
- // Now merge with previous region. Since the previous region is already
- // sorted and deduplicated, optimize the merge by skipping everything less
- // than the new region's smallest value.
- if (header_->searchable_end() > 0) {
- std::inplace_merge(array_start, array_start + header_->searchable_end(),
- array_start + header_->cur_size());
- }
- ICING_VLOG(2) << IcingStringUtil::StringPrintf(
- "Lite index sort and merge %u into %u in %.3fms", sort_len,
- header_->searchable_end(), timer.Elapsed() * 1000);
-
- // Now the entire array is sorted.
- header_->set_searchable_end(header_->cur_size());
+ if (sort_len <= 0) {
+ return;
+ }
+ IcingTimer timer;
- // Update crc in-line.
- UpdateChecksum();
+ auto* array_start =
+ hit_buffer_.GetMutableMem<TermIdHitPair::Value>(0, header_->cur_size());
+ TermIdHitPair::Value* sort_start = array_start + header_->searchable_end();
+ std::sort(sort_start, array_start + header_->cur_size());
+
+ // Now merge with previous region. Since the previous region is already
+ // sorted and deduplicated, optimize the merge by skipping everything less
+ // than the new region's smallest value.
+ if (header_->searchable_end() > 0) {
+ std::inplace_merge(array_start, array_start + header_->searchable_end(),
+ array_start + header_->cur_size());
}
+ ICING_VLOG(2) << "Lite index sort and merge " << sort_len << " into "
+ << header_->searchable_end() << " in " << timer.Elapsed() * 1000 << "ms";
+
+ // Now the entire array is sorted.
+ header_->set_searchable_end(header_->cur_size());
+
+ // Update crc in-line.
+ UpdateChecksum();
+}
+
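The sort-then-merge pattern above, reduced to a self-contained sketch over a
plain vector, where values and searchable_end stand in for the hit buffer and
the header field:

    // values[0, searchable_end) is already sorted and deduplicated;
    // [searchable_end, values.size()) is the newly appended, unsorted region.
    std::sort(values.begin() + searchable_end, values.end());
    if (searchable_end > 0) {
      std::inplace_merge(values.begin(), values.begin() + searchable_end,
                         values.end());
    }
    // The whole of values is now sorted.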
+uint32_t LiteIndex::Seek(uint32_t term_id) {
+ SortHits();
// Binary search for our term_id. Make sure we get the first
// element. Using kBeginSortValue ensures this for the hit value.
@@ -480,5 +482,86 @@ uint32_t LiteIndex::Seek(uint32_t term_id) {
return ptr - array;
}
+libtextclassifier3::Status LiteIndex::Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ const TermIdCodec* term_id_codec) {
+ if (header_->cur_size() == 0) {
+ return libtextclassifier3::Status::OK;
+ }
+  // Sort the hits so that hits with the same term id are grouped together,
+  // which later helps determine which terms will be unused after compaction.
+ SortHits();
+ uint32_t new_size = 0;
+ // The largest document id after translating hits.
+ DocumentId largest_document_id = kInvalidDocumentId;
+ uint32_t curr_term_id = 0;
+ uint32_t curr_tvi = 0;
+ std::unordered_set<uint32_t> tvi_to_delete;
+ for (uint32_t idx = 0; idx < header_->cur_size(); ++idx) {
+ TermIdHitPair term_id_hit_pair(
+ hit_buffer_.array_cast<TermIdHitPair>()[idx]);
+ if (idx == 0 || term_id_hit_pair.term_id() != curr_term_id) {
+ curr_term_id = term_id_hit_pair.term_id();
+ ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo term_info,
+ term_id_codec->DecodeTermInfo(curr_term_id));
+ curr_tvi = term_info.tvi;
+ // Mark the property of the current term as not having hits in prefix
+ // section. The property will be set below if there are any valid hits
+ // from a prefix section.
+ lexicon_.ClearProperty(curr_tvi, GetHasHitsInPrefixSectionPropertyId());
+ // Add curr_tvi to tvi_to_delete. It will be removed from tvi_to_delete
+ // below if there are any valid hits pointing to that termid.
+ tvi_to_delete.insert(curr_tvi);
+ }
+ DocumentId new_document_id =
+ document_id_old_to_new[term_id_hit_pair.hit().document_id()];
+ if (new_document_id == kInvalidDocumentId) {
+ continue;
+ }
+ if (largest_document_id == kInvalidDocumentId ||
+ new_document_id > largest_document_id) {
+ largest_document_id = new_document_id;
+ }
+ if (term_id_hit_pair.hit().is_in_prefix_section()) {
+ lexicon_.SetProperty(curr_tvi, GetHasHitsInPrefixSectionPropertyId());
+ }
+ tvi_to_delete.erase(curr_tvi);
+ TermIdHitPair new_term_id_hit_pair(
+ term_id_hit_pair.term_id(),
+ Hit::TranslateHit(term_id_hit_pair.hit(), new_document_id));
+    // Rewrite the hit_buffer in place. new_size is never greater than idx,
+    // so it is safe to overwrite the entry at new_size, and valp should never
+    // be nullptr since it is within the already allocated region of
+    // hit_buffer_.
+ TermIdHitPair::Value* valp =
+ hit_buffer_.GetMutableMem<TermIdHitPair::Value>(new_size++, 1);
+ *valp = new_term_id_hit_pair.value();
+ }
+ header_->set_cur_size(new_size);
+ header_->set_searchable_end(new_size);
+ header_->set_last_added_docid(largest_document_id);
+
+ // Delete unused terms.
+ std::unordered_set<std::string> terms_to_delete;
+ for (IcingDynamicTrie::Iterator term_iter(lexicon_, /*prefix=*/"");
+ term_iter.IsValid(); term_iter.Advance()) {
+ if (tvi_to_delete.find(term_iter.GetValueIndex()) != tvi_to_delete.end()) {
+ terms_to_delete.insert(term_iter.GetKey());
+ }
+ }
+ for (const std::string& term : terms_to_delete) {
+ // Mark "term" as deleted. This won't actually free space in the lexicon. It
+ // will simply make it impossible to Find "term" in subsequent calls (which
+ // saves an unnecessary search through the hit buffer). This is acceptable
+ // because the free space will eventually be reclaimed the next time that
+ // the lite index is merged with the main index.
+ if (!lexicon_.Delete(term)) {
+ return absl_ports::InternalError(
+ "Could not delete invalid terms in lite lexicon during compaction.");
+ }
+ }
+ return libtextclassifier3::Status::OK;
+}
+
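The in-place rewrite above is safe because new_size never passes idx, so a
write can only land on an entry that has already been read. The same pattern
on a plain vector, with Translate standing in (hypothetically) for the
term-id-preserving hit translation:

    uint32_t new_size = 0;
    for (uint32_t idx = 0; idx < entries.size(); ++idx) {
      DocumentId new_id = document_id_old_to_new[entries[idx].document_id()];
      if (new_id == kInvalidDocumentId) continue;  // Drop deleted documents.
      entries[new_size++] = Translate(entries[idx], new_id);
    }
    entries.resize(new_size);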
} // namespace lib
} // namespace icing
diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h
index 42d69f8..64b5881 100644
--- a/icing/index/lite/lite-index.h
+++ b/icing/index/lite/lite-index.h
@@ -30,12 +30,13 @@
#include "icing/file/filesystem.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/hit/hit.h"
+#include "icing/index/lite/lite-index-header.h"
+#include "icing/index/lite/lite-index-options.h"
#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/index/term-id-codec.h"
#include "icing/legacy/index/icing-array-storage.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
-#include "icing/legacy/index/icing-lite-index-header.h"
-#include "icing/legacy/index/icing-lite-index-options.h"
#include "icing/legacy/index/icing-mmapper.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
@@ -53,7 +54,7 @@ namespace lib {
class LiteIndex {
public:
// An entry in the hit buffer.
- using Options = IcingLiteIndexOptions;
+ using Options = LiteIndexOptions;
// Updates checksum of subcomponents.
~LiteIndex();
@@ -260,6 +261,16 @@ class LiteIndex {
IndexStorageInfoProto GetStorageInfo(
IndexStorageInfoProto storage_info) const;
+ // Reduces internal file sizes by reclaiming space of deleted documents.
+ //
+ // Returns:
+ // OK on success
+  //   INTERNAL_ERROR on IO error; this indicates that the index may be in an
+ // invalid state and should be cleared.
+ libtextclassifier3::Status Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ const TermIdCodec* term_id_codec);
+
private:
static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions();
@@ -279,6 +290,9 @@ class LiteIndex {
// Sets the computed checksum in the header
void UpdateChecksum();
+ // Sort hits stored in the index.
+ void SortHits();
+
// Returns the position of the first element with term_id, or the size of the
// hit buffer if term_id is not present.
uint32_t Seek(uint32_t term_id);
@@ -301,7 +315,7 @@ class LiteIndex {
IcingMMapper header_mmap_;
// Wrapper around the mmapped header that contains stats on the lite index.
- std::unique_ptr<IcingLiteIndex_Header> header_;
+ std::unique_ptr<LiteIndex_Header> header_;
// Options used to initialize the LiteIndex.
const Options options_;
diff --git a/icing/index/main/flash-index-storage.cc b/icing/index/main/flash-index-storage.cc
index dabff28..33dacf9 100644
--- a/icing/index/main/flash-index-storage.cc
+++ b/icing/index/main/flash-index-storage.cc
@@ -133,9 +133,7 @@ bool FlashIndexStorage::CreateHeader() {
posting_list_bytes /= 2) {
uint32_t aligned_posting_list_bytes =
(posting_list_bytes / sizeof(Hit) * sizeof(Hit));
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Block size %u: %u", header_block_->header()->num_index_block_infos,
- aligned_posting_list_bytes);
+ ICING_VLOG(1) << "Block size " << header_block_->header()->num_index_block_infos << ": " << aligned_posting_list_bytes;
// Initialize free list to empty.
HeaderBlock::Header::IndexBlockInfo* block_info =
@@ -169,23 +167,18 @@ bool FlashIndexStorage::OpenHeader(int64_t file_size) {
return false;
}
if (file_size % read_header.header()->block_size != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Index size %" PRIu64 " not a multiple of block size %u", file_size,
- read_header.header()->block_size);
+ ICING_LOG(ERROR) << "Index size " << file_size << " not a multiple of block size " << read_header.header()->block_size;
return false;
}
if (file_size < static_cast<int64_t>(read_header.header()->block_size)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Index size %" PRIu64 " shorter than block size %u", file_size,
- read_header.header()->block_size);
+ ICING_LOG(ERROR) << "Index size " << file_size << " shorter than block size " << read_header.header()->block_size;
return false;
}
if (read_header.header()->block_size % getpagesize() != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Block size %u is not a multiple of page size %d",
- read_header.header()->block_size, getpagesize());
+ ICING_LOG(ERROR) << "Block size " << read_header.header()->block_size
+ << " is not a multiple of page size " << getpagesize();
return false;
}
num_blocks_ = file_size / read_header.header()->block_size;
@@ -215,11 +208,10 @@ bool FlashIndexStorage::OpenHeader(int64_t file_size) {
int posting_list_bytes =
header_block_->header()->index_block_infos[i].posting_list_bytes;
if (posting_list_bytes % sizeof(Hit) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Posting list size misaligned, index %u, size %u, hit %zu, "
- "file_size %" PRIu64,
- i, header_block_->header()->index_block_infos[i].posting_list_bytes,
- sizeof(Hit), file_size);
+ ICING_LOG(ERROR) << "Posting list size misaligned, index " << i
+ << ", size "
+ << header_block_->header()->index_block_infos[i].posting_list_bytes
+ << ", hit " << sizeof(Hit) << ", file_size " << file_size;
return false;
}
}
@@ -229,8 +221,7 @@ bool FlashIndexStorage::OpenHeader(int64_t file_size) {
bool FlashIndexStorage::PersistToDisk() {
// First, write header.
if (!header_block_->Write(block_fd_.get())) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Write index header failed: %s", strerror(errno));
+ ICING_LOG(ERROR) << "Write index header failed: " << strerror(errno);
return false;
}
@@ -456,8 +447,7 @@ void FlashIndexStorage::FreePostingList(PostingListHolder holder) {
int FlashIndexStorage::GrowIndex() {
if (num_blocks_ >= kMaxBlockIndex) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf("Reached max block index %u",
- kMaxBlockIndex);
+ ICING_VLOG(1) << "Reached max block index " << kMaxBlockIndex;
return kInvalidBlockIndex;
}
@@ -465,8 +455,7 @@ int FlashIndexStorage::GrowIndex() {
if (!filesystem_->Grow(
block_fd_.get(),
static_cast<uint64_t>(num_blocks_ + 1) * block_size())) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Error growing index file: %s", strerror(errno));
+ ICING_VLOG(1) << "Error growing index file: " << strerror(errno);
return kInvalidBlockIndex;
}
diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc
index 158c287..9f591c0 100644
--- a/icing/index/main/main-index.cc
+++ b/icing/index/main/main-index.cc
@@ -17,9 +17,11 @@
#include <cstring>
#include <memory>
#include <string>
+#include <unordered_set>
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/file/destructible-directory.h"
#include "icing/index/main/index-block.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-property-id.h"
@@ -84,35 +86,40 @@ FindTermResult FindShortestValidTermWithPrefixHits(
} // namespace
+MainIndex::MainIndex(const std::string& index_directory,
+ const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem)
+ : base_dir_(index_directory),
+ filesystem_(filesystem),
+ icing_filesystem_(icing_filesystem) {}
+
libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> MainIndex::Create(
const std::string& index_directory, const Filesystem* filesystem,
const IcingFilesystem* icing_filesystem) {
ICING_RETURN_ERROR_IF_NULL(filesystem);
ICING_RETURN_ERROR_IF_NULL(icing_filesystem);
- auto main_index = std::make_unique<MainIndex>();
- ICING_RETURN_IF_ERROR(
- main_index->Init(index_directory, filesystem, icing_filesystem));
+ std::unique_ptr<MainIndex> main_index(
+ new MainIndex(index_directory, filesystem, icing_filesystem));
+ ICING_RETURN_IF_ERROR(main_index->Init());
return main_index;
}
// TODO(b/139087650) : Migrate off of IcingFilesystem.
-libtextclassifier3::Status MainIndex::Init(
- const std::string& index_directory, const Filesystem* filesystem,
- const IcingFilesystem* icing_filesystem) {
- if (!filesystem->CreateDirectoryRecursively(index_directory.c_str())) {
+libtextclassifier3::Status MainIndex::Init() {
+ if (!filesystem_->CreateDirectoryRecursively(base_dir_.c_str())) {
return absl_ports::InternalError("Unable to create main index directory.");
}
- std::string flash_index_file = index_directory + "/main_index";
+ std::string flash_index_file = base_dir_ + "/main_index";
ICING_ASSIGN_OR_RETURN(
FlashIndexStorage flash_index,
- FlashIndexStorage::Create(flash_index_file, filesystem));
+ FlashIndexStorage::Create(flash_index_file, filesystem_));
flash_index_storage_ =
std::make_unique<FlashIndexStorage>(std::move(flash_index));
- std::string lexicon_file = index_directory + "/main-lexicon";
+ std::string lexicon_file = base_dir_ + "/main-lexicon";
IcingDynamicTrie::RuntimeOptions runtime_options;
main_lexicon_ = std::make_unique<IcingDynamicTrie>(
- lexicon_file, runtime_options, icing_filesystem);
+ lexicon_file, runtime_options, icing_filesystem_);
IcingDynamicTrie::Options lexicon_options;
if (!main_lexicon_->CreateIfNotExist(lexicon_options) ||
!main_lexicon_->Init()) {
@@ -490,8 +497,7 @@ libtextclassifier3::Status MainIndex::AddHits(
}
// Now copy remaining backfills.
- ICING_VLOG(1) << IcingStringUtil::StringPrintf("Remaining backfills %zu",
- backfill_map.size());
+ ICING_VLOG(1) << "Remaining backfills " << backfill_map.size();
for (auto other_tvi_main_tvi_pair : backfill_map) {
PostingListIdentifier backfill_posting_list_id =
PostingListIdentifier::kInvalid;
@@ -524,9 +530,7 @@ libtextclassifier3::Status MainIndex::AddHitsForTerm(
std::unique_ptr<PostingListAccessor> pl_accessor;
if (posting_list_id.is_valid()) {
if (posting_list_id.block_index() >= flash_index_storage_->num_blocks()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Index dropped hits. Invalid block index %u >= %u",
- posting_list_id.block_index(), flash_index_storage_->num_blocks());
+ ICING_LOG(ERROR) << "Index dropped hits. Invalid block index " << posting_list_id.block_index() << " >= " << flash_index_storage_->num_blocks();
// TODO(b/159918304) : Consider revising the checksumming strategy in the
// main index. Providing some mechanism to check for corruption - either
// during initialization or some later time would allow us to avoid
@@ -633,5 +637,142 @@ std::string MainIndex::GetDebugInfo(DebugInfoVerbosity::Code verbosity) const {
return res;
}
+libtextclassifier3::Status MainIndex::Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new) {
+ std::string temporary_index_dir_path = base_dir_ + "_temp";
+ if (!filesystem_->DeleteDirectoryRecursively(
+ temporary_index_dir_path.c_str())) {
+ ICING_LOG(ERROR) << "Recursively deleting " << temporary_index_dir_path;
+ return absl_ports::InternalError(
+ "Unable to delete temp directory to prepare to build new index.");
+ }
+
+ DestructibleDirectory temporary_index_dir(
+ filesystem_, std::move(temporary_index_dir_path));
+ if (!temporary_index_dir.is_valid()) {
+ return absl_ports::InternalError(
+ "Unable to create temp directory to build new index.");
+ }
+
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<MainIndex> new_index,
+ MainIndex::Create(temporary_index_dir.dir(),
+ filesystem_, icing_filesystem_));
+ ICING_RETURN_IF_ERROR(TransferIndex(document_id_old_to_new, new_index.get()));
+ ICING_RETURN_IF_ERROR(new_index->PersistToDisk());
+ new_index = nullptr;
+ flash_index_storage_ = nullptr;
+ main_lexicon_ = nullptr;
+
+ if (!filesystem_->SwapFiles(temporary_index_dir.dir().c_str(),
+ base_dir_.c_str())) {
+ return absl_ports::InternalError(
+ "Unable to apply new index due to failed swap!");
+ }
+
+ // Reinitialize the index so that flash_index_storage_ and main_lexicon_ are
+ // properly updated.
+ return Init();
+}
+
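MainIndex::Optimize above follows a build-aside-and-swap shape rather than
mutating index files in place: delete any stale "<base_dir>_temp" directory,
build a fresh MainIndex there, transfer the surviving (translated) hits into
it, persist it, drop both indexes' file handles, SwapFiles() the temp
directory over base_dir_, and finally Init() to re-open flash_index_storage_
and main_lexicon_ against the swapped-in files.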
+libtextclassifier3::StatusOr<DocumentId> MainIndex::TransferAndAddHits(
+ const std::vector<DocumentId>& document_id_old_to_new, const char* term,
+ PostingListAccessor& old_pl_accessor, MainIndex* new_index) {
+ std::vector<Hit> new_hits;
+ bool has_no_exact_hits = true;
+ bool has_hits_in_prefix_section = false;
+ // The largest document id after translating hits.
+ DocumentId largest_document_id = kInvalidDocumentId;
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> tmp,
+ old_pl_accessor.GetNextHitsBatch());
+ while (!tmp.empty()) {
+ for (const Hit& hit : tmp) {
+ DocumentId new_document_id = document_id_old_to_new[hit.document_id()];
+      // Translate the document id of the hit if the document is not deleted
+      // or outdated.
+ if (new_document_id != kInvalidDocumentId) {
+ if (hit.is_in_prefix_section()) {
+ has_hits_in_prefix_section = true;
+ }
+ if (!hit.is_prefix_hit()) {
+ has_no_exact_hits = false;
+ }
+ if (largest_document_id == kInvalidDocumentId ||
+ new_document_id > largest_document_id) {
+ largest_document_id = new_document_id;
+ }
+ new_hits.push_back(Hit::TranslateHit(hit, new_document_id));
+ }
+ }
+ ICING_ASSIGN_OR_RETURN(tmp, old_pl_accessor.GetNextHitsBatch());
+ }
+  // A term without exact hits indicates that it is a purely backfill term. If
+  // the term is not branching in the new trie, backfilling is no longer
+  // necessary, so we can skip it.
+ if (new_hits.empty() ||
+ (has_no_exact_hits && !new_index->main_lexicon_->IsBranchingTerm(term))) {
+ return largest_document_id;
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ PostingListAccessor hit_accum,
+ PostingListAccessor::Create(new_index->flash_index_storage_.get()));
+ for (auto itr = new_hits.rbegin(); itr != new_hits.rend(); ++itr) {
+ ICING_RETURN_IF_ERROR(hit_accum.PrependHit(*itr));
+ }
+ PostingListAccessor::FinalizeResult result =
+ PostingListAccessor::Finalize(std::move(hit_accum));
+ uint32_t tvi;
+ if (!result.id.is_valid() ||
+ !new_index->main_lexicon_->Insert(term, &result.id, &tvi,
+ /*replace=*/false)) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Could not transfer main index for term: ", term));
+ }
+ if (has_no_exact_hits && !new_index->main_lexicon_->SetProperty(
+ tvi, GetHasNoExactHitsPropertyId())) {
+ return absl_ports::InternalError("Setting prefix prop failed");
+ }
+ if (has_hits_in_prefix_section &&
+ !new_index->main_lexicon_->SetProperty(
+ tvi, GetHasHitsInPrefixSectionPropertyId())) {
+ return absl_ports::InternalError("Setting prefix prop failed");
+ }
+ return largest_document_id;
+}
+
+libtextclassifier3::Status MainIndex::TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ MainIndex* new_index) {
+ DocumentId largest_document_id = kInvalidDocumentId;
+ for (IcingDynamicTrie::Iterator term_itr(*main_lexicon_, /*prefix=*/"",
+ /*reverse=*/true);
+ term_itr.IsValid(); term_itr.Advance()) {
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ memcpy(&posting_list_id, term_itr.GetValue(), sizeof(posting_list_id));
+ if (posting_list_id == PostingListIdentifier::kInvalid) {
+      // This should not happen for a term stored in the main lexicon; log it
+      // and skip the term rather than failing the whole transfer.
+ ICING_LOG(ERROR)
+ << "Got invalid posting_list_id from previous main index";
+ continue;
+ }
+ ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_id));
+ ICING_ASSIGN_OR_RETURN(
+ DocumentId curr_largest_document_id,
+ TransferAndAddHits(document_id_old_to_new, term_itr.GetKey(),
+ pl_accessor, new_index));
+ if (curr_largest_document_id == kInvalidDocumentId) {
+ continue;
+ }
+ if (largest_document_id == kInvalidDocumentId ||
+ curr_largest_document_id > largest_document_id) {
+ largest_document_id = curr_largest_document_id;
+ }
+ }
+ new_index->flash_index_storage_->set_last_indexed_docid(largest_document_id);
+ return libtextclassifier3::Status::OK;
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h
index d6f7d5f..15030b0 100644
--- a/icing/index/main/main-index.h
+++ b/icing/index/main/main-index.h
@@ -188,10 +188,20 @@ class MainIndex {
// postings lists.
std::string GetDebugInfo(DebugInfoVerbosity::Code verbosity) const;
+ // Reduces internal file sizes by reclaiming space of deleted documents.
+ //
+ // Returns:
+ // OK on success
+  //   INTERNAL_ERROR on IO error; this indicates that the index may be in an
+ // invalid state and should be cleared.
+ libtextclassifier3::Status Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new);
+
private:
- libtextclassifier3::Status Init(const std::string& index_directory,
- const Filesystem* filesystem,
- const IcingFilesystem* icing_filesystem);
+ MainIndex(const std::string& index_directory, const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem);
+
+ libtextclassifier3::Status Init();
// Helpers for merging the lexicon
// Add all 'backfill' branch points. Backfill branch points are prefix
@@ -287,6 +297,27 @@ class MainIndex {
PostingListIdentifier backfill_posting_list_id,
PostingListAccessor* hit_accum);
+ // Transfer hits from old_pl_accessor to new_index for term.
+ //
+ // Returns:
+ // largest document id added to the translated posting list, on success
+ // INTERNAL_ERROR on IO error
+ static libtextclassifier3::StatusOr<DocumentId> TransferAndAddHits(
+ const std::vector<DocumentId>& document_id_old_to_new, const char* term,
+ PostingListAccessor& old_pl_accessor, MainIndex* new_index);
+
+ // Transfer hits from the current main index to new_index.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ MainIndex* new_index);
+
+ std::string base_dir_;
+ const Filesystem* filesystem_;
+ const IcingFilesystem* icing_filesystem_;
std::unique_ptr<FlashIndexStorage> flash_index_storage_;
std::unique_ptr<IcingDynamicTrie> main_lexicon_;
};
diff --git a/icing/jni/icing-search-engine-jni.cc b/icing/jni/icing-search-engine-jni.cc
index 17bb059..c9e7127 100644
--- a/icing/jni/icing-search-engine-jni.cc
+++ b/icing/jni/icing-search-engine-jni.cc
@@ -17,12 +17,11 @@
#include <string>
#include <utility>
+#include <google/protobuf/message_lite.h>
+#include "icing/icing-search-engine.h"
#include "icing/jni/jni-cache.h"
#include "icing/jni/scoped-primitive-array-critical.h"
#include "icing/jni/scoped-utf-chars.h"
-#include <google/protobuf/message_lite.h>
-#include "icing/absl_ports/status_imports.h"
-#include "icing/icing-search-engine.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
#include "icing/proto/optimize.pb.h"
diff --git a/icing/jni/jni-cache.cc b/icing/jni/jni-cache.cc
index 9b75db6..1804b9a 100644
--- a/icing/jni/jni-cache.cc
+++ b/icing/jni/jni-cache.cc
@@ -159,8 +159,7 @@ libtextclassifier3::StatusOr<std::unique_ptr<JniCache>> JniCache::Create(
// BreakIteratorBatcher
ICING_GET_CLASS_OR_RETURN_NULL(
- breakiterator,
- "com/google/android/icing/BreakIteratorBatcher");
+ breakiterator, "com/google/android/icing/BreakIteratorBatcher");
ICING_GET_METHOD(breakiterator, constructor, "<init>",
"(Ljava/util/Locale;)V");
ICING_GET_METHOD(breakiterator, settext, "setText", "(Ljava/lang/String;)V");
diff --git a/icing/jni/scoped-primitive-array-critical_test.cc b/icing/jni/scoped-primitive-array-critical_test.cc
new file mode 100644
index 0000000..3655378
--- /dev/null
+++ b/icing/jni/scoped-primitive-array-critical_test.cc
@@ -0,0 +1,140 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/jni/scoped-primitive-array-critical.h"
+
+#include <jni.h>
+
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "util/java/mock_jni_env.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsNull;
+using ::testing::Return;
+using util::java::test::MockJNIEnv;
+
+TEST(ScopedJniClassesTest, ScopedPrimitiveArrayNull) {
+ auto env_mock = std::make_unique<MockJNIEnv>();
+ // Construct a scoped primitive array normally.
+ ScopedPrimitiveArrayCritical<uint8_t> scoped_primitive_array(
+ env_mock.get(), /*array=*/nullptr);
+ EXPECT_THAT(scoped_primitive_array.data(), IsNull());
+ EXPECT_THAT(scoped_primitive_array.size(), Eq(0));
+
+ // Move construct a scoped primitive array
+ ScopedPrimitiveArrayCritical<uint8_t> moved_scoped_primitive_array(
+ std::move(scoped_primitive_array));
+ EXPECT_THAT(moved_scoped_primitive_array.data(), IsNull());
+ EXPECT_THAT(moved_scoped_primitive_array.size(), Eq(0));
+
+ // Move assign a scoped primitive array
+ ScopedPrimitiveArrayCritical<uint8_t> move_assigned_scoped_primitive_array =
+ std::move(moved_scoped_primitive_array);
+ EXPECT_THAT(move_assigned_scoped_primitive_array.data(), IsNull());
+ EXPECT_THAT(move_assigned_scoped_primitive_array.size(), Eq(0));
+}
+
+TEST(ScopedJniClassesTest, ScopedPrimitiveArrayConstruction) {
+ auto env_mock = std::make_unique<MockJNIEnv>();
+ // Construct a scoped primitive array normally.
+ jarray fake_jarray = reinterpret_cast<jarray>(-303);
+ uint8_t fake_array[] = {1, 8, 63, 90};
+ ON_CALL(*env_mock, GetPrimitiveArrayCritical(Eq(fake_jarray), IsNull()))
+ .WillByDefault(Return(fake_array));
+ ON_CALL(*env_mock, GetArrayLength(Eq(fake_jarray))).WillByDefault(Return(4));
+
+ ScopedPrimitiveArrayCritical<uint8_t> scoped_primitive_array(
+ env_mock.get(),
+ /*array=*/fake_jarray);
+ EXPECT_THAT(scoped_primitive_array.data(), Eq(fake_array));
+ EXPECT_THAT(scoped_primitive_array.size(), Eq(4));
+
+ EXPECT_CALL(*env_mock, ReleasePrimitiveArrayCritical(Eq(fake_jarray),
+ Eq(fake_array), Eq(0)))
+ .Times(1);
+}
+
+TEST(ScopedJniClassesTest, ScopedPrimitiveArrayMoveConstruction) {
+ auto env_mock = std::make_unique<MockJNIEnv>();
+ // Construct a scoped primitive array normally.
+ jarray fake_jarray = reinterpret_cast<jarray>(-303);
+ uint8_t fake_array[] = {1, 8, 63, 90};
+ ON_CALL(*env_mock, GetPrimitiveArrayCritical(Eq(fake_jarray), IsNull()))
+ .WillByDefault(Return(fake_array));
+ ON_CALL(*env_mock, GetArrayLength(Eq(fake_jarray))).WillByDefault(Return(4));
+
+ ScopedPrimitiveArrayCritical<uint8_t> scoped_primitive_array(
+ env_mock.get(),
+ /*array=*/fake_jarray);
+
+ // Move construct a scoped primitive array
+ ScopedPrimitiveArrayCritical<uint8_t> moved_scoped_primitive_array(
+ std::move(scoped_primitive_array));
+ EXPECT_THAT(moved_scoped_primitive_array.data(), Eq(fake_array));
+ EXPECT_THAT(moved_scoped_primitive_array.size(), Eq(4));
+
+ EXPECT_CALL(*env_mock, ReleasePrimitiveArrayCritical(Eq(fake_jarray),
+ Eq(fake_array), Eq(0)))
+ .Times(1);
+}
+
+TEST(ScopedJniClassesTest, ScopedPrimitiveArrayMoveAssignment) {
+ // Set up the mock to return:
+ // {1, 8, 63, 90} for jarray (-303)
+ // {5, 9, 82} for jarray (-505)
+ auto env_mock = std::make_unique<MockJNIEnv>();
+ jarray fake_jarray1 = reinterpret_cast<jarray>(-303);
+ uint8_t fake_array1[] = {1, 8, 63, 90};
+ ON_CALL(*env_mock, GetPrimitiveArrayCritical(Eq(fake_jarray1), IsNull()))
+ .WillByDefault(Return(fake_array1));
+ ON_CALL(*env_mock, GetArrayLength(Eq(fake_jarray1))).WillByDefault(Return(4));
+
+ jarray fake_jarray2 = reinterpret_cast<jarray>(-505);
+ uint8_t fake_array2[] = {5, 9, 82};
+ ON_CALL(*env_mock, GetPrimitiveArrayCritical(Eq(fake_jarray2), IsNull()))
+ .WillByDefault(Return(fake_array2));
+ ON_CALL(*env_mock, GetArrayLength(Eq(fake_jarray2))).WillByDefault(Return(3));
+
+ ScopedPrimitiveArrayCritical<uint8_t> scoped_primitive_array1(
+ env_mock.get(),
+ /*array=*/fake_jarray1);
+ ScopedPrimitiveArrayCritical<uint8_t> scoped_primitive_array2(
+ env_mock.get(),
+ /*array=*/fake_jarray2);
+
+ // Move assign a scoped primitive array
+ scoped_primitive_array2 = std::move(scoped_primitive_array1);
+ EXPECT_THAT(scoped_primitive_array2.data(), Eq(fake_array1));
+ EXPECT_THAT(scoped_primitive_array2.size(), Eq(4));
+
+ EXPECT_CALL(*env_mock, ReleasePrimitiveArrayCritical(Eq(fake_jarray1),
+ Eq(fake_array1), Eq(0)))
+ .Times(1);
+ EXPECT_CALL(*env_mock, ReleasePrimitiveArrayCritical(Eq(fake_jarray2),
+ Eq(fake_array2), Eq(0)))
+ .Times(1);
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/jni/scoped-utf-chars.h b/icing/jni/scoped-utf-chars.h
index 2dafcc1..5a3ac6a 100644
--- a/icing/jni/scoped-utf-chars.h
+++ b/icing/jni/scoped-utf-chars.h
@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-
#ifndef ICING_JNI_SCOPED_UTF_CHARS_H_
#define ICING_JNI_SCOPED_UTF_CHARS_H_
diff --git a/icing/jni/scoped-utf-chars_test.cc b/icing/jni/scoped-utf-chars_test.cc
new file mode 100644
index 0000000..d249f69
--- /dev/null
+++ b/icing/jni/scoped-utf-chars_test.cc
@@ -0,0 +1,126 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/jni/scoped-utf-chars.h"
+
+#include <jni.h>
+
+#include <string>
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "util/java/mock_jni_env.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsNull;
+using ::testing::Return;
+using util::java::test::MockJNIEnv;
+
+TEST(ScopedJniClassesTest, ScopedUtfCharsNull) {
+ auto env_mock = std::make_unique<MockJNIEnv>();
+ // Construct a scoped utf chars normally.
+ ScopedUtfChars scoped_utf_chars(env_mock.get(), /*s=*/nullptr);
+ EXPECT_THAT(scoped_utf_chars.c_str(), IsNull());
+ EXPECT_THAT(scoped_utf_chars.size(), Eq(0));
+
+ // Move construct a scoped utf chars
+ ScopedUtfChars moved_scoped_utf_chars(std::move(scoped_utf_chars));
+ EXPECT_THAT(moved_scoped_utf_chars.c_str(), IsNull());
+ EXPECT_THAT(moved_scoped_utf_chars.size(), Eq(0));
+
+ // Move assign a scoped utf chars
+ ScopedUtfChars move_assigned_scoped_utf_chars =
+ std::move(moved_scoped_utf_chars);
+ EXPECT_THAT(move_assigned_scoped_utf_chars.c_str(), IsNull());
+ EXPECT_THAT(move_assigned_scoped_utf_chars.size(), Eq(0));
+}
+
+TEST(ScopedJniClassesTest, ScopedUtfCharsConstruction) {
+ auto env_mock = std::make_unique<MockJNIEnv>();
+ // Construct a scoped utf chars normally.
+ jstring fake_jstring = reinterpret_cast<jstring>(-303);
+ std::string fake_string = "foo";
+ ON_CALL(*env_mock, GetStringUTFChars(Eq(fake_jstring), IsNull()))
+ .WillByDefault(Return(fake_string.c_str()));
+
+ ScopedUtfChars scoped_utf_chars(env_mock.get(), /*s=*/fake_jstring);
+ EXPECT_THAT(scoped_utf_chars.c_str(), Eq(fake_string.c_str()));
+ EXPECT_THAT(scoped_utf_chars.size(), Eq(3));
+
+ EXPECT_CALL(*env_mock,
+ ReleaseStringUTFChars(Eq(fake_jstring), Eq(fake_string.c_str())))
+ .Times(1);
+}
+
+TEST(ScopedJniClassesTest, ScopedUtfCharsMoveConstruction) {
+ auto env_mock = std::make_unique<MockJNIEnv>();
+ // Construct a scoped utf chars normally.
+ jstring fake_jstring = reinterpret_cast<jstring>(-303);
+ std::string fake_string = "foo";
+ ON_CALL(*env_mock, GetStringUTFChars(Eq(fake_jstring), IsNull()))
+ .WillByDefault(Return(fake_string.c_str()));
+
+ ScopedUtfChars scoped_utf_chars(env_mock.get(), /*s=*/fake_jstring);
+
+ // Move construct a scoped utf chars
+ ScopedUtfChars moved_scoped_utf_chars(std::move(scoped_utf_chars));
+ EXPECT_THAT(moved_scoped_utf_chars.c_str(), Eq(fake_string.c_str()));
+ EXPECT_THAT(moved_scoped_utf_chars.size(), Eq(3));
+
+ EXPECT_CALL(*env_mock,
+ ReleaseStringUTFChars(Eq(fake_jstring), Eq(fake_string.c_str())))
+ .Times(1);
+}
+
+TEST(ScopedJniClassesTest, ScopedUtfCharsMoveAssignment) {
+ // Set up the mock to return:
+ // "foo" for jstring (-303)
+ // "bar baz" for jstring (-505)
+ auto env_mock = std::make_unique<MockJNIEnv>();
+ jstring fake_jstring1 = reinterpret_cast<jstring>(-303);
+ std::string fake_string1 = "foo";
+ ON_CALL(*env_mock, GetStringUTFChars(Eq(fake_jstring1), IsNull()))
+ .WillByDefault(Return(fake_string1.c_str()));
+
+ jstring fake_jstring2 = reinterpret_cast<jstring>(-505);
+ std::string fake_string2 = "bar baz";
+ ON_CALL(*env_mock, GetStringUTFChars(Eq(fake_jstring2), IsNull()))
+ .WillByDefault(Return(fake_string2.c_str()));
+
+ ScopedUtfChars scoped_utf_chars1(env_mock.get(), /*s=*/fake_jstring1);
+ ScopedUtfChars scoped_utf_chars2(env_mock.get(), /*s=*/fake_jstring2);
+
+ // Move assign a scoped utf chars
+ scoped_utf_chars2 = std::move(scoped_utf_chars1);
+ EXPECT_THAT(scoped_utf_chars2.c_str(), Eq(fake_string1.c_str()));
+ EXPECT_THAT(scoped_utf_chars2.size(), Eq(3));
+
+ EXPECT_CALL(*env_mock, ReleaseStringUTFChars(Eq(fake_jstring1),
+ Eq(fake_string1.c_str())))
+ .Times(1);
+ EXPECT_CALL(*env_mock, ReleaseStringUTFChars(Eq(fake_jstring2),
+ Eq(fake_string2.c_str())))
+ .Times(1);
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/index/icing-array-storage.cc b/icing/legacy/index/icing-array-storage.cc
index 4d2ef67..de5178a 100644
--- a/icing/legacy/index/icing-array-storage.cc
+++ b/icing/legacy/index/icing-array-storage.cc
@@ -65,17 +65,13 @@ bool IcingArrayStorage::Init(int fd, size_t fd_offset, bool map_shared,
return false;
}
if (file_size < fd_offset) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Array storage file size %" PRIu64 " less than offset %zu", file_size,
- fd_offset);
+ ICING_LOG(ERROR) << "Array storage file size " << file_size << " less than offset " << fd_offset;
return false;
}
uint32_t capacity_num_elts = (file_size - fd_offset) / elt_size;
if (capacity_num_elts < num_elts) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Array storage num elts %u > capacity num elts %u", num_elts,
- capacity_num_elts);
+ ICING_LOG(ERROR) << "Array storage num elts " << num_elts << " > capacity num elts " << capacity_num_elts;
return false;
}
@@ -108,8 +104,7 @@ bool IcingArrayStorage::Init(int fd, size_t fd_offset, bool map_shared,
if (init_crc) {
*crc_ptr_ = crc;
} else if (crc != *crc_ptr_) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Array storage bad crc %u vs %u", crc, *crc_ptr_);
+ ICING_LOG(ERROR) << "Array storage bad crc " << crc << " vs " << *crc_ptr_;
goto failed;
}
}
@@ -276,9 +271,9 @@ void IcingArrayStorage::UpdateCrc() {
cur_offset += change.elt_len * elt_size_;
}
if (!changes_.empty()) {
- ICING_VLOG(2) << IcingStringUtil::StringPrintf(
- "Array update partial crcs %d truncated %d overlapped %d duplicate %d",
- num_partial_crcs, num_truncated, num_overlapped, num_duplicate);
+ ICING_VLOG(2) << "Array update partial crcs " << num_partial_crcs
+ << " truncated " << num_truncated << " overlapped " << num_overlapped
+ << " duplicate " << num_duplicate;
}
// Now update with grown area.
@@ -286,8 +281,7 @@ void IcingArrayStorage::UpdateCrc() {
cur_crc = IcingStringUtil::UpdateCrc32(
cur_crc, array_cast<char>() + changes_end_ * elt_size_,
(cur_num_ - changes_end_) * elt_size_);
- ICING_VLOG(2) << IcingStringUtil::StringPrintf(
- "Array update tail crc offset %u -> %u", changes_end_, cur_num_);
+ ICING_VLOG(2) << "Array update tail crc offset " << changes_end_ << " -> " << cur_num_;
}
// Clear, now that we've applied changes.
@@ -341,8 +335,7 @@ uint32_t IcingArrayStorage::Sync() {
if (pwrite(fd_, array() + dirty_start, dirty_end - dirty_start,
fd_offset_ + dirty_start) !=
static_cast<ssize_t>(dirty_end - dirty_start)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flushing pages failed (%u, %u)", dirty_start, dirty_end);
+ ICING_LOG(ERROR) << "Flushing pages failed (" << dirty_start << ", " << dirty_end << ")";
}
in_dirty = false;
} else if (!in_dirty && is_dirty) {
@@ -361,8 +354,7 @@ uint32_t IcingArrayStorage::Sync() {
if (pwrite(fd_, array() + dirty_start, dirty_end - dirty_start,
fd_offset_ + dirty_start) !=
static_cast<ssize_t>(dirty_end - dirty_start)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flushing pages failed (%u, %u)", dirty_start, dirty_end);
+ ICING_LOG(ERROR) << "Flushing pages failed (" << dirty_start << ", " << dirty_end << ")";
}
}
@@ -377,9 +369,7 @@ uint32_t IcingArrayStorage::Sync() {
}
if (num_flushed > 0) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Flushing %u/%u %u contiguous pages in %.3fms", num_flushed,
- dirty_pages_size, num_contiguous, timer.Elapsed() * 1000.);
+ ICING_VLOG(1) << "Flushing " << num_flushed << "/" << dirty_pages_size << " " << num_contiguous << " contiguous pages in " << timer.Elapsed() * 1000 << "ms.";
}
return num_flushed;
diff --git a/icing/legacy/index/icing-common-types.h b/icing/legacy/index/icing-common-types.h
deleted file mode 100644
index 592b549..0000000
--- a/icing/legacy/index/icing-common-types.h
+++ /dev/null
@@ -1,129 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Copyright 2014 Google Inc. All Rights Reserved.
-// Author: sbanacho@google.com (Scott Banachowski)
-// Author: csyoung@google.com (C. Sean Young)
-
-#ifndef ICING_LEGACY_INDEX_ICING_COMMON_TYPES_H_
-#define ICING_LEGACY_INDEX_ICING_COMMON_TYPES_H_
-
-#include "icing/legacy/core/icing-core-types.h"
-
-// Protocol buffers are shared across several components.
-namespace com {
-namespace google {
-namespace android {
-namespace gms {
-namespace icing {
-namespace lib {
-
-class ClientFileGroup;
-class Document;
-class Document_Section;
-class DocumentStoreStatusProto;
-class IMEUpdate;
-class IMEUpdateResponse;
-class IndexCorpusScoringConfig;
-class IndexCorpusScoringConfig_Section;
-class IndexScoringConfig;
-class InitStatus;
-class InitStatus_CorpusInitInfo;
-class PendingDeleteUsageReport;
-class PhraseAffinityRequest;
-class QueryResponse;
-class QueryResponse_Corpus;
-class QueryResponse_Corpus_Section;
-class QueryResponse_Corpus_Tag;
-class QueryRequestSpec;
-class QueryRequestSpec_CorpusSpec;
-class QueryRequestSpec_SectionSpec;
-class ResponseDebugInfo;
-class ResultDebugInfo;
-class SectionConfig;
-class SuggestionResponse;
-class SuggestionResponse_Suggestion;
-class UsageReportsResponse;
-class UsageStats;
-class UsageStats_Corpus;
-
-} // namespace lib
-} // namespace icing
-} // namespace gms
-} // namespace android
-} // namespace google
-} // namespace com
-
-namespace icing {
-namespace lib {
-
-// Typedefs.
-using IcingDocId = uint32_t;
-
-using IcingSectionId = uint32_t;
-
-using IcingCorpusId = uint16_t;
-using IcingSectionIdMask = uint16_t;
-
-using IcingTagsCount = uint16_t;
-
-using IcingSequenceNumber = int64_t;
-
-using IcingScore = uint64_t;
-
-constexpr size_t kIcingMaxTokenLen = 30; // default shared between query
- // processor and indexer
-constexpr int kIcingQueryTermLimit = 50; // Maximum number of terms in a query
-constexpr int kIcingMaxVariantsPerToken = 10; // Maximum number of variants
-
-// LINT.IfChange
-constexpr int kIcingDocIdBits = 20; // 1M docs
-constexpr IcingDocId kIcingInvalidDocId = (1u << kIcingDocIdBits) - 1;
-constexpr IcingDocId kIcingMaxDocId = kIcingInvalidDocId - 1;
-// LINT.ThenChange(//depot/google3/wireless/android/icing/plx/google_sql_common_macros.sql)
-
-constexpr int kIcingDocScoreBits = 32;
-
-constexpr int kIcingSectionIdBits = 4; // 4 bits for 16 values
-constexpr IcingSectionId kIcingMaxSectionId = (1u << kIcingSectionIdBits) - 1;
-constexpr IcingSectionId kIcingInvalidSectionId = kIcingMaxSectionId + 1;
-constexpr IcingSectionIdMask kIcingSectionIdMaskAll = ~IcingSectionIdMask{0};
-constexpr IcingSectionIdMask kIcingSectionIdMaskNone = IcingSectionIdMask{0};
-
-constexpr int kIcingCorpusIdBits = 15; // 32K
-constexpr IcingCorpusId kIcingInvalidCorpusId = (1u << kIcingCorpusIdBits) - 1;
-constexpr IcingCorpusId kIcingMaxCorpusId = kIcingInvalidCorpusId - 1;
-
-constexpr size_t kIcingMaxSearchableDocumentSize = (1u << 16) - 1; // 64K
-// Max num tokens per document. 64KB is our original maximum (searchable)
-// document size. We clip if document exceeds this.
-constexpr uint32_t kIcingMaxNumTokensPerDoc =
- kIcingMaxSearchableDocumentSize / 5;
-constexpr uint32_t kIcingMaxNumHitsPerDocument =
- kIcingMaxNumTokensPerDoc * kIcingMaxVariantsPerToken;
-
-constexpr IcingTagsCount kIcingInvalidTagCount = ~IcingTagsCount{0};
-constexpr IcingTagsCount kIcingMaxTagCount = kIcingInvalidTagCount - 1;
-
-// Location refers to document storage.
-constexpr uint64_t kIcingInvalidLocation = ~uint64_t{0};
-constexpr uint64_t kIcingMaxDocStoreWriteLocation = uint64_t{1}
- << 32; // 4bytes.
-
-// Dump symbols in the proto namespace.
-using namespace ::com::google::android::gms::icing; // NOLINT(build/namespaces)
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_LEGACY_INDEX_ICING_COMMON_TYPES_H_
diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc
index 4428599..c6816ad 100644
--- a/icing/legacy/index/icing-dynamic-trie.cc
+++ b/icing/legacy/index/icing-dynamic-trie.cc
@@ -460,8 +460,7 @@ bool IcingDynamicTrie::IcingDynamicTrieStorage::Init() {
if (i == 0) {
// Header.
if (file_size != IcingMMapper::system_page_size()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Trie hdr wrong size: %" PRIu64, file_size);
+ ICING_LOG(ERROR) << "Trie hdr wrong size: " << file_size;
goto failed;
}
@@ -522,8 +521,7 @@ bool IcingDynamicTrie::IcingDynamicTrieStorage::Init() {
sizeof(char), hdr_.hdr.suffixes_size(),
hdr_.hdr.max_suffixes_size(),
&crcs_->array_crcs[SUFFIX], init_crcs)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Trie mmap suffix failed");
+ ICING_LOG(ERROR) << "Trie mmap suffix failed";
goto failed;
}
@@ -671,8 +669,7 @@ bool IcingDynamicTrie::IcingDynamicTrieStorage::Sync() {
}
if (!WriteHeader()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flushing trie header failed: %s", strerror(errno));
+ ICING_LOG(ERROR) << "Flushing trie header failed: " << strerror(errno);
success = false;
}
@@ -686,8 +683,7 @@ bool IcingDynamicTrie::IcingDynamicTrieStorage::Sync() {
}
if (total_flushed > 0) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf("Flushing %u pages of trie",
- total_flushed);
+ ICING_VLOG(1) << "Flushing " << total_flushed << " pages of trie";
}
return success;
@@ -817,8 +813,7 @@ uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::UpdateCrc() {
uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::UpdateCrcInternal(
bool write_hdr) {
if (write_hdr && !WriteHeader()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flushing trie header failed: %s", strerror(errno));
+ ICING_LOG(ERROR) << "Flushing trie header failed: " << strerror(errno);
}
crcs_->header_crc = GetHeaderCrc();
@@ -912,8 +907,7 @@ bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::SerializeToArray(
bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::Verify() {
// Check version.
if (hdr.version() != kCurVersion) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Trie version %u mismatch", hdr.version());
+ ICING_LOG(ERROR) << "Trie version " << hdr.version() << " mismatch";
return false;
}
@@ -1155,9 +1149,8 @@ bool IcingDynamicTrie::Sync() {
Warm();
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Syncing dynamic trie %s took %.3fms", filename_base_.c_str(),
- timer.Elapsed() * 1000.);
+ ICING_VLOG(1) << "Syncing dynamic trie " << filename_base_.c_str()
+ << " took " << timer.Elapsed() * 1000 << "ms";
return success;
}
@@ -1207,8 +1200,7 @@ std::unique_ptr<IcingFlashBitmap> IcingDynamicTrie::OpenAndInitBitmap(
const IcingFilesystem *filesystem) {
auto bitmap = std::make_unique<IcingFlashBitmap>(filename, filesystem);
if (!bitmap->Init() || (verify && !bitmap->Verify())) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Init of %s failed",
- filename.c_str());
+ ICING_LOG(ERROR) << "Init of " << filename.c_str() << " failed";
return nullptr;
}
return bitmap;
@@ -1238,16 +1230,14 @@ bool IcingDynamicTrie::InitPropertyBitmaps() {
vector<std::string> files;
if (!filesystem_->GetMatchingFiles((property_bitmaps_prefix_ + "*").c_str(),
&files)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Could not get files at prefix %s", property_bitmaps_prefix_.c_str());
+ ICING_LOG(ERROR) << "Could not get files at prefix " << property_bitmaps_prefix_;
goto failed;
}
for (size_t i = 0; i < files.size(); i++) {
// Decode property id from filename.
size_t property_id_start_idx = files[i].rfind('.');
if (property_id_start_idx == std::string::npos) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Malformed filename %s",
- files[i].c_str());
+ ICING_LOG(ERROR) << "Malformed filename " << files[i];
continue;
}
property_id_start_idx++; // skip dot
@@ -1255,8 +1245,7 @@ bool IcingDynamicTrie::InitPropertyBitmaps() {
uint32_t property_id =
strtol(files[i].c_str() + property_id_start_idx, &end, 10); // NOLINT
if (!end || end != (files[i].c_str() + files[i].size())) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Malformed filename %s",
- files[i].c_str());
+ ICING_LOG(ERROR) << "Malformed filename " << files[i];
continue;
}
std::unique_ptr<IcingFlashBitmap> bitmap = OpenAndInitBitmap(
@@ -1264,8 +1253,7 @@ bool IcingDynamicTrie::InitPropertyBitmaps() {
runtime_options_.storage_policy == RuntimeOptions::kMapSharedWithCrc,
filesystem_);
if (!bitmap) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Open prop bitmap failed: %s", files[i].c_str());
+ ICING_LOG(ERROR) << "Open prop bitmap failed: " << files[i];
goto failed;
}
bitmap->Truncate(truncate_idx);
@@ -1795,11 +1783,12 @@ bool IcingDynamicTrie::Find(const char *key, void *value,
}
IcingDynamicTrie::Iterator::Iterator(const IcingDynamicTrie &trie,
- const char *prefix)
+ const char *prefix, bool reverse)
: cur_key_(prefix),
cur_suffix_(nullptr),
cur_suffix_len_(0),
single_leaf_match_(false),
+ reverse_(reverse),
trie_(trie) {
if (!trie.is_initialized()) {
ICING_LOG(FATAL) << "DynamicTrie not initialized";
@@ -1808,19 +1797,29 @@ IcingDynamicTrie::Iterator::Iterator(const IcingDynamicTrie &trie,
Reset();
}
-void IcingDynamicTrie::Iterator::LeftBranchToLeaf(uint32_t node_index) {
+void IcingDynamicTrie::Iterator::BranchToLeaf(uint32_t node_index,
+ BranchType branch_type) {
// Go down the trie, following the left-most child until we hit a
// leaf. Push to stack and cur_key nodes and chars as we go.
- for (; !trie_.storage_->GetNode(node_index)->is_leaf();
- node_index =
- trie_.storage_
- ->GetNext(trie_.storage_->GetNode(node_index)->next_index(), 0)
- ->node_index()) {
- branch_stack_.push_back(Branch(node_index));
- cur_key_.push_back(
- trie_.storage_
- ->GetNext(trie_.storage_->GetNode(node_index)->next_index(), 0)
- ->val());
+ // When branch_type is kRightMost, the right-most child is followed instead.
+ const Node *node = trie_.storage_->GetNode(node_index);
+ while (!node->is_leaf()) {
+ const Next *next_start = trie_.storage_->GetNext(node->next_index(), 0);
+ int child_idx;
+ if (branch_type == BranchType::kRightMost) {
+ uint32_t next_array_size = 1u << node->log2_num_children();
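+ // Valid entries are packed at the front of the next array, so the last
+ // valid entry is the right-most child.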
+ child_idx = trie_.GetValidNextsSize(next_start, next_array_size) - 1;
+ } else {
+ // node isn't a leaf. So it must have >0 children.
+ // 0 is the left-most child.
+ child_idx = 0;
+ }
+ const Next &child_next = next_start[child_idx];
+ branch_stack_.push_back(Branch(node_index, child_idx));
+ cur_key_.push_back(child_next.val());
+
+ node_index = child_next.node_index();
+ node = trie_.storage_->GetNode(node_index);
}
// We're at a leaf.
@@ -1856,7 +1855,7 @@ void IcingDynamicTrie::Iterator::Reset() {
// Two cases/states:
//
// - Found an intermediate node. If we matched all of prefix
- // (cur_key_), LeftBranchToLeaf.
+ // (cur_key_), BranchToLeaf.
//
// - Found a leaf node, which is the ONLY matching key for this
// prefix. Check that suffix matches the prefix. Then we set
@@ -1879,7 +1878,9 @@ void IcingDynamicTrie::Iterator::Reset() {
cur_suffix_len_ = strlen(cur_suffix_);
single_leaf_match_ = true;
} else if (static_cast<size_t>(key_offset) == cur_key_.size()) {
- LeftBranchToLeaf(node_index);
+ BranchType branch_type =
+ (reverse_) ? BranchType::kRightMost : BranchType::kLeftMost;
+ BranchToLeaf(node_index, branch_type);
}
}
@@ -1906,19 +1907,25 @@ bool IcingDynamicTrie::Iterator::Advance() {
while (!branch_stack_.empty()) {
Branch *branch = &branch_stack_.back();
const Node *node = trie_.storage_->GetNode(branch->node_idx);
- branch->child_idx++;
- if (branch->child_idx < (1 << node->log2_num_children()) &&
- trie_.storage_->GetNext(node->next_index(), branch->child_idx)
- ->node_index() != kInvalidNodeIndex) {
- // Successfully incremented to the next child. Update the char
- // value at this depth.
- cur_key_[cur_key_.size() - 1] =
- trie_.storage_->GetNext(node->next_index(), branch->child_idx)->val();
- // We successfully found a sub-trie to explore.
- LeftBranchToLeaf(
- trie_.storage_->GetNext(node->next_index(), branch->child_idx)
- ->node_index());
- return true;
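+ // In reverse mode, children are visited from the highest valid index
+ // down to 0.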
+ if (reverse_) {
+ branch->child_idx--;
+ } else {
+ branch->child_idx++;
+ }
+ if (branch->child_idx >= 0 &&
+ branch->child_idx < (1 << node->log2_num_children())) {
+ const Next *child_next =
+ trie_.storage_->GetNext(node->next_index(), branch->child_idx);
+ if (child_next->node_index() != kInvalidNodeIndex) {
+ // Successfully advanced to the next child. Update the char
+ // value at this depth.
+ cur_key_[cur_key_.size() - 1] = child_next->val();
+ // We successfully found a sub-trie to explore.
+ BranchType branch_type =
+ (reverse_) ? BranchType::kRightMost : BranchType::kLeftMost;
+ BranchToLeaf(child_next->node_index(), branch_type);
+ return true;
+ }
}
branch_stack_.pop_back();
cur_key_.resize(cur_key_.size() - 1);
@@ -2108,7 +2115,8 @@ const IcingDynamicTrie::Next *IcingDynamicTrie::GetNextByChar(
}
int IcingDynamicTrie::GetValidNextsSize(
- IcingDynamicTrie::Next *next_array_start, int next_array_length) const {
+ const IcingDynamicTrie::Next *next_array_start,
+ int next_array_length) const {
// Only searching for key char 0xff is not sufficient, as 0xff can be a valid
// character. We must also specify kInvalidNodeIndex as the target node index
// when searching the next array.
@@ -2295,15 +2303,16 @@ bool IcingDynamicTrie::IsBranchingTerm(const char *key) const {
return false;
}
- // key is not present in the trie.
+ // There is no intermediate node for key in the trie.
if (key[key_offset] != '\0') {
return false;
}
// Found key as an intermediate node, but key is not a valid term stored in
- // the trie.
+ // the trie. In this case, we need at least two children for key to be a
+ // branching term.
if (GetNextByChar(cur_node, '\0') == nullptr) {
- return false;
+ return cur_node->log2_num_children() >= 1;
}
// The intermediate node for key must have more than two children for key to
@@ -2320,8 +2329,7 @@ void IcingDynamicTrie::GetDebugInfo(int verbosity, std::string *out) const {
vector<std::string> files;
if (!filesystem_->GetMatchingFiles((property_bitmaps_prefix_ + "*").c_str(),
&files)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Could not get files at prefix %s", property_bitmaps_prefix_.c_str());
+ ICING_LOG(ERROR) << "Could not get files at prefix " << property_bitmaps_prefix_;
return;
}
for (size_t i = 0; i < files.size(); i++) {
@@ -2393,8 +2401,7 @@ IcingFlashBitmap *IcingDynamicTrie::OpenOrCreatePropertyBitmap(
}
if (property_id > kMaxPropertyId) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Property id %u out of range", property_id);
+ ICING_LOG(ERROR) << "Property id " << property_id << " out of range";
return nullptr;
}
@@ -2567,8 +2574,7 @@ bool IcingDynamicTrie::ClearPropertyForAllValues(uint32_t property_id) {
PropertyReadersAll readers(*this);
if (!readers.Exists(property_id)) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Properties for id %u don't exist", property_id);
+ ICING_VLOG(1) << "Properties for id " << property_id << " don't exist";
return true;
}
diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h
index ec8b31a..b172632 100644
--- a/icing/legacy/index/icing-dynamic-trie.h
+++ b/icing/legacy/index/icing-dynamic-trie.h
@@ -405,9 +405,6 @@ class IcingDynamicTrie : public IIcingStorage {
// key is a branching term, if and only if there exists terms s1 and s2 in the
// trie such that key is the maximum common prefix of s1 and s2, but s1 and s2
// are not prefixes of each other.
- //
- // The function assumes that key is already present in the trie. Otherwise,
- // false will be returned.
bool IsBranchingTerm(const char *key) const;
void GetDebugInfo(int verbosity, std::string *out) const override;
@@ -520,7 +517,8 @@ class IcingDynamicTrie : public IIcingStorage {
// Change in underlying trie invalidates iterator.
class Iterator {
public:
- Iterator(const IcingDynamicTrie &trie, const char *prefix);
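+ // When reverse is true, the iterator visits keys in descending
+ // lexicographic order.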
+ Iterator(const IcingDynamicTrie &trie, const char *prefix,
+ bool reverse = false);
void Reset();
bool Advance();
@@ -537,9 +535,10 @@ class IcingDynamicTrie : public IIcingStorage {
Iterator();
// Copy is ok.
- // Helper function that takes the left-most branch down
- // intermediate nodes to a leaf.
- void LeftBranchToLeaf(uint32_t node_index);
+ enum class BranchType { kLeftMost = 0, kRightMost = 1 };
+ // Helper function that takes the left-most or the right-most branch down
+ // intermediate nodes to a leaf, based on branch_type.
+ void BranchToLeaf(uint32_t node_index, BranchType branch_type);
std::string cur_key_;
const char *cur_suffix_;
@@ -548,10 +547,12 @@ class IcingDynamicTrie : public IIcingStorage {
uint32_t node_idx;
int child_idx;
- explicit Branch(uint32_t ni) : node_idx(ni), child_idx(0) {}
+ explicit Branch(uint32_t node_index, int child_index)
+ : node_idx(node_index), child_idx(child_index) {}
};
std::vector<Branch> branch_stack_;
bool single_leaf_match_;
+ bool reverse_;
const IcingDynamicTrie &trie_;
};
@@ -625,7 +626,7 @@ class IcingDynamicTrie : public IIcingStorage {
const Next *LowerBound(const Next *start, const Next *end, uint8_t key_char,
uint32_t node_index = 0) const;
// Returns the number of valid nexts in the array.
- int GetValidNextsSize(IcingDynamicTrie::Next *next_array_start,
+ int GetValidNextsSize(const IcingDynamicTrie::Next *next_array_start,
int next_array_length) const;
void FindBestNode(const char *key, uint32_t *best_node_index, int *key_offset,
bool prefix, bool utf8 = false) const;
diff --git a/icing/legacy/index/icing-dynamic-trie_test.cc b/icing/legacy/index/icing-dynamic-trie_test.cc
index b69ee64..850fcdc 100644
--- a/icing/legacy/index/icing-dynamic-trie_test.cc
+++ b/icing/legacy/index/icing-dynamic-trie_test.cc
@@ -39,6 +39,7 @@ namespace {
using testing::ContainerEq;
using testing::ElementsAre;
+using testing::StrEq;
constexpr std::string_view kKeys[] = {
"", "ab", "ac", "abd", "bac", "bb", "bacd", "abbb", "abcdefg",
@@ -109,6 +110,17 @@ class IcingDynamicTrieTest : public ::testing::Test {
std::string trie_files_prefix_;
};
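+// Drains term_iter, returning each (key, value) pair in iteration order.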
+std::vector<std::pair<std::string, int>> RetrieveKeyValuePairs(
+ IcingDynamicTrie::Iterator& term_iter) {
+ std::vector<std::pair<std::string, int>> key_value;
+ for (; term_iter.IsValid(); term_iter.Advance()) {
+ uint32_t val;
+ memcpy(&val, term_iter.GetValue(), sizeof(val));
+ key_value.push_back(std::make_pair(term_iter.GetKey(), val));
+ }
+ return key_value;
+}
+
constexpr std::string_view kCommonEnglishWords[] = {
"that", "was", "for", "on", "are", "with", "they", "be", "at",
"one", "have", "this", "from", "word", "but", "what", "some", "you",
@@ -161,7 +173,6 @@ TEST_F(IcingDynamicTrieTest, Init) {
TEST_F(IcingDynamicTrieTest, Iterator) {
// Test iterator.
IcingFilesystem filesystem;
- uint32_t val;
IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
&filesystem);
ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
@@ -171,104 +182,161 @@ TEST_F(IcingDynamicTrieTest, Iterator) {
ASSERT_TRUE(trie.Insert(kKeys[i].data(), &i));
}
- // We try everything twice to test that Reset also works.
-
// Should get the entire trie.
+ std::vector<std::pair<std::string, int>> exp_key_values = {
+ {"", 0}, {"ab", 1}, {"abbb", 7}, {"abcdefg", 8}, {"abd", 3},
+ {"ac", 2}, {"bac", 4}, {"bacd", 6}, {"bb", 5}};
IcingDynamicTrie::Iterator it_all(trie, "");
- for (int i = 0; i < 2; i++) {
- uint32_t count = 0;
- for (; it_all.IsValid(); it_all.Advance()) {
- uint32_t val_idx = it_all.GetValueIndex();
- EXPECT_EQ(it_all.GetValue(), trie.GetValueAtIndex(val_idx));
- count++;
- }
- EXPECT_EQ(count, kNumKeys);
- it_all.Reset();
- }
+ std::vector<std::pair<std::string, int>> key_values =
+ RetrieveKeyValuePairs(it_all);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Should get same results after calling Reset
+ it_all.Reset();
+ key_values = RetrieveKeyValuePairs(it_all);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
// Get everything under "a".
+ exp_key_values = {
+ {"ab", 1}, {"abbb", 7}, {"abcdefg", 8}, {"abd", 3}, {"ac", 2}};
IcingDynamicTrie::Iterator it1(trie, "a");
- for (int i = 0; i < 2; i++) {
- ASSERT_TRUE(it1.IsValid());
- EXPECT_STREQ(it1.GetKey(), "ab");
- static const uint32_t kOne = 1;
- ASSERT_TRUE(it1.GetValue() != nullptr);
- EXPECT_TRUE(!memcmp(it1.GetValue(), &kOne, sizeof(kOne)));
+ key_values = RetrieveKeyValuePairs(it1);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
- ASSERT_TRUE(it1.Advance());
- ASSERT_TRUE(it1.IsValid());
- EXPECT_STREQ(it1.GetKey(), "abbb");
+ // Should get same results after calling Reset
+ it1.Reset();
+ key_values = RetrieveKeyValuePairs(it1);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
- ASSERT_TRUE(it1.Advance());
- ASSERT_TRUE(it1.IsValid());
- EXPECT_STREQ(it1.GetKey(), "abcdefg");
+ // Now "b".
+ exp_key_values = {{"bac", 4}, {"bacd", 6}, {"bb", 5}};
+ IcingDynamicTrie::Iterator it2(trie, "b");
+ key_values = RetrieveKeyValuePairs(it2);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
- ASSERT_TRUE(it1.Advance());
- ASSERT_TRUE(it1.IsValid());
- EXPECT_STREQ(it1.GetKey(), "abd");
+ // Should get same results after calling Reset
+ it2.Reset();
+ key_values = RetrieveKeyValuePairs(it2);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
- ASSERT_TRUE(it1.Advance());
- ASSERT_TRUE(it1.IsValid());
- EXPECT_STREQ(it1.GetKey(), "ac");
+ // Get everything under "ab".
+ exp_key_values = {{"ab", 1}, {"abbb", 7}, {"abcdefg", 8}, {"abd", 3}};
+ IcingDynamicTrie::Iterator it3(trie, "ab");
+ key_values = RetrieveKeyValuePairs(it3);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
- EXPECT_FALSE(it1.Advance());
- EXPECT_FALSE(it1.IsValid());
+ // Should get same results after calling Reset
+ it3.Reset();
+ key_values = RetrieveKeyValuePairs(it3);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
- it1.Reset();
+ // Should match only one key exactly.
+ constexpr std::string_view kOneMatch[] = {
+ "abd",
+ "abcd",
+ "abcdef",
+ "abcdefg",
+ };
+ // With the following match:
+ constexpr std::string_view kOneMatchMatched[] = {
+ "abd",
+ "abcdefg",
+ "abcdefg",
+ "abcdefg",
+ };
+
+ for (size_t k = 0; k < ABSL_ARRAYSIZE(kOneMatch); k++) {
+ IcingDynamicTrie::Iterator it_single(trie, kOneMatch[k].data());
+ ASSERT_TRUE(it_single.IsValid()) << kOneMatch[k];
+ EXPECT_THAT(it_single.GetKey(), StrEq(kOneMatchMatched[k].data()));
+ EXPECT_FALSE(it_single.Advance()) << kOneMatch[k];
+ EXPECT_FALSE(it_single.IsValid()) << kOneMatch[k];
+
+ // Should get same results after calling Reset
+ it_single.Reset();
+ ASSERT_TRUE(it_single.IsValid()) << kOneMatch[k];
+ EXPECT_THAT(it_single.GetKey(), StrEq(kOneMatchMatched[k].data()));
+ EXPECT_FALSE(it_single.Advance()) << kOneMatch[k];
+ EXPECT_FALSE(it_single.IsValid()) << kOneMatch[k];
}
- // Now "b".
- IcingDynamicTrie::Iterator it2(trie, "b");
- for (int i = 0; i < 2; i++) {
- ASSERT_TRUE(it2.IsValid());
- EXPECT_STREQ(it2.GetKey(), "bac");
- val = 1;
- ASSERT_TRUE(it1.GetValue() != nullptr);
- EXPECT_TRUE(!memcmp(it1.GetValue(), &val, sizeof(val)));
- val = 4;
- ASSERT_TRUE(it2.GetValue() != nullptr);
- EXPECT_TRUE(!memcmp(it2.GetValue(), &val, sizeof(val)));
-
- ASSERT_TRUE(it2.Advance());
- ASSERT_TRUE(it2.IsValid());
- EXPECT_STREQ(it2.GetKey(), "bacd");
-
- ASSERT_TRUE(it2.Advance());
- ASSERT_TRUE(it2.IsValid());
- EXPECT_STREQ(it2.GetKey(), "bb");
-
- EXPECT_FALSE(it2.Advance());
- EXPECT_FALSE(it2.IsValid());
-
- it2.Reset();
+ // Matches nothing.
+ constexpr std::string_view kNoMatch[] = {
+ "abbd",
+ "abcdeg",
+ "abcdefh",
+ };
+ for (size_t k = 0; k < ABSL_ARRAYSIZE(kNoMatch); k++) {
+ IcingDynamicTrie::Iterator it_empty(trie, kNoMatch[k].data());
+ EXPECT_FALSE(it_empty.IsValid());
+ it_empty.Reset();
+ EXPECT_FALSE(it_empty.IsValid());
}
- // Get everything under "ab".
- IcingDynamicTrie::Iterator it3(trie, "ab");
- for (int i = 0; i < 2; i++) {
- ASSERT_TRUE(it3.IsValid());
- EXPECT_STREQ(it3.GetKey(), "ab");
- val = 1;
- ASSERT_TRUE(it3.GetValue() != nullptr);
- EXPECT_TRUE(!memcmp(it3.GetValue(), &val, sizeof(val)));
+ // Clear.
+ trie.Clear();
+ EXPECT_FALSE(IcingDynamicTrie::Iterator(trie, "").IsValid());
+ EXPECT_EQ(0u, trie.size());
+ EXPECT_EQ(1.0, trie.min_free_fraction());
+}
- ASSERT_TRUE(it3.Advance());
- ASSERT_TRUE(it3.IsValid());
- EXPECT_STREQ(it3.GetKey(), "abbb");
+TEST_F(IcingDynamicTrieTest, IteratorReverse) {
+ // Test the reverse iterator.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
- ASSERT_TRUE(it3.Advance());
- ASSERT_TRUE(it3.IsValid());
- EXPECT_STREQ(it3.GetKey(), "abcdefg");
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ ASSERT_TRUE(trie.Insert(kKeys[i].data(), &i));
+ }
- ASSERT_TRUE(it3.Advance());
- ASSERT_TRUE(it3.IsValid());
- EXPECT_STREQ(it3.GetKey(), "abd");
+ // Should get the entire trie.
+ std::vector<std::pair<std::string, int>> exp_key_values = {
+ {"bb", 5}, {"bacd", 6}, {"bac", 4}, {"ac", 2}, {"abd", 3},
+ {"abcdefg", 8}, {"abbb", 7}, {"ab", 1}, {"", 0}};
+ IcingDynamicTrie::Iterator it_all(trie, "", /*reverse=*/true);
+ std::vector<std::pair<std::string, int>> key_values =
+ RetrieveKeyValuePairs(it_all);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+ it_all.Reset();
+ key_values = RetrieveKeyValuePairs(it_all);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
- EXPECT_FALSE(it3.Advance());
- EXPECT_FALSE(it3.IsValid());
+ // Get everything under "a".
+ exp_key_values = {
+ {"ac", 2}, {"abd", 3}, {"abcdefg", 8}, {"abbb", 7}, {"ab", 1}};
+ IcingDynamicTrie::Iterator it1(trie, "a", /*reverse=*/true);
+ key_values = RetrieveKeyValuePairs(it1);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
- it3.Reset();
- }
+ // Should get same results after calling Reset
+ it1.Reset();
+ key_values = RetrieveKeyValuePairs(it1);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Now "b".
+ exp_key_values = {{"bb", 5}, {"bacd", 6}, {"bac", 4}};
+ IcingDynamicTrie::Iterator it2(trie, "b", /*reverse=*/true);
+ key_values = RetrieveKeyValuePairs(it2);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Should get same results after calling Reset
+ it2.Reset();
+ key_values = RetrieveKeyValuePairs(it2);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Get everything under "ab".
+ exp_key_values = {{"abd", 3}, {"abcdefg", 8}, {"abbb", 7}, {"ab", 1}};
+ IcingDynamicTrie::Iterator it3(trie, "ab", /*reverse=*/true);
+ key_values = RetrieveKeyValuePairs(it3);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Should get same results after calling Reset
+ it3.Reset();
+ key_values = RetrieveKeyValuePairs(it3);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
// Should match only one key exactly.
constexpr std::string_view kOneMatch[] = {
@@ -286,15 +354,19 @@ TEST_F(IcingDynamicTrieTest, Iterator) {
};
for (size_t k = 0; k < ABSL_ARRAYSIZE(kOneMatch); k++) {
- IcingDynamicTrie::Iterator it_single(trie, kOneMatch[k].data());
- for (int i = 0; i < 2; i++) {
- ASSERT_TRUE(it_single.IsValid()) << kOneMatch[k];
- EXPECT_STREQ(it_single.GetKey(), kOneMatchMatched[k].data());
- EXPECT_FALSE(it_single.Advance()) << kOneMatch[k];
- EXPECT_FALSE(it_single.IsValid()) << kOneMatch[k];
-
- it_single.Reset();
- }
+ IcingDynamicTrie::Iterator it_single(trie, kOneMatch[k].data(),
+ /*reverse=*/true);
+ ASSERT_TRUE(it_single.IsValid()) << kOneMatch[k];
+ EXPECT_THAT(it_single.GetKey(), StrEq(kOneMatchMatched[k].data()));
+ EXPECT_FALSE(it_single.Advance()) << kOneMatch[k];
+ EXPECT_FALSE(it_single.IsValid()) << kOneMatch[k];
+
+ // Should get same results after calling Reset
+ it_single.Reset();
+ ASSERT_TRUE(it_single.IsValid()) << kOneMatch[k];
+ EXPECT_THAT(it_single.GetKey(), StrEq(kOneMatchMatched[k].data()));
+ EXPECT_FALSE(it_single.Advance()) << kOneMatch[k];
+ EXPECT_FALSE(it_single.IsValid()) << kOneMatch[k];
}
// Matches nothing.
@@ -304,21 +376,65 @@ TEST_F(IcingDynamicTrieTest, Iterator) {
"abcdefh",
};
for (size_t k = 0; k < ABSL_ARRAYSIZE(kNoMatch); k++) {
- IcingDynamicTrie::Iterator it_empty(trie, kNoMatch[k].data());
- for (int i = 0; i < 2; i++) {
- EXPECT_FALSE(it_empty.IsValid());
-
- it_empty.Reset();
- }
+ IcingDynamicTrie::Iterator it_empty(trie, kNoMatch[k].data(),
+ /*reverse=*/true);
+ EXPECT_FALSE(it_empty.IsValid());
+ it_empty.Reset();
+ EXPECT_FALSE(it_empty.IsValid());
}
// Clear.
trie.Clear();
- EXPECT_FALSE(IcingDynamicTrie::Iterator(trie, "").IsValid());
+ EXPECT_FALSE(
+ IcingDynamicTrie::Iterator(trie, "", /*reverse=*/true).IsValid());
EXPECT_EQ(0u, trie.size());
EXPECT_EQ(1.0, trie.min_free_fraction());
}
+TEST_F(IcingDynamicTrieTest, IteratorLoadTest) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ std::default_random_engine random;
+ ICING_LOG(ERROR) << "Seed: " << std::default_random_engine::default_seed;
+
+ std::vector<std::pair<std::string, int>> exp_key_values;
+ // Randomly generate 1024 terms.
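+ // Appending the loop index keeps every generated term unique.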
+ for (int i = 0; i < 1024; ++i) {
+ std::string term = RandomString("abcdefg", 5, &random) + std::to_string(i);
+ ASSERT_TRUE(trie.Insert(term.c_str(), &i));
+ exp_key_values.push_back(std::make_pair(term, i));
+ }
+ // Lexicographically sort the expected keys.
+ std::sort(exp_key_values.begin(), exp_key_values.end());
+
+ // Check that the iterator works.
+ IcingDynamicTrie::Iterator term_iter(trie, /*prefix=*/"");
+ std::vector<std::pair<std::string, int>> key_values =
+ RetrieveKeyValuePairs(term_iter);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Check that Reset works.
+ term_iter.Reset();
+ key_values = RetrieveKeyValuePairs(term_iter);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ std::reverse(exp_key_values.begin(), exp_key_values.end());
+ // Check that the reverse iterator works.
+ IcingDynamicTrie::Iterator term_iter_reverse(trie, /*prefix=*/"",
+ /*reverse=*/true);
+ key_values = RetrieveKeyValuePairs(term_iter_reverse);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Check that Reset works.
+ term_iter_reverse.Reset();
+ key_values = RetrieveKeyValuePairs(term_iter_reverse);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+}
+
TEST_F(IcingDynamicTrieTest, Persistence) {
// Test persistence on the English dictionary.
IcingFilesystem filesystem;
@@ -1233,7 +1349,7 @@ TEST_F(IcingDynamicTrieTest, BitmapsClosedWhenInitFails) {
ASSERT_EQ(0, trie.property_bitmaps_.size());
}
-TEST_F(IcingDynamicTrieTest, IsBranchingTerm) {
+TEST_F(IcingDynamicTrieTest, IsBranchingTermShouldWorkForExistingTerms) {
IcingFilesystem filesystem;
IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
&filesystem);
@@ -1319,34 +1435,52 @@ TEST_F(IcingDynamicTrieTest, IsBranchingTermShouldWorkForNonExistingTerms) {
EXPECT_FALSE(trie.IsBranchingTerm(""));
EXPECT_FALSE(trie.IsBranchingTerm("a"));
EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc"));
ASSERT_TRUE(trie.Insert("aa", &value));
EXPECT_FALSE(trie.IsBranchingTerm(""));
EXPECT_FALSE(trie.IsBranchingTerm("a"));
-
- ASSERT_TRUE(trie.Insert("", &value));
- EXPECT_FALSE(trie.IsBranchingTerm("a"));
-
- ASSERT_TRUE(trie.Insert("ab", &value));
- EXPECT_FALSE(trie.IsBranchingTerm("a"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc"));
ASSERT_TRUE(trie.Insert("ac", &value));
- EXPECT_FALSE(trie.IsBranchingTerm("a"));
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ // "a" does not exist in the trie, but now it branches to "aa" and "ac".
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc"));
ASSERT_TRUE(trie.Insert("ad", &value));
- EXPECT_FALSE(trie.IsBranchingTerm("a"));
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc"));
ASSERT_TRUE(trie.Insert("abcd", &value));
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
EXPECT_FALSE(trie.IsBranchingTerm("abc"));
- ASSERT_TRUE(trie.Insert("abce", &value));
+ ASSERT_TRUE(trie.Insert("abd", &value));
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ // "ab" does not exist in the trie, but now it branches to "abcd" and "abd".
+ EXPECT_TRUE(trie.IsBranchingTerm("ab"));
EXPECT_FALSE(trie.IsBranchingTerm("abc"));
- ASSERT_TRUE(trie.Insert("abcf", &value));
- EXPECT_FALSE(trie.IsBranchingTerm("abc"));
+ ASSERT_TRUE(trie.Insert("abce", &value));
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ EXPECT_TRUE(trie.IsBranchingTerm("ab"));
+ // "abc" does not exist in the trie, but now it branches to "abcd" and "abce".
+ EXPECT_TRUE(trie.IsBranchingTerm("abc"));
ASSERT_TRUE(trie.Insert("abc_suffix", &value));
- EXPECT_FALSE(trie.IsBranchingTerm("abc"));
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ EXPECT_TRUE(trie.IsBranchingTerm("ab"));
+ EXPECT_TRUE(trie.IsBranchingTerm("abc"));
EXPECT_FALSE(trie.IsBranchingTerm("abc_s"));
EXPECT_FALSE(trie.IsBranchingTerm("abc_su"));
EXPECT_FALSE(trie.IsBranchingTerm("abc_suffi"));
diff --git a/icing/legacy/index/icing-filesystem.cc b/icing/legacy/index/icing-filesystem.cc
index 4f5e571..fbf5a27 100644
--- a/icing/legacy/index/icing-filesystem.cc
+++ b/icing/legacy/index/icing-filesystem.cc
@@ -65,18 +65,15 @@ void LogOpenFileDescriptors() {
constexpr int kMaxFileDescriptorsToStat = 4096;
struct rlimit rlim = {0, 0};
if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "getrlimit() failed (errno=%d)", errno);
+ ICING_LOG(ERROR) << "getrlimit() failed (errno=" << errno << ")";
return;
}
int fd_lim = rlim.rlim_cur;
if (fd_lim > kMaxFileDescriptorsToStat) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Maximum number of file descriptors (%d) too large.", fd_lim);
+ ICING_LOG(ERROR) << "Maximum number of file descriptors (" << fd_lim << ") too large.";
fd_lim = kMaxFileDescriptorsToStat;
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Listing up to %d file descriptors.", fd_lim);
+ ICING_LOG(ERROR) << "Listing up to " << fd_lim << " file descriptors.";
// Verify that /proc/self/fd is a directory. If not, procfs is not mounted or
// inaccessible for some other reason. In that case, there's no point trying
@@ -98,15 +95,12 @@ void LogOpenFileDescriptors() {
if (len >= 0) {
// Zero-terminate the buffer, because readlink() won't.
target[len < target_size ? len : target_size - 1] = '\0';
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> \"%s\"", fd,
- target);
+ ICING_LOG(ERROR) << "fd " << fd << " -> \"" << target << "\"";
} else if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> ? (errno=%d)",
- fd, errno);
+ ICING_LOG(ERROR) << "fd " << fd << " -> ? (errno=" << errno << ")";
}
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "File descriptor list complete.");
+ ICING_LOG(ERROR) << "File descriptor list complete.";
}
// Logs an error formatted as: desc1 + file_name + desc2 + strerror(errnum).
@@ -115,8 +109,7 @@ void LogOpenFileDescriptors() {
// file descriptors (see LogOpenFileDescriptors() above).
void LogOpenError(const char *desc1, const char *file_name, const char *desc2,
int errnum) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "%s%s%s%s", desc1, file_name, desc2, strerror(errnum));
+ ICING_LOG(ERROR) << desc1 << file_name << desc2 << strerror(errnum);
if (errnum == EMFILE) {
LogOpenFileDescriptors();
}
@@ -157,8 +150,7 @@ bool ListDirectoryInternal(const char *dir_name,
}
}
if (closedir(dir) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Error closing %s: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Error closing " << dir_name << ": " << strerror(errno);
}
return true;
}
@@ -181,12 +173,11 @@ void IcingScopedFd::reset(int fd) {
const uint64_t IcingFilesystem::kBadFileSize;
bool IcingFilesystem::DeleteFile(const char *file_name) const {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf("Deleting file %s", file_name);
+ ICING_VLOG(1) << "Deleting file " << file_name;
int ret = unlink(file_name);
bool success = (ret == 0) || (errno == ENOENT);
if (!success) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Deleting file %s failed: %s", file_name, strerror(errno));
+ ICING_LOG(ERROR) << "Deleting file " << file_name << " failed: " << strerror(errno);
}
return success;
}
@@ -195,8 +186,7 @@ bool IcingFilesystem::DeleteDirectory(const char *dir_name) const {
int ret = rmdir(dir_name);
bool success = (ret == 0) || (errno == ENOENT);
if (!success) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Deleting directory %s failed: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Deleting directory " << dir_name << " failed: " << strerror(errno);
}
return success;
}
@@ -208,8 +198,7 @@ bool IcingFilesystem::DeleteDirectoryRecursively(const char *dir_name) const {
if (errno == ENOENT) {
return true; // If directory didn't exist, this was successful.
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Stat %s failed: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Stat " << dir_name << " failed: " << strerror(errno);
return false;
}
vector<std::string> entries;
@@ -222,8 +211,7 @@ bool IcingFilesystem::DeleteDirectoryRecursively(const char *dir_name) const {
++i) {
std::string filename = std::string(dir_name) + '/' + *i;
if (stat(filename.c_str(), &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Stat %s failed: %s", filename.c_str(), strerror(errno));
+ ICING_LOG(ERROR) << "Stat " << filename << " failed: " << strerror(errno);
success = false;
} else if (S_ISDIR(st.st_mode)) {
success = DeleteDirectoryRecursively(filename.c_str()) && success;
@@ -246,8 +234,7 @@ bool IcingFilesystem::FileExists(const char *file_name) const {
exists = S_ISREG(st.st_mode) != 0;
} else {
if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to stat file %s: %s", file_name, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file " << file_name << ": " << strerror(errno);
}
exists = false;
}
@@ -261,8 +248,7 @@ bool IcingFilesystem::DirectoryExists(const char *dir_name) const {
exists = S_ISDIR(st.st_mode) != 0;
} else {
if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to stat directory %s: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat directory " << dir_name << ": " << strerror(errno);
}
exists = false;
}
@@ -317,8 +303,7 @@ bool IcingFilesystem::GetMatchingFiles(const char *glob,
int basename_idx = GetBasenameIndex(glob);
if (basename_idx == 0) {
// We need a directory.
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Expected directory, no matching files for: %s", glob);
+ ICING_VLOG(1) << "Expected directory, no matching files for: " << glob;
return true;
}
const char *basename_glob = glob + basename_idx;
@@ -374,8 +359,7 @@ uint64_t IcingFilesystem::GetFileSize(int fd) const {
struct stat st;
uint64_t size = kBadFileSize;
if (fstat(fd, &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file: " << strerror(errno);
} else {
size = st.st_size;
}
@@ -386,8 +370,7 @@ uint64_t IcingFilesystem::GetFileSize(const char *filename) const {
struct stat st;
uint64_t size = kBadFileSize;
if (stat(filename, &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to stat file %s: %s", filename, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file " << filename << ": " << strerror(errno);
} else {
size = st.st_size;
}
@@ -399,8 +382,7 @@ bool IcingFilesystem::Truncate(int fd, uint64_t new_size) const {
if (ret == 0) {
lseek(fd, new_size, SEEK_SET);
} else {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to truncate file: %s", strerror(errno));
+ ICING_LOG(ERROR) << "Unable to truncate file: " << strerror(errno);
}
return (ret == 0);
}
@@ -418,8 +400,7 @@ bool IcingFilesystem::Truncate(const char *filename, uint64_t new_size) const {
bool IcingFilesystem::Grow(int fd, uint64_t new_size) const {
int ret = ftruncate(fd, new_size);
if (ret != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to grow file: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to grow file: " << strerror(errno);
}
return (ret == 0);
}
@@ -431,8 +412,7 @@ bool IcingFilesystem::Write(int fd, const void *data, size_t data_size) const {
size_t chunk_size = std::min<size_t>(write_len, 64u * 1024);
ssize_t wrote = write(fd, data, chunk_size);
if (wrote < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad write: " << strerror(errno);
return false;
}
data = static_cast<const uint8_t *>(data) + wrote;
@@ -449,8 +429,7 @@ bool IcingFilesystem::PWrite(int fd, off_t offset, const void *data,
size_t chunk_size = std::min<size_t>(write_len, 64u * 1024);
ssize_t wrote = pwrite(fd, data, chunk_size, offset);
if (wrote < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad write: " << strerror(errno);
return false;
}
data = static_cast<const uint8_t *>(data) + wrote;
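Both Write and PWrite above cap each syscall at 64 KiB and advance the data pointer by however many bytes the kernel actually accepted. Below is a minimal standalone sketch of the same partial-write-tolerant loop; the names (WriteAllAt, kChunkSize) are illustrative, not Icing's.

// Sketch: chunked, partial-write-tolerant pwrite loop (POSIX). Names
// (WriteAllAt, kChunkSize) are assumptions for illustration only.
#include <sys/types.h>
#include <unistd.h>

#include <algorithm>
#include <cerrno>
#include <cstddef>
#include <cstdint>

bool WriteAllAt(int fd, off_t offset, const void *data, size_t len) {
  const size_t kChunkSize = 64u * 1024;  // matches the 64 KiB cap above
  const uint8_t *p = static_cast<const uint8_t *>(data);
  while (len > 0) {
    size_t chunk = std::min(len, kChunkSize);
    ssize_t wrote = pwrite(fd, p, chunk, offset);
    if (wrote < 0) {
      if (errno == EINTR) continue;  // retry interrupted writes
      return false;                  // caller logs strerror(errno)
    }
    p += wrote;
    offset += wrote;
    len -= static_cast<size_t>(wrote);
  }
  return true;
}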
@@ -468,8 +447,7 @@ bool IcingFilesystem::DataSync(int fd) const {
#endif
if (result < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to sync data: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to sync data: " << strerror(errno);
return false;
}
return true;
@@ -478,9 +456,7 @@ bool IcingFilesystem::DataSync(int fd) const {
bool IcingFilesystem::RenameFile(const char *old_name,
const char *new_name) const {
if (rename(old_name, new_name) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to rename file %s to %s: %s", old_name, new_name,
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to rename file " << old_name << " to " << new_name << ": " << strerror(errno);
return false;
}
return true;
@@ -518,8 +494,7 @@ bool IcingFilesystem::CreateDirectory(const char *dir_name) const {
if (mkdir(dir_name, S_IRUSR | S_IWUSR | S_IXUSR) == 0) {
success = true;
} else {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Creating directory %s failed: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Creating directory " << dir_name << " failed: " << strerror(errno);
}
}
return success;
@@ -561,8 +536,7 @@ end:
if (src_fd > 0) close(src_fd);
if (dst_fd > 0) close(dst_fd);
if (!success) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Couldn't copy file %s to %s", src, dst);
+ ICING_LOG(ERROR) << "Couldn't copy file " << src << " to " << dst;
}
return success;
}
@@ -583,8 +557,7 @@ bool IcingFilesystem::ComputeChecksum(int fd, uint32_t *checksum,
uint64_t IcingFilesystem::GetDiskUsage(int fd) const {
struct stat st;
if (fstat(fd, &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file: " << strerror(errno);
return kBadFileSize;
}
return st.st_blocks * kStatBlockSize;
@@ -593,8 +566,7 @@ uint64_t IcingFilesystem::GetDiskUsage(int fd) const {
uint64_t IcingFilesystem::GetFileDiskUsage(const char *path) const {
struct stat st;
if (stat(path, &st) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s",
- path, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat " << path << ": " << strerror(errno);
return kBadFileSize;
}
return st.st_blocks * kStatBlockSize;
@@ -603,8 +575,7 @@ uint64_t IcingFilesystem::GetFileDiskUsage(const char *path) const {
uint64_t IcingFilesystem::GetDiskUsage(const char *path) const {
struct stat st;
if (stat(path, &st) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s",
- path, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat " << path << ": " << strerror(errno);
return kBadFileSize;
}
uint64_t result = st.st_blocks * kStatBlockSize;
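Every hunk in this file applies the same conversion: an IcingStringUtil::StringPrintf format call is replaced by streaming the operands directly into ICING_LOG. A minimal sketch of how such a streaming log statement can be built, assuming the usual temporary-object design; this is the general idiom, not Icing's actual macro definition.

// Sketch of a streaming-style log statement: a temporary collects the
// message via operator<< and emits it when destroyed at the end of the
// full expression. Illustrative only; not Icing's ICING_LOG.
#include <iostream>
#include <sstream>

class LogMessage {
 public:
  explicit LogMessage(const char *severity) { stream_ << severity << ": "; }
  ~LogMessage() { std::cerr << stream_.str() << std::endl; }  // emit on scope exit
  template <typename T>
  LogMessage &operator<<(const T &value) {
    stream_ << value;
    return *this;
  }

 private:
  std::ostringstream stream_;
};

#define SKETCH_LOG(severity) LogMessage(#severity)

int main() {
  const char *path = "/tmp/foo";
  // Reads like the converted call sites above: no format string needed.
  SKETCH_LOG(ERROR) << "Unable to stat file " << path << ": " << "No such file";
  return 0;
}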
diff --git a/icing/legacy/index/icing-flash-bitmap.cc b/icing/legacy/index/icing-flash-bitmap.cc
index 56dec00..774308f 100644
--- a/icing/legacy/index/icing-flash-bitmap.cc
+++ b/icing/legacy/index/icing-flash-bitmap.cc
@@ -73,8 +73,7 @@ class IcingFlashBitmap::Accessor {
bool IcingFlashBitmap::Verify() const {
if (!is_initialized()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Can't verify unopened flash bitmap %s", filename_.c_str());
+ ICING_LOG(ERROR) << "Can't verify unopened flash bitmap " << filename_;
return false;
}
if (mmapper_ == nullptr) {
@@ -83,26 +82,21 @@ bool IcingFlashBitmap::Verify() const {
}
Accessor accessor(mmapper_.get());
if (accessor.header()->magic != kMagic) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flash bitmap %s has incorrect magic header", filename_.c_str());
+ ICING_LOG(ERROR) << "Flash bitmap " << filename_ << " has incorrect magic header";
return false;
}
if (accessor.header()->version != kCurVersion) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flash bitmap %s has incorrect version", filename_.c_str());
+ ICING_LOG(ERROR) << "Flash bitmap " << filename_ << " has incorrect version";
return false;
}
if (accessor.header()->dirty) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flash bitmap %s is dirty", filename_.c_str());
+ ICING_LOG(ERROR) << "Flash bitmap " << filename_ << " is dirty";
return false;
}
uint32_t crc =
IcingStringUtil::UpdateCrc32(0, accessor.data(), accessor.data_size());
if (accessor.header()->crc != crc) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flash bitmap %s has incorrect CRC32 %u %u", filename_.c_str(),
- accessor.header()->crc, crc);
+ ICING_LOG(ERROR) << "Flash bitmap " << filename_ << " has incorrect CRC32 " << accessor.header()->crc << " " << crc;
return false;
}
return true;
@@ -265,17 +259,14 @@ uint32_t IcingFlashBitmap::UpdateCrc() const {
bool IcingFlashBitmap::Grow(size_t new_file_size) {
IcingScopedFd fd(filesystem_->OpenForWrite(filename_.c_str()));
if (!filesystem_->Grow(fd.get(), new_file_size)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Grow %s to new size %zu failed", filename_.c_str(), new_file_size);
+ ICING_LOG(ERROR) << "Grow " << filename_ << " to new size " << new_file_size << " failed";
return false;
}
if (!mmapper_->Remap(fd.get(), 0, new_file_size)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Remap of %s after grow failed", filename_.c_str());
+ ICING_LOG(ERROR) << "Remap of " << filename_ << " after grow failed";
return false;
}
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Grew %s new size %zu", filename_.c_str(), new_file_size);
+ ICING_VLOG(1) << "Grew " << filename_ << " new size " << new_file_size;
Accessor accessor(mmapper_.get());
accessor.header()->dirty = true;
return true;
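Verify() above gates on four header fields in order: magic, version, dirty flag, then a CRC32 over the data region. A self-contained sketch of that header-validation pattern follows; the constants are placeholders and Checksum() is a simple stand-in for the real IcingStringUtil::UpdateCrc32.

// Sketch: validate a file header by magic, version, dirty flag, and
// checksum. kMagic/kCurVersion are placeholder values and Checksum()
// stands in for CRC32; none of these are Icing's real constants.
#include <cstddef>
#include <cstdint>

struct BitmapHeader {
  uint32_t magic;
  uint32_t version;
  uint32_t dirty;
  uint32_t crc;
};

constexpr uint32_t kMagic = 0xDEADBEEF;  // placeholder
constexpr uint32_t kCurVersion = 1;      // placeholder

uint32_t Checksum(const uint8_t *data, size_t size) {
  uint32_t sum = 0;
  for (size_t i = 0; i < size; ++i) sum = sum * 31 + data[i];
  return sum;
}

bool VerifyHeader(const BitmapHeader &h, const uint8_t *data, size_t size) {
  if (h.magic != kMagic) return false;         // wrong file type
  if (h.version != kCurVersion) return false;  // format mismatch
  if (h.dirty) return false;                   // crashed mid-update
  return h.crc == Checksum(data, size);        // payload integrity
}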
diff --git a/icing/legacy/index/icing-mmapper.cc b/icing/legacy/index/icing-mmapper.cc
index 7946c82..d086da2 100644
--- a/icing/legacy/index/icing-mmapper.cc
+++ b/icing/legacy/index/icing-mmapper.cc
@@ -67,8 +67,7 @@ void IcingMMapper::DoMapping(int fd, uint64_t location, size_t size) {
address_ = reinterpret_cast<uint8_t *>(mmap_result_) + alignment_adjustment;
} else {
const char *errstr = strerror(errno);
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Could not mmap file for reading: %s", errstr);
+ ICING_LOG(ERROR) << "Could not mmap file for reading: " << errstr;
mmap_result_ = nullptr;
}
}
@@ -95,8 +94,7 @@ IcingMMapper::~IcingMMapper() { Unmap(); }
bool IcingMMapper::Sync() {
if (is_valid() && !read_only_) {
if (msync(mmap_result_, mmap_len_, MS_SYNC) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("msync failed: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "msync failed: " << strerror(errno);
return false;
}
}
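The mmapper call sites report strerror(errno) whenever mmap or msync fails. A minimal POSIX sketch of the map-modify-sync sequence those calls belong to; the file name and sizes are illustrative, and this is not IcingMMapper itself.

// Sketch: map a file writable, mutate it, then msync(MS_SYNC) so the
// change reaches disk before unmapping. Error paths report errno the
// same way the hunks above do.
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#include <cerrno>
#include <cstddef>
#include <cstring>
#include <iostream>

int main() {
  int fd = open("/tmp/mmap-sketch.bin", O_RDWR | O_CREAT, 0600);
  if (fd < 0) {
    std::cerr << "open: " << std::strerror(errno) << "\n";
    return 1;
  }
  const size_t kLen = 4096;
  if (ftruncate(fd, kLen) != 0) { close(fd); return 1; }

  void *addr = mmap(nullptr, kLen, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (addr == MAP_FAILED) {
    std::cerr << "Could not mmap file: " << std::strerror(errno) << "\n";
    close(fd);
    return 1;
  }
  static_cast<char *>(addr)[0] = 'x';  // mutate the mapping

  if (msync(addr, kLen, MS_SYNC) != 0) {  // force write-back, as in Sync()
    std::cerr << "msync failed: " << std::strerror(errno) << "\n";
  }
  munmap(addr, kLen);
  close(fd);
  return 0;
}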
diff --git a/icing/legacy/index/icing-storage-file.cc b/icing/legacy/index/icing-storage-file.cc
index 35a4418..bbc6b81 100644
--- a/icing/legacy/index/icing-storage-file.cc
+++ b/icing/legacy/index/icing-storage-file.cc
@@ -69,22 +69,18 @@ bool IcingStorageFile::Sync() {
IcingTimer timer;
if (!PreSync()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Pre-sync %s failed",
- filename_.c_str());
+ ICING_LOG(ERROR) << "Pre-sync " << filename_ << " failed";
return false;
}
if (!filesystem_->DataSync(fd_.get())) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Sync %s failed",
- filename_.c_str());
+ ICING_LOG(ERROR) << "Sync " << filename_ << " failed";
return false;
}
if (!PostSync()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Post-sync %s failed",
- filename_.c_str());
+ ICING_LOG(ERROR) << "Post-sync " << filename_ << " failed";
return false;
}
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Syncing %s took %.3fms", filename_.c_str(), timer.Elapsed() * 1000.);
+ ICING_VLOG(1) << "Syncing " << filename_ << " took " << timer.Elapsed() * 1000 << "ms";
return true;
}
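Sync() above runs pre-sync, data-sync, and post-sync phases and VLOGs the elapsed milliseconds. A short sketch of that timing idiom with std::chrono; Timer and Elapsed mirror IcingTimer's shape but are assumptions, not the real class.

// Sketch: time a multi-phase operation and report milliseconds, as the
// VLOG above does. Timer/Elapsed are illustrative names.
#include <chrono>
#include <iostream>

class Timer {
 public:
  Timer() : start_(std::chrono::steady_clock::now()) {}
  double Elapsed() const {  // seconds, like IcingTimer::Elapsed()
    std::chrono::duration<double> d = std::chrono::steady_clock::now() - start_;
    return d.count();
  }

 private:
  std::chrono::steady_clock::time_point start_;
};

bool Sync() {
  Timer timer;
  // ... PreSync(), DataSync(fd), PostSync() would run here ...
  std::cout << "Syncing took " << timer.Elapsed() * 1000 << "ms\n";
  return true;
}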
diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc
index a725213..d1cce87 100644
--- a/icing/query/query-processor_test.cc
+++ b/icing/query/query-processor_test.cc
@@ -17,7 +17,6 @@
#include <memory>
#include <string>
-#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
@@ -27,6 +26,7 @@
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/jni/jni-cache.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/portable/platform.h"
#include "icing/proto/schema.pb.h"
diff --git a/icing/result/result-retriever-v2.cc b/icing/result/result-retriever-v2.cc
index 195f641..92ab048 100644
--- a/icing/result/result-retriever-v2.cc
+++ b/icing/result/result-retriever-v2.cc
@@ -110,6 +110,7 @@ std::pair<PageResult, bool> ResultRetrieverV2::RetrieveNextPage(
// Retrieve info
std::vector<SearchResultProto::ResultProto> results;
+ int32_t num_total_bytes = 0;
while (results.size() < result_state.num_per_page() &&
!result_state.scored_document_hits_ranker->empty()) {
ScoredDocumentHit next_best_document_hit =
@@ -154,7 +155,17 @@ std::pair<PageResult, bool> ResultRetrieverV2::RetrieveNextPage(
// Add the document, itself.
*result.mutable_document() = std::move(document);
result.set_score(next_best_document_hit.score());
+ size_t result_bytes = result.ByteSizeLong();
results.push_back(std::move(result));
+
+ // Check if num_total_bytes + result_bytes reaches or exceeds
+ // num_total_bytes_per_page_threshold. Use subtraction to avoid integer
+ // overflow.
+ if (result_bytes >=
+ result_state.num_total_bytes_per_page_threshold() - num_total_bytes) {
+ break;
+ }
+ num_total_bytes += result_bytes;
}
// Update numbers in ResultState
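The new page-size check deliberately tests "result_bytes >= threshold - num_total_bytes" rather than "num_total_bytes + result_bytes >= threshold": given the loop invariant that num_total_bytes stays below the threshold, the subtraction can never wrap, while the addition can. A standalone sketch of the overflow-safe form; all names below are illustrative, not Icing's.

// Sketch: why the hunk above subtracts instead of adds. With
// num_total_bytes < threshold, "threshold - num_total_bytes" stays in
// [1, threshold], so no wraparound; adding first could overflow when
// result_bytes is near the top of its range.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>

bool ReachesThreshold(int32_t num_total_bytes, int32_t threshold,
                      size_t result_bytes) {
  return result_bytes >= static_cast<size_t>(threshold - num_total_bytes);
}

int main() {
  int32_t threshold = 100, num_total_bytes = 60;
  size_t huge = std::numeric_limits<size_t>::max() - 10;
  // Adding first would wrap: num_total_bytes + huge overflows size_t.
  std::cout << std::boolalpha
            << ReachesThreshold(num_total_bytes, threshold, huge) << "\n"  // true
            << ReachesThreshold(num_total_bytes, threshold, 30) << "\n"    // false
            << ReachesThreshold(num_total_bytes, threshold, 40) << "\n";   // true
  return 0;
}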
diff --git a/icing/result/result-retriever-v2_group-result-limiter-test.cc b/icing/result/result-retriever-v2_group-result-limiter_test.cc
index e4bfe09..e0a6c79 100644
--- a/icing/result/result-retriever-v2_group-result-limiter-test.cc
+++ b/icing/result/result-retriever-v2_group-result-limiter_test.cc
@@ -185,6 +185,142 @@ TEST_F(ResultRetrieverV2GroupResultLimiterTest,
}
TEST_F(ResultRetrieverV2GroupResultLimiterTest,
+ ResultGroupingHasEmptyFirstPage) {
+ // Creates 2 documents and ensures the relationship in terms of document
+ // score is: document1 < document2
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Document")
+ .SetScore(1)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Document")
+ .SetScore(2)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ ScoredDocumentHit(document_id1, kSectionIdMaskNone, document1.score()),
+ ScoredDocumentHit(document_id2, kSectionIdMaskNone, document2.score())};
+
+ // Create a ResultSpec that limits "namespace" to 0 results.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ result_grouping->set_max_results(0);
+ result_grouping->add_namespaces("namespace");
+
+ // Creates a ResultState with 2 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*query_terms=*/{}, CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true), result_spec,
+ *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // First page: empty page
+ auto [page_result, has_more_results] =
+ result_retriever->RetrieveNextPage(result_state);
+ ASSERT_THAT(page_result.results, IsEmpty());
+ EXPECT_FALSE(has_more_results);
+}
+
+TEST_F(ResultRetrieverV2GroupResultLimiterTest,
+ ResultGroupingHasEmptyLastPage) {
+ // Creates 4 documents and ensures the relationship in terms of document
+ // score is: document1 < document2 < document3 < document4
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Document")
+ .SetScore(1)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Document")
+ .SetScore(2)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2));
+
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Document")
+ .SetScore(3)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store_->Put(document3));
+
+ DocumentProto document4 = DocumentBuilder()
+ .SetKey("namespace", "uri/4")
+ .SetSchema("Document")
+ .SetScore(4)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ document_store_->Put(document4));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ ScoredDocumentHit(document_id1, kSectionIdMaskNone, document1.score()),
+ ScoredDocumentHit(document_id2, kSectionIdMaskNone, document2.score()),
+ ScoredDocumentHit(document_id3, kSectionIdMaskNone, document3.score()),
+ ScoredDocumentHit(document_id4, kSectionIdMaskNone, document4.score())};
+
+ // Create a ResultSpec that limits "namespace" to 2 results.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ result_grouping->set_max_results(2);
+ result_grouping->add_namespaces("namespace");
+
+ // Creates a ResultState with 4 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*query_terms=*/{}, CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true), result_spec,
+ *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // First page: document4 and document3 should be returned.
+ auto [page_result1, has_more_results1] =
+ result_retriever->RetrieveNextPage(result_state);
+ ASSERT_THAT(page_result1.results, SizeIs(2));
+ EXPECT_THAT(page_result1.results.at(0).document(), EqualsProto(document4));
+ EXPECT_THAT(page_result1.results.at(1).document(), EqualsProto(document3));
+ EXPECT_TRUE(has_more_results1);
+
+ // Second page: although there are still valid document hits in the result
+ // state, all of them will be filtered out by the group result limiter, so we
+ // should get an empty page.
+ auto [page_result2, has_more_results2] =
+ result_retriever->RetrieveNextPage(result_state);
+ EXPECT_THAT(page_result2.results, SizeIs(0));
+ EXPECT_FALSE(has_more_results2);
+}
+
+TEST_F(ResultRetrieverV2GroupResultLimiterTest,
ResultGroupingDoesNotLimitOtherNamespaceResults) {
// Creates 4 documents and ensures the relationship in terms of document
// score is: document1 < document2 < document3 < document4
diff --git a/icing/result/result-retriever-v2_projection-test.cc b/icing/result/result-retriever-v2_projection_test.cc
index bdd1715..bdd1715 100644
--- a/icing/result/result-retriever-v2_projection-test.cc
+++ b/icing/result/result-retriever-v2_projection_test.cc
diff --git a/icing/result/result-retriever-v2_snippet-test.cc b/icing/result/result-retriever-v2_snippet_test.cc
index afb31cf..afb31cf 100644
--- a/icing/result/result-retriever-v2_snippet-test.cc
+++ b/icing/result/result-retriever-v2_snippet_test.cc
diff --git a/icing/result/result-retriever-v2_test.cc b/icing/result/result-retriever-v2_test.cc
index f23a88a..0998754 100644
--- a/icing/result/result-retriever-v2_test.cc
+++ b/icing/result/result-retriever-v2_test.cc
@@ -56,6 +56,7 @@ using ::icing::lib::portable_equals_proto::EqualsProto;
using ::testing::DoDefault;
using ::testing::ElementsAre;
using ::testing::Eq;
+using ::testing::Gt;
using ::testing::IsEmpty;
using ::testing::Pointee;
using ::testing::Return;
@@ -635,6 +636,179 @@ TEST_F(ResultRetrieverV2Test, ShouldUpdateNumTotalHits) {
EXPECT_THAT(num_total_hits_, Eq(0));
}
+TEST_F(ResultRetrieverV2Test, ShouldLimitNumTotalBytesPerPage) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(CreateDocument(/*id=*/2)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/5},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ SearchResultProto::ResultProto result1;
+ *result1.mutable_document() = CreateDocument(/*id=*/1);
+ result1.set_score(5);
+ SearchResultProto::ResultProto result2;
+ *result2.mutable_document() = CreateDocument(/*id=*/2);
+ result2.set_score(0);
+
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ result_spec.set_num_total_bytes_per_page_threshold(result1.ByteSizeLong());
+ ResultStateV2 result_state(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits),
+ /*is_descending=*/true),
+ /*query_terms=*/{}, CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true), result_spec, *doc_store);
+
+ // First page. Only result1 should be returned, since its byte size meets
+ // num_total_bytes_per_page_threshold and ResultRetriever should terminate
+ // early even though # of results is still below num_per_page.
+ auto [page_result1, has_more_results1] =
+ result_retriever->RetrieveNextPage(result_state);
+ EXPECT_THAT(page_result1.results, ElementsAre(EqualsProto(result1)));
+ // Has more results.
+ EXPECT_TRUE(has_more_results1);
+
+ // Second page, result2.
+ auto [page_result2, has_more_results2] =
+ result_retriever->RetrieveNextPage(result_state);
+ EXPECT_THAT(page_result2.results, ElementsAre(EqualsProto(result2)));
+ // No more results.
+ EXPECT_FALSE(has_more_results2);
+}
+
+TEST_F(ResultRetrieverV2Test,
+ ShouldReturnSingleLargeResultAboveNumTotalBytesPerPageThreshold) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(CreateDocument(/*id=*/2)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/5},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ SearchResultProto::ResultProto result1;
+ *result1.mutable_document() = CreateDocument(/*id=*/1);
+ result1.set_score(5);
+ SearchResultProto::ResultProto result2;
+ *result2.mutable_document() = CreateDocument(/*id=*/2);
+ result2.set_score(0);
+
+ int threshold = 1;
+ ASSERT_THAT(result1.ByteSizeLong(), Gt(threshold));
+
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ result_spec.set_num_total_bytes_per_page_threshold(threshold);
+ ResultStateV2 result_state(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits),
+ /*is_descending=*/true),
+ /*query_terms=*/{}, CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true), result_spec, *doc_store);
+
+ // First page. result1 should be returned by itself even though its byte size
+ // exceeds num_total_bytes_per_page_threshold.
+ auto [page_result1, has_more_results1] =
+ result_retriever->RetrieveNextPage(result_state);
+ EXPECT_THAT(page_result1.results, ElementsAre(EqualsProto(result1)));
+ // Has more results.
+ EXPECT_TRUE(has_more_results1);
+
+ // Second page, result2.
+ auto [page_result2, has_more_results2] =
+ result_retriever->RetrieveNextPage(result_state);
+ EXPECT_THAT(page_result2.results, ElementsAre(EqualsProto(result2)));
+ // No more results.
+ EXPECT_FALSE(has_more_results2);
+}
+
+TEST_F(ResultRetrieverV2Test,
+ ShouldRetrieveNextResultWhenBelowNumTotalBytesPerPageThreshold) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(CreateDocument(/*id=*/2)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/5},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ SearchResultProto::ResultProto result1;
+ *result1.mutable_document() = CreateDocument(/*id=*/1);
+ result1.set_score(5);
+ SearchResultProto::ResultProto result2;
+ *result2.mutable_document() = CreateDocument(/*id=*/2);
+ result2.set_score(0);
+
+ int threshold = result1.ByteSizeLong() + 1;
+ ASSERT_THAT(result1.ByteSizeLong() + result2.ByteSizeLong(), Gt(threshold));
+
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ result_spec.set_num_total_bytes_per_page_threshold(threshold);
+ ResultStateV2 result_state(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits),
+ /*is_descending=*/true),
+ /*query_terms=*/{}, CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true), result_spec, *doc_store);
+
+ // After retrieving result1, total bytes are still below the threshold and #
+ // of results is still below num_per_page, so ResultRetriever should continue
+ // the retrieval process and thus include result2 in this page, even though
+ // the total bytes of result1 + result2 end up exceeding the threshold.
+ auto [page_result, has_more_results] =
+ result_retriever->RetrieveNextPage(result_state);
+ EXPECT_THAT(page_result.results,
+ ElementsAre(EqualsProto(result1), EqualsProto(result2)));
+ // No more results.
+ EXPECT_FALSE(has_more_results);
+}
+
} // namespace
} // namespace lib
diff --git a/icing/result/result-state-manager.cc b/icing/result/result-state-manager.cc
index 1057f9b..2783fe2 100644
--- a/icing/result/result-state-manager.cc
+++ b/icing/result/result-state-manager.cc
@@ -14,7 +14,16 @@
#include "icing/result/result-state-manager.h"
+#include <memory>
+#include <queue>
+#include <utility>
+
#include "icing/proto/search.pb.h"
+#include "icing/query/query-terms.h"
+#include "icing/result/page-result.h"
+#include "icing/result/result-retriever-v2.h"
+#include "icing/result/result-state-v2.h"
+#include "icing/scoring/scored-document-hits-ranker.h"
#include "icing/util/clock.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
@@ -31,50 +40,66 @@ ResultStateManager::ResultStateManager(int max_total_hits,
random_generator_(GetSteadyTimeNanoseconds()),
clock_(*clock) {}
-libtextclassifier3::StatusOr<PageResultState>
-ResultStateManager::RankAndPaginate(ResultState result_state) {
- if (!result_state.HasMoreResults()) {
- return absl_ports::InvalidArgumentError("ResultState has no results");
+libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
+ResultStateManager::CacheAndRetrieveFirstPage(
+ std::unique_ptr<ScoredDocumentHitsRanker> ranker,
+ SectionRestrictQueryTermsMap query_terms,
+ const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
+ const ResultSpecProto& result_spec, const DocumentStore& document_store,
+ const ResultRetrieverV2& result_retriever) {
+ if (ranker == nullptr) {
+ return absl_ports::InvalidArgumentError("Should not provide null ranker");
}
- // Gets the number before calling GetNextPage() because num_returned() may
- // change after returning more results.
- int num_previously_returned = result_state.num_returned();
- int num_per_page = result_state.num_per_page();
-
- std::vector<ScoredDocumentHit> page_result_document_hits =
- result_state.GetNextPage(document_store_);
-
- SnippetContext snippet_context_copy = result_state.snippet_context();
-
- std::unordered_map<std::string, ProjectionTree> projection_tree_map_copy =
- result_state.projection_tree_map();
- if (!result_state.HasMoreResults()) {
+ // Create a shared pointer to the ResultState.
+ // ResultState should be created by ResultStateManager only.
+ std::shared_ptr<ResultStateV2> result_state = std::make_shared<ResultStateV2>(
+ std::move(ranker), std::move(query_terms), search_spec, scoring_spec,
+ result_spec, document_store);
+
+ // Retrieve docs outside of ResultStateManager critical section.
+ // Will enter ResultState critical section inside ResultRetriever.
+ auto [page_result, has_more_results] =
+ result_retriever.RetrieveNextPage(*result_state);
+ if (!has_more_results) {
// No more pages, won't store ResultState, returns directly
- return PageResultState(
- std::move(page_result_document_hits), kInvalidNextPageToken,
- std::move(snippet_context_copy), std::move(projection_tree_map_copy),
- num_previously_returned, num_per_page);
+ return std::make_pair(kInvalidNextPageToken, std::move(page_result));
}
- absl_ports::unique_lock l(&mutex_);
-
// ResultState has multiple pages, storing it
- uint64_t next_page_token = Add(std::move(result_state));
+ int num_hits_to_add = 0;
+ {
+ // ResultState critical section
+ absl_ports::unique_lock l(&result_state->mutex);
+
+ result_state->scored_document_hits_ranker->TruncateHitsTo(max_total_hits_);
+ result_state->RegisterNumTotalHits(&num_total_hits_);
+ num_hits_to_add = result_state->scored_document_hits_ranker->size();
+ }
- return PageResultState(std::move(page_result_document_hits), next_page_token,
- std::move(snippet_context_copy),
- std::move(projection_tree_map_copy),
- num_previously_returned, num_per_page);
-}
+ // It is fine to exit the ResultState critical section here, since the state
+ // was just created above and only this thread (this call stack) has access
+ // to it. Thus, it won't change during the gap before we enter the
+ // ResultStateManager critical section.
+ uint64_t next_page_token = kInvalidNextPageToken;
+ {
+ // ResultStateManager critical section
+ absl_ports::unique_lock l(&mutex_);
+
+ // Remove expired result states first.
+ InternalInvalidateExpiredResultStates(kDefaultResultStateTtlInMs);
+ // Remove states to make room for this new state.
+ RemoveStatesIfNeeded(num_hits_to_add);
+ // Generate a new unique token and add it into result_state_map_.
+ next_page_token = Add(std::move(result_state));
+ }
-uint64_t ResultStateManager::Add(ResultState result_state) {
- RemoveStatesIfNeeded(result_state);
- result_state.TruncateHitsTo(max_total_hits_);
+ return std::make_pair(next_page_token, std::move(page_result));
+}
+uint64_t ResultStateManager::Add(std::shared_ptr<ResultStateV2> result_state) {
uint64_t new_token = GetUniqueToken();
- num_total_hits_ += result_state.num_remaining();
result_state_map_.emplace(new_token, std::move(result_state));
// Tracks the insertion order
token_queue_.push(
@@ -83,43 +108,40 @@ uint64_t ResultStateManager::Add(ResultState result_state) {
return new_token;
}
-libtextclassifier3::StatusOr<PageResultState> ResultStateManager::GetNextPage(
- uint64_t next_page_token) {
- absl_ports::unique_lock l(&mutex_);
+libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
+ResultStateManager::GetNextPage(uint64_t next_page_token,
+ const ResultRetrieverV2& result_retriever) {
+ std::shared_ptr<ResultStateV2> result_state = nullptr;
+ {
+ // ResultStateManager critical section
+ absl_ports::unique_lock l(&mutex_);
- const auto& state_iterator = result_state_map_.find(next_page_token);
- if (state_iterator == result_state_map_.end()) {
- return absl_ports::NotFoundError("next_page_token not found");
- }
+ // Remove expired result states before fetching
+ InternalInvalidateExpiredResultStates(kDefaultResultStateTtlInMs);
- int num_returned = state_iterator->second.num_returned();
- int num_per_page = state_iterator->second.num_per_page();
- std::vector<ScoredDocumentHit> result_of_page =
- state_iterator->second.GetNextPage(document_store_);
- if (result_of_page.empty()) {
- // This shouldn't happen, all our active states should contain results, but
- // a sanity check here in case of any data inconsistency.
- InternalInvalidateResultState(next_page_token);
- return absl_ports::NotFoundError(
- "No more results, token has been invalidated.");
+ const auto& state_iterator = result_state_map_.find(next_page_token);
+ if (state_iterator == result_state_map_.end()) {
+ return absl_ports::NotFoundError("next_page_token not found");
+ }
+ result_state = state_iterator->second;
}
- // Copies the SnippetContext in case the ResultState is invalidated.
- SnippetContext snippet_context_copy =
- state_iterator->second.snippet_context();
+ // Retrieve docs outside of ResultStateManager critical section.
+ // Will enter ResultState critical section inside ResultRetriever.
+ auto [page_result, has_more_results] =
+ result_retriever.RetrieveNextPage(*result_state);
- std::unordered_map<std::string, ProjectionTree> projection_tree_map_copy =
- state_iterator->second.projection_tree_map();
+ if (!has_more_results) {
+ {
+ // ResultStateManager critical section
+ absl_ports::unique_lock l(&mutex_);
+
+ InternalInvalidateResultState(next_page_token);
+ }
- if (!state_iterator->second.HasMoreResults()) {
- InternalInvalidateResultState(next_page_token);
next_page_token = kInvalidNextPageToken;
}
-
- num_total_hits_ -= result_of_page.size();
- return PageResultState(
- result_of_page, next_page_token, std::move(snippet_context_copy),
- std::move(projection_tree_map_copy), num_returned, num_per_page);
+ return std::make_pair(next_page_token, std::move(page_result));
}
void ResultStateManager::InvalidateResultState(uint64_t next_page_token) {
@@ -137,17 +159,13 @@ void ResultStateManager::InvalidateAllResultStates() {
InternalInvalidateAllResultStates();
}
-void ResultStateManager::InvalidateExpiredResultStates(
- int64_t result_state_ttl) {
- absl_ports::unique_lock l(&mutex_);
- InternalInvalidateExpiredResultStates(result_state_ttl);
-}
-
void ResultStateManager::InternalInvalidateAllResultStates() {
+ // We don't have to reset num_total_hits_ (to 0) here, since clearing
+ // result_state_map_ will "eventually" invoke the destructor of ResultState
+ // (which decrements num_total_hits_) and num_total_hits_ will become 0.
result_state_map_.clear();
invalidated_token_set_.clear();
token_queue_ = std::queue<std::pair<uint64_t, int64_t>>();
- num_total_hits_ = 0;
}
uint64_t ResultStateManager::GetUniqueToken() {
@@ -163,14 +181,14 @@ uint64_t ResultStateManager::GetUniqueToken() {
return new_token;
}
-void ResultStateManager::RemoveStatesIfNeeded(const ResultState& result_state) {
+void ResultStateManager::RemoveStatesIfNeeded(int num_hits_to_add) {
if (result_state_map_.empty() || token_queue_.empty()) {
return;
}
// 1. Check if this new result_state would take up the entire result state
// manager budget.
- if (result_state.num_remaining() > max_total_hits_) {
+ if (num_hits_to_add > max_total_hits_) {
// This single result state will exceed our budget. Drop everything else to
// accomodate it.
InternalInvalidateAllResultStates();
@@ -187,7 +205,13 @@ void ResultStateManager::RemoveStatesIfNeeded(const ResultState& result_state) {
// 3. If we're over budget, remove states from oldest to newest until we fit
// into our budget.
- while (result_state.num_remaining() + num_total_hits_ > max_total_hits_) {
+ // Note: num_total_hits_ may not be decremented immediately after invalidating
+ // a result state, since other threads may still hold the shared pointer.
+ // Thus, we have to check whether token_queue_ is empty, because
+ // num_total_hits_ can be non-zero and even greater than max_total_hits_
+ // while token_queue_ is already empty. It will still "eventually" be
+ // decremented once the last thread releases the shared pointer.
+ while (!token_queue_.empty() && num_total_hits_ > max_total_hits_) {
InternalInvalidateResultState(token_queue_.front().first);
token_queue_.pop();
}
@@ -201,7 +225,9 @@ void ResultStateManager::InternalInvalidateResultState(uint64_t token) {
// remove the token in RemoveStatesIfNeeded().
auto itr = result_state_map_.find(token);
if (itr != result_state_map_.end()) {
- num_total_hits_ -= itr->second.num_remaining();
+ // We don't have to decrement num_total_hits_ here, since erasing the shared
+ // ptr instance will "eventually" invoke the destructor of ResultState and
+ // it will handle this.
result_state_map_.erase(itr);
invalidated_token_set_.insert(token);
}
@@ -214,7 +240,9 @@ void ResultStateManager::InternalInvalidateExpiredResultStates(
current_time - token_queue_.front().second >= result_state_ttl) {
auto itr = result_state_map_.find(token_queue_.front().first);
if (itr != result_state_map_.end()) {
- num_total_hits_ -= itr->second.num_remaining();
+ // We don't have to decrement num_total_hits_ here, since erasing the
+ // shared ptr instance will "eventually" invoke the destructor of
+ // ResultState and it will handle this.
result_state_map_.erase(itr);
} else {
// Since result_state_map_ and invalidated_token_set_ are mutually
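The comments above all lean on one invariant: each result state registers itself against the shared atomic num_total_hits_ counter, and its destructor undoes the registration, so the count is corrected "eventually", once the last std::shared_ptr owner releases the state. A minimal sketch of that pattern; the names (HitState, RegisterNumTotalHits) mirror the diff but the class is illustrative.

// Sketch: a state object registers against a shared atomic counter and
// decrements it in its destructor, so erasing the shared_ptr from a map
// only drops the count once every thread has released its reference.
#include <atomic>
#include <iostream>
#include <memory>

class HitState {
 public:
  explicit HitState(int num_hits) : num_hits_(num_hits) {}
  ~HitState() {
    if (counter_ != nullptr) counter_->fetch_sub(num_hits_);
  }
  void RegisterNumTotalHits(std::atomic<int> *counter) {
    counter_ = counter;
    counter_->fetch_add(num_hits_);
  }

 private:
  int num_hits_;
  std::atomic<int> *counter_ = nullptr;
};

int main() {
  std::atomic<int> num_total_hits{0};
  auto state = std::make_shared<HitState>(5);
  state->RegisterNumTotalHits(&num_total_hits);
  std::shared_ptr<HitState> reader = state;  // e.g. a concurrent GetNextPage
  state.reset();                             // "evicted" from the cache map
  std::cout << num_total_hits.load() << "\n";  // still 5: reader holds a ref
  reader.reset();                              // last owner gone
  std::cout << num_total_hits.load() << "\n";  // now 0, "eventually"
  return 0;
}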
diff --git a/icing/result/result-state-manager.h b/icing/result/result-state-manager.h
index 745b0ec..0684864 100644
--- a/icing/result/result-state-manager.h
+++ b/icing/result/result-state-manager.h
@@ -15,6 +15,8 @@
#ifndef ICING_RESULT_RESULT_STATE_MANAGER_H_
#define ICING_RESULT_RESULT_STATE_MANAGER_H_
+#include <atomic>
+#include <memory>
#include <queue>
#include <random>
#include <unordered_map>
@@ -24,8 +26,11 @@
#include "icing/absl_ports/mutex.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/search.pb.h"
-#include "icing/result/page-result-state.h"
-#include "icing/result/result-state.h"
+#include "icing/query/query-terms.h"
+#include "icing/result/page-result.h"
+#include "icing/result/result-retriever-v2.h"
+#include "icing/result/result-state-v2.h"
+#include "icing/scoring/scored-document-hits-ranker.h"
#include "icing/util/clock.h"
namespace icing {
@@ -49,30 +54,46 @@ class ResultStateManager {
ResultStateManager(const ResultStateManager&) = delete;
ResultStateManager& operator=(const ResultStateManager&) = delete;
- // Ranks the results and returns the first page of them. The result object
- // PageResultState contains a next_page_token which can be used to fetch more
- // pages later. It will be set to a default value 0 if there're no more pages.
+ // Creates a new result state, retrieves and returns PageResult for the first
+ // page. Also caches the new result state and returns a next_page_token which
+ // can be used to fetch more pages from the same result state later. Before
+ // caching the result state, truncates its size and evicts old result states
+ // if the cache size limit would be exceeded. next_page_token will be set to
+ // the default value kInvalidNextPageToken if there are no more pages.
//
- // NOTE: it's caller's responsibility not to call this method with the same
- // ResultState more than once, otherwise duplicate states will be stored
- // internally.
+ // NOTE: it is possible to get an empty result for the first page even if the
+ // ranker was not empty before the retrieval, since GroupResultLimiter
+ // may filter out all docs. In this case, the first page is also the
+ // last page and next_page_token will be set to kInvalidNextPageToken.
//
// Returns:
- // A PageResultState on success
- // INVALID_ARGUMENT if the input state contains no results
- libtextclassifier3::StatusOr<PageResultState> RankAndPaginate(
- ResultState result_state) ICING_LOCKS_EXCLUDED(mutex_);
+ // A token and PageResult wrapped by std::pair on success
+ // INVALID_ARGUMENT if the input ranker is null or contains no results
+ libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
+ CacheAndRetrieveFirstPage(std::unique_ptr<ScoredDocumentHitsRanker> ranker,
+ SectionRestrictQueryTermsMap query_terms,
+ const SearchSpecProto& search_spec,
+ const ScoringSpecProto& scoring_spec,
+ const ResultSpecProto& result_spec,
+ const DocumentStore& document_store,
+ const ResultRetrieverV2& result_retriever)
+ ICING_LOCKS_EXCLUDED(mutex_);
- // Retrieves and returns the next page of results wrapped in PageResultState.
+ // Retrieves and returns PageResult for the next page.
// The returned results won't exist in ResultStateManager anymore. If the
// query has no more pages after this retrieval, the input token will be
// invalidated.
//
+ // NOTE: it is possible to get an empty result for the last page even if the
+ // ranker was not empty before the retrieval, since GroupResultLimiter
+ // may filter out all remaining docs.
+ //
// Returns:
- // PageResultState on success, guaranteed to have non-empty results
+ // A token and PageResult wrapped by std::pair on success
// NOT_FOUND if failed to find any more results
- libtextclassifier3::StatusOr<PageResultState> GetNextPage(
- uint64_t next_page_token) ICING_LOCKS_EXCLUDED(mutex_);
+ libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>> GetNextPage(
+ uint64_t next_page_token, const ResultRetrieverV2& result_retriever)
+ ICING_LOCKS_EXCLUDED(mutex_);
// Invalidates the result state associated with the given next-page token.
void InvalidateResultState(uint64_t next_page_token)
@@ -81,12 +102,6 @@ class ResultStateManager {
// Invalidates all result states / tokens currently in ResultStateManager.
void InvalidateAllResultStates() ICING_LOCKS_EXCLUDED(mutex_);
- // Invalidates expired result states / tokens currently in ResultStateManager
- // that were created before current_time - result_state_ttl.
- void InvalidateExpiredResultStates(
- int64_t result_state_ttl = kDefaultResultStateTtlInMs)
- ICING_LOCKS_EXCLUDED(mutex_);
-
private:
absl_ports::shared_mutex mutex_;
@@ -100,10 +115,10 @@ class ResultStateManager {
// The number of scored document hits that all result states currently held by
// the result state manager have.
- int num_total_hits_;
+ std::atomic<int> num_total_hits_;
// A hash map of (next-page token -> result state)
- std::unordered_map<uint64_t, ResultState> result_state_map_
+ std::unordered_map<uint64_t, std::shared_ptr<ResultStateV2>> result_state_map_
ICING_GUARDED_BY(mutex_);
// A queue used to track the insertion order of tokens with pushed timestamps.
@@ -125,14 +140,16 @@ class ResultStateManager {
// currently valid tokens. When the maximum number of result states is
// reached, the oldest / firstly added result state will be removed to make
// room for the new state.
- uint64_t Add(ResultState result_state) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ uint64_t Add(std::shared_ptr<ResultStateV2> result_state)
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Helper method to generate a next-page token that is unique among all
// existing tokens in token_queue_.
uint64_t GetUniqueToken() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
- // Helper method to remove old states to make room for incoming states.
- void RemoveStatesIfNeeded(const ResultState& result_state)
+ // Helper method to remove old states to make room for an incoming state
+ // with num_hits_to_add hits.
+ void RemoveStatesIfNeeded(int num_hits_to_add)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Helper method to remove a result state from result_state_map_, the token
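RemoveStatesIfNeeded and token_queue_ together implement a FIFO cache keyed by next-page token with a global hit budget. Below is a simplified, self-contained model of that eviction policy; it uses sequential tokens and omits TTLs, random token generation, and locking, so the names and structure are illustrative rather than Icing's.

// Sketch: FIFO token cache with a total-hits budget, modeling the
// eviction in RemoveStatesIfNeeded. All names are illustrative.
#include <cstdint>
#include <iostream>
#include <memory>
#include <queue>
#include <unordered_map>

struct State { int num_hits; };

class TokenCache {
 public:
  explicit TokenCache(int max_total_hits) : max_total_hits_(max_total_hits) {}

  uint64_t Add(std::shared_ptr<State> state) {
    // Evict oldest states until the newcomer fits in the budget.
    while (!queue_.empty() &&
           total_hits_ + state->num_hits > max_total_hits_) {
      total_hits_ -= map_.at(queue_.front())->num_hits;
      map_.erase(queue_.front());
      queue_.pop();
    }
    uint64_t token = next_token_++;
    total_hits_ += state->num_hits;
    map_.emplace(token, std::move(state));
    queue_.push(token);
    return token;
  }

  bool Contains(uint64_t token) const { return map_.count(token) > 0; }

 private:
  int max_total_hits_;
  int total_hits_ = 0;
  uint64_t next_token_ = 1;
  std::unordered_map<uint64_t, std::shared_ptr<State>> map_;
  std::queue<uint64_t> queue_;
};

int main() {
  TokenCache cache(/*max_total_hits=*/10);
  uint64_t t1 = cache.Add(std::make_shared<State>(State{6}));
  uint64_t t2 = cache.Add(std::make_shared<State>(State{6}));  // evicts t1
  std::cout << cache.Contains(t1) << " " << cache.Contains(t2) << "\n";  // 0 1
  return 0;
}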
diff --git a/icing/result/result-state-manager_test.cc b/icing/result/result-state-manager_test.cc
index 251a736..7025c63 100644
--- a/icing/result/result-state-manager_test.cc
+++ b/icing/result/result-state-manager_test.cc
@@ -16,23 +16,39 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
+#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/portable/equals-proto.h"
+#include "icing/result/page-result.h"
+#include "icing/result/result-retriever-v2.h"
#include "icing/schema/schema-store.h"
+#include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
+#include "icing/scoring/scored-document-hits-ranker.h"
#include "icing/store/document-store.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
#include "icing/util/clock.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
namespace {
+
using ::icing::lib::portable_equals_proto::EqualsProto;
-using ::testing::ElementsAre;
using ::testing::Eq;
-using ::testing::Gt;
using ::testing::IsEmpty;
+using ::testing::Not;
+using ::testing::SizeIs;
+using PageResultInfo = std::pair<uint64_t, PageResult>;
+
+// TODO(sungyc): Refactor helper functions below (builder classes or common test
+// utility).
ScoringSpecProto CreateScoringSpec() {
ScoringSpecProto scoring_spec;
@@ -46,102 +62,175 @@ ResultSpecProto CreateResultSpec(int num_per_page) {
return result_spec;
}
-ScoredDocumentHit CreateScoredHit(DocumentId document_id) {
- return ScoredDocumentHit(document_id, kSectionIdMaskNone, /*score=*/1);
+DocumentProto CreateDocument(int id) {
+ return DocumentBuilder()
+ .SetNamespace("namespace")
+ .SetUri(std::to_string(id))
+ .SetSchema("Document")
+ .SetCreationTimestampMs(1574365086666 + id)
+ .SetScore(1)
+ .Build();
}
class ResultStateManagerTest : public testing::Test {
protected:
+ ResultStateManagerTest() : test_dir_(GetTestTempDir() + "/icing") {
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ }
+
void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
clock_ = std::make_unique<FakeClock>();
- schema_store_base_dir_ = GetTestTempDir() + "/schema_store";
- filesystem_.CreateDirectoryRecursively(schema_store_base_dir_.c_str());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
ICING_ASSERT_OK_AND_ASSIGN(
- schema_store_, SchemaStore::Create(&filesystem_, schema_store_base_dir_,
- clock_.get()));
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, clock_.get()));
SchemaProto schema;
schema.add_types()->set_schema_type("Document");
ICING_ASSERT_OK(schema_store_->SetSchema(std::move(schema)));
- doc_store_base_dir_ = GetTestTempDir() + "/document_store";
- filesystem_.CreateDirectoryRecursively(doc_store_base_dir_.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/10000));
+
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult result,
- DocumentStore::Create(&filesystem_, doc_store_base_dir_, clock_.get(),
+ DocumentStore::Create(&filesystem_, test_dir_, clock_.get(),
schema_store_.get()));
document_store_ = std::move(result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ result_retriever_, ResultRetrieverV2::Create(
+ document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
}
void TearDown() override {
- filesystem_.DeleteDirectoryRecursively(doc_store_base_dir_.c_str());
- filesystem_.DeleteDirectoryRecursively(schema_store_base_dir_.c_str());
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
clock_.reset();
}
- ResultState CreateResultState(
- const std::vector<ScoredDocumentHit>& scored_document_hits,
- int num_per_page) {
- return ResultState(scored_document_hits, /*query_terms=*/{},
- SearchSpecProto::default_instance(), CreateScoringSpec(),
- CreateResultSpec(num_per_page), *document_store_);
- }
-
- ScoredDocumentHit AddScoredDocument(DocumentId document_id) {
+ std::pair<ScoredDocumentHit, DocumentProto> AddScoredDocument(
+ DocumentId document_id) {
DocumentProto document;
document.set_namespace_("namespace");
document.set_uri(std::to_string(document_id));
document.set_schema("Document");
- document_store_->Put(std::move(document));
- return ScoredDocumentHit(document_id, kSectionIdMaskNone, /*score=*/1);
+ document.set_creation_timestamp_ms(1574365086666 + document_id);
+ document_store_->Put(document);
+ return std::make_pair(
+ ScoredDocumentHit(document_id, kSectionIdMaskNone, /*score=*/1),
+ std::move(document));
+ }
+
+ std::pair<std::vector<ScoredDocumentHit>, std::vector<DocumentProto>>
+ AddScoredDocuments(const std::vector<DocumentId>& document_ids) {
+ std::vector<ScoredDocumentHit> scored_document_hits;
+ std::vector<DocumentProto> document_protos;
+
+ for (DocumentId document_id : document_ids) {
+ std::pair<ScoredDocumentHit, DocumentProto> pair =
+ AddScoredDocument(document_id);
+ scored_document_hits.emplace_back(std::move(pair.first));
+ document_protos.emplace_back(std::move(pair.second));
+ }
+
+ std::reverse(document_protos.begin(), document_protos.end());
+
+ return std::make_pair(std::move(scored_document_hits),
+ std::move(document_protos));
}
FakeClock* clock() { return clock_.get(); }
const FakeClock* clock() const { return clock_.get(); }
+ DocumentStore& document_store() { return *document_store_; }
const DocumentStore& document_store() const { return *document_store_; }
+ const ResultRetrieverV2& result_retriever() const {
+ return *result_retriever_;
+ }
+
private:
Filesystem filesystem_;
+ const std::string test_dir_;
std::unique_ptr<FakeClock> clock_;
- std::string doc_store_base_dir_;
- std::string schema_store_base_dir_;
- std::unique_ptr<DocumentStore> document_store_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<Normalizer> normalizer_;
+ std::unique_ptr<DocumentStore> document_store_;
+ std::unique_ptr<ResultRetrieverV2> result_retriever_;
};
-TEST_F(ResultStateManagerTest, ShouldRankAndPaginateOnePage) {
- ResultState original_result_state =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1),
- AddScoredDocument(/*document_id=*/2)},
- /*num_per_page=*/10);
+TEST_F(ResultStateManagerTest, ShouldCacheAndRetrieveFirstPageOnePage) {
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store().Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store().Put(CreateDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store().Put(CreateDocument(/*id=*/3)));
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, kSectionIdMaskNone, /*score=*/1},
+ {document_id2, kSectionIdMaskNone, /*score=*/1},
+ {document_id3, kSectionIdMaskNone, /*score=*/1}};
+ std::unique_ptr<ScoredDocumentHitsRanker> ranker =
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits), /*is_descending=*/true);
ResultStateManager result_state_manager(
/*max_total_hits=*/std::numeric_limits<int>::max(), document_store(),
clock());
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state,
- result_state_manager.RankAndPaginate(std::move(original_result_state)));
+ PageResultInfo page_result_info,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::move(ranker),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/10),
+ document_store(), result_retriever()));
- EXPECT_THAT(page_result_state.next_page_token, Eq(kInvalidNextPageToken));
+ EXPECT_THAT(page_result_info.first, Eq(kInvalidNextPageToken));
- // Should get the original scored document hits
- EXPECT_THAT(
- page_result_state.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/2)),
- EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/1)),
- EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/0))));
+ // Should get docs.
+ ASSERT_THAT(page_result_info.second.results, SizeIs(3));
+ EXPECT_THAT(page_result_info.second.results.at(0).document(),
+ EqualsProto(CreateDocument(/*id=*/3)));
+ EXPECT_THAT(page_result_info.second.results.at(1).document(),
+ EqualsProto(CreateDocument(/*id=*/2)));
+ EXPECT_THAT(page_result_info.second.results.at(2).document(),
+ EqualsProto(CreateDocument(/*id=*/1)));
}
-TEST_F(ResultStateManagerTest, ShouldRankAndPaginateMultiplePages) {
- ResultState original_result_state =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1),
- AddScoredDocument(/*document_id=*/2),
- AddScoredDocument(/*document_id=*/3),
- AddScoredDocument(/*document_id=*/4)},
- /*num_per_page=*/2);
+TEST_F(ResultStateManagerTest, ShouldCacheAndRetrieveFirstPageMultiplePages) {
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store().Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store().Put(CreateDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store().Put(CreateDocument(/*id=*/3)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ document_store().Put(CreateDocument(/*id=*/4)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5,
+ document_store().Put(CreateDocument(/*id=*/5)));
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, kSectionIdMaskNone, /*score=*/1},
+ {document_id2, kSectionIdMaskNone, /*score=*/1},
+ {document_id3, kSectionIdMaskNone, /*score=*/1},
+ {document_id4, kSectionIdMaskNone, /*score=*/1},
+ {document_id5, kSectionIdMaskNone, /*score=*/1}};
+ std::unique_ptr<ScoredDocumentHitsRanker> ranker =
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits), /*is_descending=*/true);
ResultStateManager result_state_manager(
/*max_total_hits=*/std::numeric_limits<int>::max(), document_store(),
@@ -149,977 +238,1132 @@ TEST_F(ResultStateManagerTest, ShouldRankAndPaginateMultiplePages) {
// First page, 2 results
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(original_result_state)));
- EXPECT_THAT(
- page_result_state1.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/4)),
- EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/3))));
-
- uint64_t next_page_token = page_result_state1.next_page_token;
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::move(ranker),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/2),
+ document_store(), result_retriever()));
+ EXPECT_THAT(page_result_info1.first, Not(Eq(kInvalidNextPageToken)));
+ ASSERT_THAT(page_result_info1.second.results, SizeIs(2));
+ EXPECT_THAT(page_result_info1.second.results.at(0).document(),
+ EqualsProto(CreateDocument(/*id=*/5)));
+ EXPECT_THAT(page_result_info1.second.results.at(1).document(),
+ EqualsProto(CreateDocument(/*id=*/4)));
+
+ uint64_t next_page_token = page_result_info1.first;
// Second page, 2 results
- ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state2,
- result_state_manager.GetNextPage(next_page_token));
- EXPECT_THAT(
- page_result_state2.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/2)),
- EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/1))));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info2,
+ result_state_manager.GetNextPage(next_page_token, result_retriever()));
+ EXPECT_THAT(page_result_info2.first, Eq(next_page_token));
+ ASSERT_THAT(page_result_info2.second.results, SizeIs(2));
+ EXPECT_THAT(page_result_info2.second.results.at(0).document(),
+ EqualsProto(CreateDocument(/*id=*/3)));
+ EXPECT_THAT(page_result_info2.second.results.at(1).document(),
+ EqualsProto(CreateDocument(/*id=*/2)));
// Third page, 1 result
- ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state3,
- result_state_manager.GetNextPage(next_page_token));
- EXPECT_THAT(
- page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/0))));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info3,
+ result_state_manager.GetNextPage(next_page_token, result_retriever()));
+ EXPECT_THAT(page_result_info3.first, Eq(kInvalidNextPageToken));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(CreateDocument(/*id=*/1)));
// No results
- EXPECT_THAT(result_state_manager.GetNextPage(next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(
+ result_state_manager.GetNextPage(next_page_token, result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(ResultStateManagerTest, EmptyStateShouldReturnError) {
- ResultState empty_result_state = CreateResultState({}, /*num_per_page=*/1);
+TEST_F(ResultStateManagerTest, NullRankerShouldReturnError) {
+ ResultStateManager result_state_manager(
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store(),
+ clock());
+
+ EXPECT_THAT(result_state_manager.CacheAndRetrieveFirstPage(
+ /*ranker=*/nullptr,
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+TEST_F(ResultStateManagerTest, EmptyRankerShouldReturnEmptyFirstPage) {
ResultStateManager result_state_manager(
/*max_total_hits=*/std::numeric_limits<int>::max(), document_store(),
clock());
- EXPECT_THAT(
- result_state_manager.RankAndPaginate(std::move(empty_result_state)),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::vector<ScoredDocumentHit>(), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
+ EXPECT_THAT(page_result_info.first, Eq(kInvalidNextPageToken));
+ EXPECT_THAT(page_result_info.second.results, IsEmpty());
}
-TEST_F(ResultStateManagerTest, ShouldInvalidateOneToken) {
- ResultState result_state1 =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1),
- AddScoredDocument(/*document_id=*/2)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({AddScoredDocument(/*document_id=*/3),
- AddScoredDocument(/*document_id=*/4),
- AddScoredDocument(/*document_id=*/5)},
- /*num_per_page=*/1);
+TEST_F(ResultStateManagerTest, ShouldAllowEmptyFirstPage) {
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store().Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store().Put(CreateDocument(/*id=*/2)));
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, kSectionIdMaskNone, /*score=*/1},
+ {document_id2, kSectionIdMaskNone, /*score=*/1}};
+
+ ResultStateManager result_state_manager(
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store(),
+ clock());
+
+ // Create a ResultSpec that limits "namespace" to 0 results.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ result_grouping->set_max_results(0);
+ result_grouping->add_namespaces("namespace");
+
+ // First page, no results.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), result_spec, document_store(),
+ result_retriever()));
+ // If the first page has no results, then it should be the last page.
+ EXPECT_THAT(page_result_info.first, Eq(kInvalidNextPageToken));
+ EXPECT_THAT(page_result_info.second.results, IsEmpty());
+}
+
+TEST_F(ResultStateManagerTest, ShouldAllowEmptyLastPage) {
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store().Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store().Put(CreateDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store().Put(CreateDocument(/*id=*/3)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ document_store().Put(CreateDocument(/*id=*/4)));
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, kSectionIdMaskNone, /*score=*/1},
+ {document_id2, kSectionIdMaskNone, /*score=*/1},
+ {document_id3, kSectionIdMaskNone, /*score=*/1},
+ {document_id4, kSectionIdMaskNone, /*score=*/1}};
+
+ ResultStateManager result_state_manager(
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store(),
+ clock());
+
+ // Create a ResultSpec that limits "namespace" to 2 results.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ result_grouping->set_max_results(2);
+ result_grouping->add_namespaces("namespace");
+
+ // First page, 2 results.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), result_spec, document_store(),
+ result_retriever()));
+ EXPECT_THAT(page_result_info1.first, Not(Eq(kInvalidNextPageToken)));
+ ASSERT_THAT(page_result_info1.second.results, SizeIs(2));
+ EXPECT_THAT(page_result_info1.second.results.at(0).document(),
+ EqualsProto(CreateDocument(/*id=*/4)));
+ EXPECT_THAT(page_result_info1.second.results.at(1).document(),
+ EqualsProto(CreateDocument(/*id=*/3)));
+
+ uint64_t next_page_token = page_result_info1.first;
+
+ // Second page: all remaining documents will be filtered out by the group
+ // result limiter, so we should get an empty page.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info2,
+ result_state_manager.GetNextPage(next_page_token, result_retriever()));
+ EXPECT_THAT(page_result_info2.first, Eq(kInvalidNextPageToken));
+ EXPECT_THAT(page_result_info2.second.results, IsEmpty());
+}
+
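
Both empty-page tests above hinge on the group result limiter: a ResultGrouping caps how many documents its namespaces may contribute, and hits over the cap are dropped at retrieval time, which is why a page can come back empty with an OK status instead of NOT_FOUND. A hedged sketch of such a per-group cap (illustrative names, not the actual limiter):

  #include <string>
  #include <unordered_map>

  // Illustrative only: keep a hit while its group is still under the
  // configured max_results, consuming one slot per kept hit.
  bool ShouldKeepHit(const std::string& group_key, int max_results,
                     std::unordered_map<std::string, int>* emitted_per_group) {
    int& emitted = (*emitted_per_group)[group_key];
    if (emitted >= max_results) return false;
    ++emitted;
    return true;
  }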
+TEST_F(ResultStateManagerTest,
+ ShouldInvalidateExpiredTokensWhenCacheAndRetrieveFirstPage) {
+ auto [scored_document_hits1, document_protos1] = AddScoredDocuments(
+ {/*document_id=*/0, /*document_id=*/1, /*document_id=*/2});
+ auto [scored_document_hits2, document_protos2] = AddScoredDocuments(
+ {/*document_id=*/3, /*document_id=*/4, /*document_id=*/5});
ResultStateManager result_state_manager(
/*max_total_hits=*/std::numeric_limits<int>::max(), document_store(),
clock());
+
+ SectionRestrictQueryTermsMap query_terms;
+ SearchSpecProto search_spec;
+ ScoringSpecProto scoring_spec = CreateScoringSpec();
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1);
+
+ // Set time to 1s and add state 1.
+ clock()->SetSystemTimeMilliseconds(1000);
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ query_terms, search_spec, scoring_spec, result_spec, document_store(),
+ result_retriever()));
+ ASSERT_THAT(page_result_info1.first, Not(Eq(kInvalidNextPageToken)));
+
+ // Set time to 1hr 1s and add state 2.
+ clock()->SetSystemTimeMilliseconds(kDefaultResultStateTtlInMs + 1000);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ query_terms, search_spec, scoring_spec, result_spec, document_store(),
+ result_retriever()));
+
+ // Calling CacheAndRetrieveFirstPage() on state 2 should invalidate the
+ // expired state 1 internally.
+ //
+ // We test this behavior by setting the time back to 1s, to make sure the
+ // invalidation of state 1 was done by the previous
+ // CacheAndRetrieveFirstPage() instead of the following GetNextPage().
+ clock()->SetSystemTimeMilliseconds(1000);
+ // page_result_info1's token (page_result_info1.first) shouldn't be found.
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info1.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(ResultStateManagerTest,
+ ShouldInvalidateExpiredTokensWhenGetNextPageOnOthers) {
+ auto [scored_document_hits1, document_protos1] = AddScoredDocuments(
+ {/*document_id=*/0, /*document_id=*/1, /*document_id=*/2});
+ auto [scored_document_hits2, document_protos2] = AddScoredDocuments(
+ {/*document_id=*/3, /*document_id=*/4, /*document_id=*/5});
+
+ ResultStateManager result_state_manager(
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store(),
+ clock());
+
+ // Set time to 1s and add state 1.
+ clock()->SetSystemTimeMilliseconds(1000);
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+ ASSERT_THAT(page_result_info1.first, Not(Eq(kInvalidNextPageToken)));
+
+ // Set time to 2s and add state 2.
+ clock()->SetSystemTimeMilliseconds(2000);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+ ASSERT_THAT(page_result_info2.first, Not(Eq(kInvalidNextPageToken)));
+
+ // 1. Set time to 1hr 1s.
+ // 2. Call GetNextPage() on state 2. It should correctly invalidate the
+ // expired state 1.
+ // 3. Then calling GetNextPage() on state 1 shouldn't return anything.
+ clock()->SetSystemTimeMilliseconds(kDefaultResultStateTtlInMs + 1000);
+ // page_result_info2's token (page_result_info2.first) should be found
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info2,
+ result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever()));
+ // We test this behavior by setting the time back to 2s, to make sure the
+ // invalidation of state 1 was done by the previous GetNextPage() instead of
+ // the following GetNextPage().
+ clock()->SetSystemTimeMilliseconds(2000);
+ // page_result_info1's token (page_result_info1.first) shouldn't be found.
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info1.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
- result_state_manager.InvalidateResultState(
- page_result_state1.next_page_token);
+TEST_F(ResultStateManagerTest,
+ ShouldInvalidateExpiredTokensWhenGetNextPageOnItself) {
+ auto [scored_document_hits1, document_protos1] = AddScoredDocuments(
+ {/*document_id=*/0, /*document_id=*/1, /*document_id=*/2});
+ auto [scored_document_hits2, document_protos2] = AddScoredDocuments(
+ {/*document_id=*/3, /*document_id=*/4, /*document_id=*/5});
- // page_result_state1.next_page_token() shouldn't be found
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state1.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ ResultStateManager result_state_manager(
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store(),
+ clock());
- // page_result_state2.next_page_token() should still exist
+ // Set time to 1s and add the state.
+ clock()->SetSystemTimeMilliseconds(1000);
ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state2,
- result_state_manager.GetNextPage(page_result_state2.next_page_token));
- EXPECT_THAT(
- page_result_state2.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/4))));
+ PageResultInfo page_result_info,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+ ASSERT_THAT(page_result_info.first, Not(Eq(kInvalidNextPageToken)));
+
+ // 1. Set time to 1hr 1s.
+ // 2. Then calling GetNextPage() on the state shouldn't return anything.
+ clock()->SetSystemTimeMilliseconds(kDefaultResultStateTtlInMs + 1000);
+ // page_result_info's token (page_result_info.first) shouldn't be found.
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
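
Taken together, the three expiry tests pin down when stale states are swept: adding a new state via CacheAndRetrieveFirstPage(), calling GetNextPage() on another state, and calling GetNextPage() on the expired state itself all invalidate anything past its TTL. The expiry predicate they imply reduces to a timestamp comparison; a sketch assuming each cached state records its creation time (illustrative, not the real bookkeeping):

  #include <cstdint>

  // A state created at creation_timestamp_ms has expired once the clock has
  // advanced by at least kDefaultResultStateTtlInMs (1 hour in these tests).
  bool IsStateExpired(int64_t creation_timestamp_ms, int64_t current_time_ms) {
    return current_time_ms - creation_timestamp_ms >= kDefaultResultStateTtlInMs;
  }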
-TEST_F(ResultStateManagerTest, ShouldInvalidateAllTokens) {
- ResultState result_state1 =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1),
- AddScoredDocument(/*document_id=*/2)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({AddScoredDocument(/*document_id=*/3),
- AddScoredDocument(/*document_id=*/4),
- AddScoredDocument(/*document_id=*/5)},
- /*num_per_page=*/1);
+TEST_F(ResultStateManagerTest, ShouldInvalidateOneToken) {
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store().Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store().Put(CreateDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store().Put(CreateDocument(/*id=*/3)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ document_store().Put(CreateDocument(/*id=*/4)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5,
+ document_store().Put(CreateDocument(/*id=*/5)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id6,
+ document_store().Put(CreateDocument(/*id=*/6)));
+ std::vector<ScoredDocumentHit> scored_document_hits1 = {
+ {document_id1, kSectionIdMaskNone, /*score=*/1},
+ {document_id2, kSectionIdMaskNone, /*score=*/1},
+ {document_id3, kSectionIdMaskNone, /*score=*/1}};
+ std::vector<ScoredDocumentHit> scored_document_hits2 = {
+ {document_id4, kSectionIdMaskNone, /*score=*/1},
+ {document_id5, kSectionIdMaskNone, /*score=*/1},
+ {document_id6, kSectionIdMaskNone, /*score=*/1}};
ResultStateManager result_state_manager(
/*max_total_hits=*/std::numeric_limits<int>::max(), document_store(),
clock());
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
- result_state_manager.InvalidateAllResultStates();
+ // Invalidate first result state by the token.
+ result_state_manager.InvalidateResultState(page_result_info1.first);
- // page_result_state1.next_page_token() shouldn't be found
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state1.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ // page_result_info1's token (page_result_info1.first) shouldn't be found
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info1.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- // page_result_state2.next_page_token() shouldn't be found
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state2.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ // page_result_info2's token (page_result_info2.first) should still exist
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info2,
+ result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever()));
+ // Should get the next page of state 2's docs.
+ ASSERT_THAT(page_result_info2.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info2.second.results.at(0).document(),
+ EqualsProto(CreateDocument(/*id=*/5)));
}
-TEST_F(ResultStateManagerTest, ShouldInvalidateOldTokens) {
- ResultState result_state1 =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1),
- AddScoredDocument(/*document_id=*/2)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({AddScoredDocument(/*document_id=*/3),
- AddScoredDocument(/*document_id=*/4),
- AddScoredDocument(/*document_id=*/5)},
- /*num_per_page=*/1);
+TEST_F(ResultStateManagerTest, ShouldInvalidateAllTokens) {
+ auto [scored_document_hits1, document_protos1] = AddScoredDocuments(
+ {/*document_id=*/0, /*document_id=*/1, /*document_id=*/2});
+ auto [scored_document_hits2, document_protos2] = AddScoredDocuments(
+ {/*document_id=*/3, /*document_id=*/4, /*document_id=*/5});
ResultStateManager result_state_manager(
/*max_total_hits=*/std::numeric_limits<int>::max(), document_store(),
clock());
- // Set time as 1s and add state 1.
- clock()->SetSystemTimeMilliseconds(1000);
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
- // Set time as 1hr2s and add state 2.
- clock()->SetSystemTimeMilliseconds(kDefaultResultStateTtlInMs + 2000);
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
- // Invalidates expired states with default ttl (1 hr). This should only
- // invalidate state 1.
- result_state_manager.InvalidateExpiredResultStates();
+ result_state_manager.InvalidateAllResultStates();
- // page_result_state1.next_page_token() shouldn't be found
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state1.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ // page_result_info1's token (page_result_info1.first) shouldn't be found
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info1.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- // page_result_state2.next_page_token() should be found
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state2,
- result_state_manager.GetNextPage(page_result_state2.next_page_token));
- EXPECT_THAT(page_result_state2.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/4))));
+ // page_result_info2's token (page_result_info2.first) shouldn't be found
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info2.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(ResultStateManagerTest, ShouldRemoveOldestResultState) {
- ResultState result_state1 =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({AddScoredDocument(/*document_id=*/2),
- AddScoredDocument(/*document_id=*/3)},
- /*num_per_page=*/1);
- ResultState result_state3 =
- CreateResultState({AddScoredDocument(/*document_id=*/4),
- AddScoredDocument(/*document_id=*/5)},
- /*num_per_page=*/1);
+ auto [scored_document_hits1, document_protos1] =
+ AddScoredDocuments({/*document_id=*/0, /*document_id=*/1});
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/2, /*document_id=*/3});
+ auto [scored_document_hits3, document_protos3] =
+ AddScoredDocuments({/*document_id=*/4, /*document_id=*/5});
ResultStateManager result_state_manager(/*max_total_hits=*/2,
document_store(), clock());
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
// Adding state 3 should cause state 1 to be removed.
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state3,
- result_state_manager.RankAndPaginate(std::move(result_state3)));
+ PageResultInfo page_result_info3,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits3), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state1.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state2,
- result_state_manager.GetNextPage(page_result_state2.next_page_token));
- EXPECT_THAT(page_result_state2.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/2))));
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info1.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state3,
- result_state_manager.GetNextPage(page_result_state3.next_page_token));
- EXPECT_THAT(page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/4))));
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info2,
+ result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever()));
+ ASSERT_THAT(page_result_info2.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info2.second.results.at(0).document(),
+ EqualsProto(document_protos2.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info3,
+ result_state_manager.GetNextPage(
+ page_result_info3.first, result_retriever()));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(1)));
}
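
ShouldRemoveOldestResultState documents the eviction policy behind max_total_hits: with a budget of 2 and one hit left cached per state after its first page is returned, caching a third state pushes the oldest one out. Under those assumptions the bookkeeping reduces to something like this sketch (illustrative types and names only, not the actual implementation):

  #include <cstdint>
  #include <deque>

  struct CachedStateEntry {
    uint64_t token;
    int remaining_hits;  // Hits still cached for pages not yet served.
  };

  // Evict oldest-first until the incoming state's hits fit in the budget.
  void EvictUntilBudgetFits(std::deque<CachedStateEntry>* states,
                            int* num_cached_hits, int incoming_hits,
                            int max_total_hits) {
    while (!states->empty() &&
           *num_cached_hits + incoming_hits > max_total_hits) {
      *num_cached_hits -= states->front().remaining_hits;
      states->pop_front();  // Its token will now return NOT_FOUND.
    }
  }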
TEST_F(ResultStateManagerTest,
InvalidatedResultStateShouldDecreaseCurrentHitsCount) {
- ResultState result_state1 =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({AddScoredDocument(/*document_id=*/2),
- AddScoredDocument(/*document_id=*/3)},
- /*num_per_page=*/1);
- ResultState result_state3 =
- CreateResultState({AddScoredDocument(/*document_id=*/4),
- AddScoredDocument(/*document_id=*/5)},
- /*num_per_page=*/1);
+ auto [scored_document_hits1, document_protos1] =
+ AddScoredDocuments({/*document_id=*/0, /*document_id=*/1});
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/2, /*document_id=*/3});
+ auto [scored_document_hits3, document_protos3] =
+ AddScoredDocuments({/*document_id=*/4, /*document_id=*/5});
// Add the first three states. Remember, the first page for each result state
- // won't be cached (since it is returned immediately from RankAndPaginate).
- // Each result state has a page size of 1 and a result set of 2 hits. So each
- // result will take up one hit of our three hit budget.
+ // won't be cached (since it is returned immediately from
+ // CacheAndRetrieveFirstPage). Each result state has a page size of 1 and a
+ // result set of 2 hits. So each result state will take up one hit of our
+ // three-hit budget.
ResultStateManager result_state_manager(/*max_total_hits=*/3,
document_store(), clock());
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state3,
- result_state_manager.RankAndPaginate(std::move(result_state3)));
+ PageResultInfo page_result_info3,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits3), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
// Invalidates state 2, so that the number of hits currently cached should be
// decremented to 2.
- result_state_manager.InvalidateResultState(
- page_result_state2.next_page_token);
+ result_state_manager.InvalidateResultState(page_result_info2.first);
// If invalidating state 2 correctly decremented the current hit count to 2,
// then adding state 4 should still be within our budget and no other result
// states should be evicted.
- ResultState result_state4 =
- CreateResultState({AddScoredDocument(/*document_id=*/6),
- AddScoredDocument(/*document_id=*/7)},
- /*num_per_page=*/1);
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state4,
- result_state_manager.RankAndPaginate(std::move(result_state4)));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state1,
- result_state_manager.GetNextPage(page_result_state1.next_page_token));
- EXPECT_THAT(page_result_state1.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/0))));
-
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state2.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state3,
- result_state_manager.GetNextPage(page_result_state3.next_page_token));
- EXPECT_THAT(page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/4))));
+ auto [scored_document_hits4, document_protos4] =
+ AddScoredDocuments({/*document_id=*/6, /*document_id=*/7});
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info4,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits4), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info1,
+ result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever()));
+ ASSERT_THAT(page_result_info1.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info1.second.results.at(0).document(),
+ EqualsProto(document_protos1.at(1)));
+
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info2.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state4,
- result_state_manager.GetNextPage(page_result_state4.next_page_token));
- EXPECT_THAT(page_result_state4.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/6))));
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info3,
+ result_state_manager.GetNextPage(
+ page_result_info3.first, result_retriever()));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info4,
+ result_state_manager.GetNextPage(
+ page_result_info4.first, result_retriever()));
+ ASSERT_THAT(page_result_info4.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info4.second.results.at(0).document(),
+ EqualsProto(document_protos4.at(1)));
}
TEST_F(ResultStateManagerTest,
InvalidatedAllResultStatesShouldResetCurrentHitCount) {
- ResultState result_state1 =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({AddScoredDocument(/*document_id=*/2),
- AddScoredDocument(/*document_id=*/3)},
- /*num_per_page=*/1);
- ResultState result_state3 =
- CreateResultState({AddScoredDocument(/*document_id=*/4),
- AddScoredDocument(/*document_id=*/5)},
- /*num_per_page=*/1);
+ auto [scored_document_hits1, document_protos1] =
+ AddScoredDocuments({/*document_id=*/0, /*document_id=*/1});
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/2, /*document_id=*/3});
+ auto [scored_document_hits3, document_protos3] =
+ AddScoredDocuments({/*document_id=*/4, /*document_id=*/5});
// Add the first three states. Remember, the first page for each result state
- // won't be cached (since it is returned immediately from RankAndPaginate).
- // Each result state has a page size of 1 and a result set of 2 hits. So each
- // result will take up one hit of our three hit budget.
+ // won't be cached (since it is returned immediately from
+ // CacheAndRetrieveFirstPage). Each result state has a page size of 1 and a
+ // result set of 2 hits. So each result state will take up one hit of our
+ // three-hit budget.
ResultStateManager result_state_manager(/*max_total_hits=*/3,
document_store(), clock());
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state3,
- result_state_manager.RankAndPaginate(std::move(result_state3)));
+ PageResultInfo page_result_info3,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits3), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
// Invalidates all states so that the current hit count will be 0.
result_state_manager.InvalidateAllResultStates();
// If invalidating all states correctly reset the current hit count to 0,
- // then the entirety of state 4 should still be within our budget and no other
+ // then adding states 4, 5, and 6 should still be within our budget and no other
// result states should be evicted.
- ResultState result_state4 =
- CreateResultState({AddScoredDocument(/*document_id=*/6),
- AddScoredDocument(/*document_id=*/7)},
- /*num_per_page=*/1);
- ResultState result_state5 =
- CreateResultState({AddScoredDocument(/*document_id=*/8),
- AddScoredDocument(/*document_id=*/9)},
- /*num_per_page=*/1);
- ResultState result_state6 =
- CreateResultState({AddScoredDocument(/*document_id=*/10),
- AddScoredDocument(/*document_id=*/11)},
- /*num_per_page=*/1);
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state4,
- result_state_manager.RankAndPaginate(std::move(result_state4)));
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state5,
- result_state_manager.RankAndPaginate(std::move(result_state5)));
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state6,
- result_state_manager.RankAndPaginate(std::move(result_state6)));
-
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state1.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state2.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state3.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state4,
- result_state_manager.GetNextPage(page_result_state4.next_page_token));
- EXPECT_THAT(page_result_state4.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/6))));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state5,
- result_state_manager.GetNextPage(page_result_state5.next_page_token));
- EXPECT_THAT(page_result_state5.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/8))));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state6,
- result_state_manager.GetNextPage(page_result_state6.next_page_token));
- EXPECT_THAT(page_result_state6.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/10))));
-}
-
-TEST_F(ResultStateManagerTest,
- InvalidatedOldResultStatesShouldDecreaseCurrentHitsCount) {
- ResultState result_state1 =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1),
- AddScoredDocument(/*document_id=*/2),
- AddScoredDocument(/*document_id=*/3)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({AddScoredDocument(/*document_id=*/4),
- AddScoredDocument(/*document_id=*/5)},
- /*num_per_page=*/1);
- ResultState result_state3 =
- CreateResultState({AddScoredDocument(/*document_id=*/6),
- AddScoredDocument(/*document_id=*/7)},
- /*num_per_page=*/1);
- ResultState result_state4 =
- CreateResultState({AddScoredDocument(/*document_id=*/8),
- AddScoredDocument(/*document_id=*/9)},
- /*num_per_page=*/1);
-
- // Add the first three states. Remember, the first page for each result state
- // won't be cached (since it is returned immediately from RankAndPaginate).
- // So state 1 ~ state 4 will take up 6 hits in total.
- ResultStateManager result_state_manager(/*max_total_hits=*/6,
- document_store(), clock());
- // Set time as 1000ms and add state 1.
- clock()->SetSystemTimeMilliseconds(1000);
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
- // Set time as 1001ms and add state 2.
- clock()->SetSystemTimeMilliseconds(1001);
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
- // Set time as 1002ms and add state 3.
- clock()->SetSystemTimeMilliseconds(1002);
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state3,
- result_state_manager.RankAndPaginate(std::move(result_state3)));
- // Set time as 1003ms and add state 4.
- clock()->SetSystemTimeMilliseconds(1003);
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state4,
- result_state_manager.RankAndPaginate(std::move(result_state4)));
-
- // Set time as kDefaultResultStateTtlInMs + 1001ms and invalidate expired
- // states with default ttl (1 hr). This should invalidate state 1 and state 2.
- clock()->SetSystemTimeMilliseconds(kDefaultResultStateTtlInMs + 1001);
- result_state_manager.InvalidateExpiredResultStates();
+ auto [scored_document_hits4, document_protos4] =
+ AddScoredDocuments({/*document_id=*/6, /*document_id=*/7});
+ auto [scored_document_hits5, document_protos5] =
+ AddScoredDocuments({/*document_id=*/8, /*document_id=*/9});
+ auto [scored_document_hits6, document_protos6] =
+ AddScoredDocuments({/*document_id=*/10, /*document_id=*/11});
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info4,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits4), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info5,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits5), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info6,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits6), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info1.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- // page_result_state1.next_page_token() shouldn't be found
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state1.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- // page_result_state2.next_page_token() shouldn't be found
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state2.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info2.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- // If invalidating state 1 and state 2 correctly decremented the current hit
- // count by 4 (to 2), then adding state 5 should still be within our budget
- // and no other result states should be evicted.
- ResultState result_state5 =
- CreateResultState({AddScoredDocument(/*document_id=*/10),
- AddScoredDocument(/*document_id=*/11),
- AddScoredDocument(/*document_id=*/12),
- AddScoredDocument(/*document_id=*/13),
- AddScoredDocument(/*document_id=*/14)},
- /*num_per_page=*/1);
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state5,
- result_state_manager.RankAndPaginate(std::move(result_state5)));
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info3.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- // page_result_state3.next_page_token() should be found since there is no
- // eviction.
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state3,
- result_state_manager.GetNextPage(page_result_state3.next_page_token));
- EXPECT_THAT(page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/6))));
- // page_result_state4.next_page_token() should be found since there is no
- // eviction.
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state4,
- result_state_manager.GetNextPage(page_result_state4.next_page_token));
- EXPECT_THAT(page_result_state4.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/8))));
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info4,
+ result_state_manager.GetNextPage(
+ page_result_info4.first, result_retriever()));
+ ASSERT_THAT(page_result_info4.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info4.second.results.at(0).document(),
+ EqualsProto(document_protos4.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info5,
+ result_state_manager.GetNextPage(
+ page_result_info5.first, result_retriever()));
+ ASSERT_THAT(page_result_info5.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info5.second.results.at(0).document(),
+ EqualsProto(document_protos5.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info6,
+ result_state_manager.GetNextPage(
+ page_result_info6.first, result_retriever()));
+ ASSERT_THAT(page_result_info6.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info6.second.results.at(0).document(),
+ EqualsProto(document_protos6.at(1)));
}
TEST_F(
ResultStateManagerTest,
InvalidatedResultStateShouldDecreaseCurrentHitsCountByExactStateHitCount) {
- ResultState result_state1 =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({AddScoredDocument(/*document_id=*/2),
- AddScoredDocument(/*document_id=*/3)},
- /*num_per_page=*/1);
- ResultState result_state3 =
- CreateResultState({AddScoredDocument(/*document_id=*/4),
- AddScoredDocument(/*document_id=*/5)},
- /*num_per_page=*/1);
+ auto [scored_document_hits1, document_protos1] =
+ AddScoredDocuments({/*document_id=*/0, /*document_id=*/1});
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/2, /*document_id=*/3});
+ auto [scored_document_hits3, document_protos3] =
+ AddScoredDocuments({/*document_id=*/4, /*document_id=*/5});
// Add the first three states. Remember, the first page for each result state
- // won't be cached (since it is returned immediately from RankAndPaginate).
- // Each result state has a page size of 1 and a result set of 2 hits. So each
- // result will take up one hit of our three hit budget.
+ // won't be cached (since it is returned immediately from
+ // CacheAndRetrieveFirstPage). Each result state has a page size of 1 and a
+ // result set of 2 hits. So each result state will take up one hit of our
+ // three-hit budget.
ResultStateManager result_state_manager(/*max_total_hits=*/3,
document_store(), clock());
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state3,
- result_state_manager.RankAndPaginate(std::move(result_state3)));
+ PageResultInfo page_result_info3,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits3), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
// Invalidates state 2, so that the number of hits currently cached should be
// decremented to 2.
- result_state_manager.InvalidateResultState(
- page_result_state2.next_page_token);
+ result_state_manager.InvalidateResultState(page_result_info2.first);
// If invalidating state 2 correctly decremented the current hit count to 2,
// then adding state 4 should still be within our budget and no other result
// states should be evicted.
- ResultState result_state4 =
- CreateResultState({AddScoredDocument(/*document_id=*/6),
- AddScoredDocument(/*document_id=*/7)},
- /*num_per_page=*/1);
+ auto [scored_document_hits4, document_protos4] =
+ AddScoredDocuments({/*document_id=*/6, /*document_id=*/7});
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state4,
- result_state_manager.RankAndPaginate(std::move(result_state4)));
+ PageResultInfo page_result_info4,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits4), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
// If invalidating result state 2 correctly decremented the current hit count
// to 2 and adding state 4 correctly incremented it to 3, then adding this
// result state should trigger the eviction of state 1.
- ResultState result_state5 =
- CreateResultState({AddScoredDocument(/*document_id=*/8),
- AddScoredDocument(/*document_id=*/9)},
- /*num_per_page=*/1);
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state5,
- result_state_manager.RankAndPaginate(std::move(result_state5)));
-
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state1.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state2.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state3,
- result_state_manager.GetNextPage(page_result_state3.next_page_token));
- EXPECT_THAT(page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/4))));
+ auto [scored_document_hits5, document_protos5] =
+ AddScoredDocuments({/*document_id=*/8, /*document_id=*/9});
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info5,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits5), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info1.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state4,
- result_state_manager.GetNextPage(page_result_state4.next_page_token));
- EXPECT_THAT(page_result_state4.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/6))));
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info2.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state5,
- result_state_manager.GetNextPage(page_result_state5.next_page_token));
- EXPECT_THAT(page_result_state5.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/8))));
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info3,
+ result_state_manager.GetNextPage(
+ page_result_info3.first, result_retriever()));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info4,
+ result_state_manager.GetNextPage(
+ page_result_info4.first, result_retriever()));
+ ASSERT_THAT(page_result_info4.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info4.second.results.at(0).document(),
+ EqualsProto(document_protos4.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info5,
+ result_state_manager.GetNextPage(
+ page_result_info5.first, result_retriever()));
+ ASSERT_THAT(page_result_info5.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info5.second.results.at(0).document(),
+ EqualsProto(document_protos5.at(1)));
}
TEST_F(ResultStateManagerTest, GetNextPageShouldDecreaseCurrentHitsCount) {
- ResultState result_state1 =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({AddScoredDocument(/*document_id=*/2),
- AddScoredDocument(/*document_id=*/3)},
- /*num_per_page=*/1);
- ResultState result_state3 =
- CreateResultState({AddScoredDocument(/*document_id=*/4),
- AddScoredDocument(/*document_id=*/5)},
- /*num_per_page=*/1);
+ auto [scored_document_hits1, document_protos1] =
+ AddScoredDocuments({/*document_id=*/0, /*document_id=*/1});
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/2, /*document_id=*/3});
+ auto [scored_document_hits3, document_protos3] =
+ AddScoredDocuments({/*document_id=*/4, /*document_id=*/5});
// Add the first three states. Remember, the first page for each result state
- // won't be cached (since it is returned immediately from RankAndPaginate).
- // Each result state has a page size of 1 and a result set of 2 hits. So each
- // result will take up one hit of our three hit budget.
+ // won't be cached (since it is returned immediately from
+ // CacheAndRetrieveFirstPage). Each result state has a page size of 1 and a
+ // result set of 2 hits. So each result state will take up one hit of our
+ // three-hit budget.
ResultStateManager result_state_manager(/*max_total_hits=*/3,
document_store(), clock());
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state3,
- result_state_manager.RankAndPaginate(std::move(result_state3)));
+ PageResultInfo page_result_info3,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits3), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
// GetNextPage for result state 1 should return its result and decrement the
// number of cached hits to 2.
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state1,
- result_state_manager.GetNextPage(page_result_state1.next_page_token));
- EXPECT_THAT(page_result_state1.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/0))));
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info1,
+ result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever()));
+ ASSERT_THAT(page_result_info1.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info1.second.results.at(0).document(),
+ EqualsProto(document_protos1.at(1)));
// If retrieving the next page for result state 1 correctly decremented the
// current hit count to 2, then adding state 4 should still be within our
// budget and no other result states should be evicted.
- ResultState result_state4 =
- CreateResultState({AddScoredDocument(/*document_id=*/6),
- AddScoredDocument(/*document_id=*/7)},
- /*num_per_page=*/1);
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state4,
- result_state_manager.RankAndPaginate(std::move(result_state4)));
-
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state1.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state2,
- result_state_manager.GetNextPage(page_result_state2.next_page_token));
- EXPECT_THAT(page_result_state2.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/2))));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state3,
- result_state_manager.GetNextPage(page_result_state3.next_page_token));
- EXPECT_THAT(page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/4))));
+ auto [scored_document_hits4, document_protos4] =
+ AddScoredDocuments({/*document_id=*/6, /*document_id=*/7});
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info4,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits4), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info1.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state4,
- result_state_manager.GetNextPage(page_result_state4.next_page_token));
- EXPECT_THAT(page_result_state4.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/6))));
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info2,
+ result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever()));
+ ASSERT_THAT(page_result_info2.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info2.second.results.at(0).document(),
+ EqualsProto(document_protos2.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info3,
+ result_state_manager.GetNextPage(
+ page_result_info3.first, result_retriever()));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info4,
+ result_state_manager.GetNextPage(
+ page_result_info4.first, result_retriever()));
+ ASSERT_THAT(page_result_info4.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info4.second.results.at(0).document(),
+ EqualsProto(document_protos4.at(1)));
}
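
GetNextPageShouldDecreaseCurrentHitsCount (and the ...ByExactlyOnePage variant that follows) asserts the converse of eviction: serving a page releases exactly that page's hits from the budget, so a drained state frees all of its hits and makes room for new states. In the same sketch vocabulary as above (again illustrative, not the real code):

  #include <algorithm>

  // At most page_size hits leave the cache per served page; a state whose
  // remaining_hits reaches zero can then be erased outright.
  void OnPageServed(int* state_remaining_hits, int page_size,
                    int* num_cached_hits) {
    const int served = std::min(page_size, *state_remaining_hits);
    *state_remaining_hits -= served;
    *num_cached_hits -= served;
  }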
TEST_F(ResultStateManagerTest,
GetNextPageShouldDecreaseCurrentHitsCountByExactlyOnePage) {
- ResultState result_state1 =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({AddScoredDocument(/*document_id=*/2),
- AddScoredDocument(/*document_id=*/3)},
- /*num_per_page=*/1);
- ResultState result_state3 =
- CreateResultState({AddScoredDocument(/*document_id=*/4),
- AddScoredDocument(/*document_id=*/5)},
- /*num_per_page=*/1);
+ auto [scored_document_hits1, document_protos1] =
+ AddScoredDocuments({/*document_id=*/0, /*document_id=*/1});
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/2, /*document_id=*/3});
+ auto [scored_document_hits3, document_protos3] =
+ AddScoredDocuments({/*document_id=*/4, /*document_id=*/5});
// Add the first three states. Remember, the first page for each result state
- // won't be cached (since it is returned immediately from RankAndPaginate).
- // Each result state has a page size of 1 and a result set of 2 hits. So each
- // result will take up one hit of our three hit budget.
+ // won't be cached (since it is returned immediately from
+ // CacheAndRetrieveFirstPage). Each result state has a page size of 1 and a
+ // result set of 2 hits. So each result state will take up one hit of our
+ // three-hit budget.
ResultStateManager result_state_manager(/*max_total_hits=*/3,
document_store(), clock());
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state3,
- result_state_manager.RankAndPaginate(std::move(result_state3)));
+ PageResultInfo page_result_info3,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits3), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
// GetNextPage for result state 1 should return its result and decrement the
// number of cached hits to 2.
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state1,
- result_state_manager.GetNextPage(page_result_state1.next_page_token));
- EXPECT_THAT(page_result_state1.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/0))));
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info1,
+ result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever()));
+ ASSERT_THAT(page_result_info1.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info1.second.results.at(0).document(),
+ EqualsProto(document_protos1.at(1)));
// If retrieving the next page for result state 1 correctly decremented the
// current hit count to 2, then adding state 4 should still be within our
// budget and no other result states should be evicted.
- ResultState result_state4 =
- CreateResultState({AddScoredDocument(/*document_id=*/6),
- AddScoredDocument(/*document_id=*/7)},
- /*num_per_page=*/1);
+ auto [scored_document_hits4, document_protos4] =
+ AddScoredDocuments({/*document_id=*/6, /*document_id=*/7});
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state4,
- result_state_manager.RankAndPaginate(std::move(result_state4)));
+ PageResultInfo page_result_info4,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits4), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
// If retrieving the next page for result state 1 correctly decremented the
// current hit count to 2 and adding state 4 correctly incremented it to 3,
// then adding this result state should trigger the eviction of state 2.
- ResultState result_state5 =
- CreateResultState({AddScoredDocument(/*document_id=*/8),
- AddScoredDocument(/*document_id=*/9)},
- /*num_per_page=*/1);
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state5,
- result_state_manager.RankAndPaginate(std::move(result_state5)));
-
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state1.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state2.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state3,
- result_state_manager.GetNextPage(page_result_state3.next_page_token));
- EXPECT_THAT(page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/4))));
+ auto [scored_document_hits5, document_protos5] =
+ AddScoredDocuments({/*document_id=*/8, /*document_id=*/9});
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info5,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits5), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info1.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state4,
- result_state_manager.GetNextPage(page_result_state4.next_page_token));
- EXPECT_THAT(page_result_state4.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/6))));
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info2.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state5,
- result_state_manager.GetNextPage(page_result_state5.next_page_token));
- EXPECT_THAT(page_result_state5.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/8))));
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info3,
+ result_state_manager.GetNextPage(
+ page_result_info3.first, result_retriever()));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info4,
+ result_state_manager.GetNextPage(
+ page_result_info4.first, result_retriever()));
+ ASSERT_THAT(page_result_info4.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info4.second.results.at(0).document(),
+ EqualsProto(document_protos4.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info5,
+ result_state_manager.GetNextPage(
+ page_result_info5.first, result_retriever()));
+ ASSERT_THAT(page_result_info5.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info5.second.results.at(0).document(),
+ EqualsProto(document_protos5.at(1)));
}
TEST_F(ResultStateManagerTest,
AddingOverBudgetResultStateShouldEvictAllStates) {
- ResultState result_state1 =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1),
- AddScoredDocument(/*document_id=*/2)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({AddScoredDocument(/*document_id=*/3),
- AddScoredDocument(/*document_id=*/4)},
- /*num_per_page=*/1);
+ auto [scored_document_hits1, document_protos1] = AddScoredDocuments(
+ {/*document_id=*/0, /*document_id=*/1, /*document_id=*/2});
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/3, /*document_id=*/4});
// Add the first two states. Remember, the first page for each result state
- // won't be cached (since it is returned immediately from RankAndPaginate).
- // Each result state has a page size of 1. So 3 hits will remain cached.
+ // won't be cached (since it is returned immediately from
+ // CacheAndRetrieveFirstPage). Each result state has a page size of 1. So 3
+ // hits will remain cached.
ResultStateManager result_state_manager(/*max_total_hits=*/4,
document_store(), clock());
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
// Add a result state that is larger than the entire budget. This should
// result in all previous result states being evicted, the first hit from
// result state 3 being returned and the next four hits being cached (the last
// hit should be dropped because it exceeds the max).
- ResultState result_state3 =
- CreateResultState({AddScoredDocument(/*document_id=*/5),
- AddScoredDocument(/*document_id=*/6),
- AddScoredDocument(/*document_id=*/7),
- AddScoredDocument(/*document_id=*/8),
- AddScoredDocument(/*document_id=*/9),
- AddScoredDocument(/*document_id=*/10)},
- /*num_per_page=*/1);
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state3,
- result_state_manager.RankAndPaginate(std::move(result_state3)));
+ auto [scored_document_hits3, document_protos3] = AddScoredDocuments(
+ {/*document_id=*/5, /*document_id=*/6, /*document_id=*/7,
+ /*document_id=*/8, /*document_id=*/9, /*document_id=*/10});
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info3,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits3), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
+ EXPECT_THAT(page_result_info3.first, Not(Eq(kInvalidNextPageToken)));
// GetNextPage for result state 1 and 2 should return NOT_FOUND.
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state1.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info1.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state2.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info2.first,
+ result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
// Only the next four results in state 3 should be retrievable.
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state3,
- result_state_manager.GetNextPage(page_result_state3.next_page_token));
- EXPECT_THAT(page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/9))));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state3,
- result_state_manager.GetNextPage(page_result_state3.next_page_token));
- EXPECT_THAT(page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/8))));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state3,
- result_state_manager.GetNextPage(page_result_state3.next_page_token));
- EXPECT_THAT(page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/7))));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state3,
- result_state_manager.GetNextPage(page_result_state3.next_page_token));
- EXPECT_THAT(page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/6))));
-
- // The final result should have been dropped because it exceeded the budget.
+ uint64_t next_page_token3 = page_result_info3.first;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ page_result_info3,
+ result_state_manager.GetNextPage(next_page_token3, result_retriever()));
+ EXPECT_THAT(page_result_info3.first, Eq(next_page_token3));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ page_result_info3,
+ result_state_manager.GetNextPage(next_page_token3, result_retriever()));
+ EXPECT_THAT(page_result_info3.first, Eq(next_page_token3));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(2)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ page_result_info3,
+ result_state_manager.GetNextPage(next_page_token3, result_retriever()));
+ EXPECT_THAT(page_result_info3.first, Eq(next_page_token3));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(3)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ page_result_info3,
+ result_state_manager.GetNextPage(next_page_token3, result_retriever()));
+ // The final document should have been dropped because it exceeded the
+ // budget, so the token returned with this last page of results should be
+ // kInvalidNextPageToken.
+ EXPECT_THAT(page_result_info3.first, Eq(kInvalidNextPageToken));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(4)));
+
+ // Double check that next_page_token3 is not retrievable anymore.
EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state3.next_page_token),
+ result_state_manager.GetNextPage(next_page_token3, result_retriever()),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(ResultStateManagerTest,
AddingResultStateShouldEvictOverBudgetResultState) {
- ResultStateManager result_state_manager(/*max_total_hits=*/4,
- document_store(), clock());
// Add a result state that is larger than the entire budget. The entire result
// state will still be cached.
- ResultState result_state1 =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1),
- AddScoredDocument(/*document_id=*/2),
- AddScoredDocument(/*document_id=*/3),
- AddScoredDocument(/*document_id=*/4),
- AddScoredDocument(/*document_id=*/5)},
- /*num_per_page=*/1);
+ auto [scored_document_hits1, document_protos1] = AddScoredDocuments(
+ {/*document_id=*/0, /*document_id=*/1, /*document_id=*/2,
+ /*document_id=*/3, /*document_id=*/4, /*document_id=*/5});
+
+ ResultStateManager result_state_manager(/*max_total_hits=*/4,
+ document_store(), clock());
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
// Add a result state. Because state2 + state1 is larger than the budget,
// state1 should be evicted.
- ResultState result_state2 =
- CreateResultState({AddScoredDocument(/*document_id=*/6),
- AddScoredDocument(/*document_id=*/7)},
- /*num_per_page=*/1);
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/6, /*document_id=*/7});
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
+ document_store(), result_retriever()));
// state1 should have been evicted and state2 should still be retrievable.
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state1.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state2,
- result_state_manager.GetNextPage(page_result_state2.next_page_token));
- EXPECT_THAT(page_result_state2.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(
- /*document_id=*/6))));
-}
-
-TEST_F(ResultStateManagerTest, ShouldGetSnippetContext) {
- ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1);
- result_spec.mutable_snippet_spec()->set_num_to_snippet(5);
- result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
- result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5);
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
- SectionRestrictQueryTermsMap query_terms_map;
- query_terms_map.emplace("term1", std::unordered_set<std::string>());
-
- ResultState original_result_state = ResultState(
- /*scored_document_hits=*/{AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1)},
- query_terms_map, search_spec, CreateScoringSpec(), result_spec,
- document_store());
-
- ResultStateManager result_state_manager(
- /*max_total_hits=*/std::numeric_limits<int>::max(), document_store(),
- clock());
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state,
- result_state_manager.RankAndPaginate(std::move(original_result_state)));
-
- ASSERT_THAT(page_result_state.next_page_token, Gt(kInvalidNextPageToken));
-
- EXPECT_THAT(page_result_state.snippet_context.match_type,
- Eq(TermMatchType::EXACT_ONLY));
- EXPECT_TRUE(page_result_state.snippet_context.query_terms.find("term1") !=
- page_result_state.snippet_context.query_terms.end());
- EXPECT_THAT(page_result_state.snippet_context.snippet_spec,
- EqualsProto(result_spec.snippet_spec()));
-}
-
-TEST_F(ResultStateManagerTest, ShouldGetDefaultSnippetContext) {
- ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1);
- // 0 indicates no snippeting
- result_spec.mutable_snippet_spec()->set_num_to_snippet(0);
- result_spec.mutable_snippet_spec()->set_num_matches_per_property(0);
- result_spec.mutable_snippet_spec()->set_max_window_utf32_length(0);
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
- SectionRestrictQueryTermsMap query_terms_map;
- query_terms_map.emplace("term1", std::unordered_set<std::string>());
-
- ResultState original_result_state = ResultState(
- /*scored_document_hits=*/{AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1)},
- query_terms_map, search_spec, CreateScoringSpec(), result_spec,
- document_store());
-
- ResultStateManager result_state_manager(
- /*max_total_hits=*/std::numeric_limits<int>::max(), document_store(),
- clock());
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state,
- result_state_manager.RankAndPaginate(std::move(original_result_state)));
-
- ASSERT_THAT(page_result_state.next_page_token, Gt(kInvalidNextPageToken));
-
- EXPECT_THAT(page_result_state.snippet_context.query_terms, IsEmpty());
- EXPECT_THAT(
- page_result_state.snippet_context.snippet_spec,
- EqualsProto(ResultSpecProto::SnippetSpecProto::default_instance()));
- EXPECT_THAT(page_result_state.snippet_context.match_type,
- Eq(TermMatchType::UNKNOWN));
-}
-
-TEST_F(ResultStateManagerTest, ShouldGetCorrectNumPreviouslyReturned) {
- ResultState original_result_state =
- CreateResultState({AddScoredDocument(/*document_id=*/0),
- AddScoredDocument(/*document_id=*/1),
- AddScoredDocument(/*document_id=*/2),
- AddScoredDocument(/*document_id=*/3),
- AddScoredDocument(/*document_id=*/4)},
- /*num_per_page=*/2);
-
- ResultStateManager result_state_manager(
- /*max_total_hits=*/std::numeric_limits<int>::max(), document_store(),
- clock());
-
- // First page, 2 results
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(original_result_state)));
- ASSERT_THAT(page_result_state1.scored_document_hits.size(), Eq(2));
-
- // No previously returned results
- EXPECT_THAT(page_result_state1.num_previously_returned, Eq(0));
-
- uint64_t next_page_token = page_result_state1.next_page_token;
-
- // Second page, 2 results
- ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state2,
- result_state_manager.GetNextPage(next_page_token));
- ASSERT_THAT(page_result_state2.scored_document_hits.size(), Eq(2));
-
- // num_previously_returned = size of first page
- EXPECT_THAT(page_result_state2.num_previously_returned, Eq(2));
-
- // Third page, 1 result
- ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state3,
- result_state_manager.GetNextPage(next_page_token));
- ASSERT_THAT(page_result_state3.scored_document_hits.size(), Eq(1));
-
- // num_previously_returned = size of first and second pages
- EXPECT_THAT(page_result_state3.num_previously_returned, Eq(4));
-
- // No more results
- EXPECT_THAT(result_state_manager.GetNextPage(next_page_token),
+ EXPECT_THAT(result_state_manager.GetNextPage(page_result_info1.first,
+ result_retriever()),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-}
-TEST_F(ResultStateManagerTest, ShouldStoreAllHits) {
- ScoredDocumentHit scored_hit_1 = AddScoredDocument(/*document_id=*/0);
- ScoredDocumentHit scored_hit_2 = AddScoredDocument(/*document_id=*/1);
- ScoredDocumentHit scored_hit_3 = AddScoredDocument(/*document_id=*/2);
- ScoredDocumentHit scored_hit_4 = AddScoredDocument(/*document_id=*/3);
- ScoredDocumentHit scored_hit_5 = AddScoredDocument(/*document_id=*/4);
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info2,
+ result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever()));
+ ASSERT_THAT(page_result_info2.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info2.second.results.at(0).document(),
+ EqualsProto(document_protos2.at(1)));
+}
- ResultState original_result_state = CreateResultState(
- {scored_hit_1, scored_hit_2, scored_hit_3, scored_hit_4, scored_hit_5},
- /*num_per_page=*/2);
+TEST_F(ResultStateManagerTest,
+       AddingResultStateShouldNotBeTruncatedAfterFirstPage) {
+ // Add a result state that is larger than the entire budget, but that fits
+ // within the budget once the first page has been returned. The entire result
+ // state will be cached without truncation.
+ auto [scored_document_hits, document_protos] = AddScoredDocuments(
+ {/*document_id=*/0, /*document_id=*/1, /*document_id=*/2,
+ /*document_id=*/3, /*document_id=*/4});
ResultStateManager result_state_manager(/*max_total_hits=*/4,
document_store(), clock());
@@ -1127,33 +1371,46 @@ TEST_F(ResultStateManagerTest, ShouldStoreAllHits) {
// The 5 input scored document hits will not be truncated. The first page of
// two hits will be returned immediately and the other three hits will fit
// within our caching budget.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*query_terms=*/{}, SearchSpecProto::default_instance(),
+ CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/2),
+ document_store(), result_retriever()));
// First page, 2 results
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(original_result_state)));
- EXPECT_THAT(page_result_state1.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(scored_hit_5),
- EqualsScoredDocumentHit(scored_hit_4)));
+ ASSERT_THAT(page_result_info1.second.results, SizeIs(2));
+ EXPECT_THAT(page_result_info1.second.results.at(0).document(),
+ EqualsProto(document_protos.at(0)));
+ EXPECT_THAT(page_result_info1.second.results.at(1).document(),
+ EqualsProto(document_protos.at(1)));
- uint64_t next_page_token = page_result_state1.next_page_token;
+ uint64_t next_page_token = page_result_info1.first;
// Second page, 2 results.
- ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state2,
- result_state_manager.GetNextPage(next_page_token));
- EXPECT_THAT(page_result_state2.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(scored_hit_3),
- EqualsScoredDocumentHit(scored_hit_2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info2,
+ result_state_manager.GetNextPage(next_page_token, result_retriever()));
+ ASSERT_THAT(page_result_info2.second.results, SizeIs(2));
+ EXPECT_THAT(page_result_info2.second.results.at(0).document(),
+ EqualsProto(document_protos.at(2)));
+ EXPECT_THAT(page_result_info2.second.results.at(1).document(),
+ EqualsProto(document_protos.at(3)));
// Third page, 1 result.
- ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state3,
- result_state_manager.GetNextPage(next_page_token));
- EXPECT_THAT(page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(scored_hit_1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info3,
+ result_state_manager.GetNextPage(next_page_token, result_retriever()));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos.at(4)));
// Fourth page, 0 results.
- EXPECT_THAT(result_state_manager.GetNextPage(next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(
+ result_state_manager.GetNextPage(next_page_token, result_retriever()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
} // namespace
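
Taken together, the rewritten tests above follow one caller-side pattern. A minimal sketch of that flow (illustration only, reusing the fixture helpers from the tests; it assumes the icing result-state-manager headers):

// Cache a ranked result set and get the first page back immediately.
ICING_ASSERT_OK_AND_ASSIGN(
    PageResultInfo page_result_info,
    result_state_manager.CacheAndRetrieveFirstPage(
        std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
            std::move(scored_document_hits), /*is_descending=*/true),
        /*query_terms=*/{}, SearchSpecProto::default_instance(),
        CreateScoringSpec(), CreateResultSpec(/*num_per_page=*/1),
        document_store(), result_retriever()));

// page_result_info.first is the next-page token; page_result_info.second
// holds the page. Keep paging until the token becomes invalid or the state
// is evicted (GetNextPage then returns NOT_FOUND).
while (page_result_info.first != kInvalidNextPageToken) {
  auto next_page_or = result_state_manager.GetNextPage(
      page_result_info.first, result_retriever());
  if (!next_page_or.ok()) break;  // e.g. NOT_FOUND after eviction
  page_result_info = std::move(next_page_or).ValueOrDie();
}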
diff --git a/icing/result/result-state-v2.cc b/icing/result/result-state-v2.cc
index dde50e3..9cb3838 100644
--- a/icing/result/result-state-v2.cc
+++ b/icing/result/result-state-v2.cc
@@ -52,6 +52,8 @@ ResultStateV2::ResultStateV2(
snippet_context_(CreateSnippetContext(std::move(query_terms), search_spec,
result_spec)),
num_per_page_(result_spec.num_per_page()),
+ num_total_bytes_per_page_threshold_(
+ result_spec.num_total_bytes_per_page_threshold()),
num_total_hits_(nullptr) {
for (const TypePropertyMask& type_field_mask :
result_spec.type_property_masks()) {
diff --git a/icing/result/result-state-v2.h b/icing/result/result-state-v2.h
index fc56936..97ff4b6 100644
--- a/icing/result/result-state-v2.h
+++ b/icing/result/result-state-v2.h
@@ -78,6 +78,11 @@ class ResultStateV2 {
return num_per_page_;
}
+ int32_t num_total_bytes_per_page_threshold() const
+ ICING_SHARED_LOCKS_REQUIRED(mutex) {
+ return num_total_bytes_per_page_threshold_;
+ }
+
absl_ports::shared_mutex mutex;
// When evaluating the next top K hits from scored_document_hits_ranker, some
@@ -113,8 +118,16 @@ class ResultStateV2 {
// Number of results to return in each page.
int num_per_page_ ICING_GUARDED_BY(mutex);
- // Pointer to a global counter to sum up the size of
- // scored_document_hits_ranker in all ResultStates.
+  // The cutoff threshold for the total bytes of all documents in a single
+  // page, used to limit the page size.
+  // Note that this is a soft limit: the returned page's total bytes may be
+  // smaller than, equal to, or somewhat larger than the threshold, since the
+  // document that crosses the threshold is still included. It only guarantees
+  // that the total bytes of search results won't exceed the threshold by much.
+ int32_t num_total_bytes_per_page_threshold_ ICING_GUARDED_BY(mutex);
+
+ // Pointer to a global counter to sum up the size of scored_document_hits in
+ // all ResultStates.
// Does not own.
std::atomic<int>* num_total_hits_ ICING_GUARDED_BY(mutex);
};
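
The threshold is consumed by the page-building code, which is not part of this hunk. A hypothetical sketch of the soft cutoff it describes (RetrieveNextDocument is an assumed helper, not an Icing API):

// Hypothetical sketch of a soft byte cutoff: the document that crosses the
// threshold is still returned, so a page can overshoot the threshold by at
// most one document's size.
int32_t total_bytes = 0;
std::vector<DocumentProto> page;
for (int i = 0; i < num_per_page_; ++i) {
  DocumentProto doc = RetrieveNextDocument();  // hypothetical helper
  total_bytes += doc.ByteSizeLong();
  page.push_back(std::move(doc));
  if (total_bytes >= num_total_bytes_per_page_threshold_) {
    break;  // cut the page off here; remaining hits stay cached
  }
}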
diff --git a/icing/result/result-state-v2_test.cc b/icing/result/result-state-v2_test.cc
index 8e6b29a..360e03a 100644
--- a/icing/result/result-state-v2_test.cc
+++ b/icing/result/result-state-v2_test.cc
@@ -122,6 +122,49 @@ class ResultStateV2Test : public ::testing::Test {
std::atomic<int> num_total_hits_;
};
+TEST_F(ResultStateV2Test, ShouldInitializeValuesAccordingToSpecs) {
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ result_spec.set_num_total_bytes_per_page_threshold(4096);
+
+ ResultStateV2 result_state(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::vector<ScoredDocumentHit>(),
+ /*is_descending=*/true),
+ /*query_terms=*/{}, CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true), result_spec,
+ document_store());
+
+ absl_ports::shared_lock l(&result_state.mutex);
+
+ EXPECT_THAT(result_state.num_returned, Eq(0));
+ EXPECT_THAT(result_state.num_per_page(), Eq(result_spec.num_per_page()));
+ EXPECT_THAT(result_state.num_total_bytes_per_page_threshold(),
+ Eq(result_spec.num_total_bytes_per_page_threshold()));
+}
+
+TEST_F(ResultStateV2Test, ShouldInitializeValuesAccordingToDefaultSpecs) {
+ ResultSpecProto default_result_spec = ResultSpecProto::default_instance();
+ ASSERT_THAT(default_result_spec.num_per_page(), Eq(10));
+ ASSERT_THAT(default_result_spec.num_total_bytes_per_page_threshold(),
+ Eq(std::numeric_limits<int32_t>::max()));
+
+ ResultStateV2 result_state(
+ std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
+ std::vector<ScoredDocumentHit>(),
+ /*is_descending=*/true),
+ /*query_terms=*/{}, CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true), default_result_spec,
+ document_store());
+
+ absl_ports::shared_lock l(&result_state.mutex);
+
+ EXPECT_THAT(result_state.num_returned, Eq(0));
+ EXPECT_THAT(result_state.num_per_page(),
+ Eq(default_result_spec.num_per_page()));
+ EXPECT_THAT(result_state.num_total_bytes_per_page_threshold(),
+ Eq(default_result_spec.num_total_bytes_per_page_threshold()));
+}
+
TEST_F(ResultStateV2Test, ShouldReturnSnippetContextAccordingToSpecs) {
ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
result_spec.mutable_snippet_spec()->set_num_to_snippet(5);
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index bd1524e..2391900 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -80,6 +80,20 @@ inline std::string AddIndexToPath(int values_size, int index,
// is applied based on the Token's type.
std::string NormalizeToken(const Normalizer& normalizer, const Token& token) {
switch (token.type) {
+ case Token::Type::RFC822_NAME:
+ [[fallthrough]];
+ case Token::Type::RFC822_COMMENT:
+ [[fallthrough]];
+ case Token::Type::RFC822_LOCAL_ADDRESS:
+ [[fallthrough]];
+ case Token::Type::RFC822_ADDRESS:
+ [[fallthrough]];
+ case Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL:
+ [[fallthrough]];
+ case Token::Type::RFC822_ADDRESS_COMPONENT_HOST:
+ [[fallthrough]];
+ case Token::Type::RFC822_TOKEN:
+ [[fallthrough]];
case Token::Type::REGULAR:
return normalizer.NormalizeTerm(token.text);
case Token::Type::VERBATIM:
@@ -126,6 +140,20 @@ CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token,
[[fallthrough]];
case Token::Type::QUERY_PROPERTY:
[[fallthrough]];
+ case Token::Type::RFC822_NAME:
+ [[fallthrough]];
+ case Token::Type::RFC822_COMMENT:
+ [[fallthrough]];
+ case Token::Type::RFC822_LOCAL_ADDRESS:
+ [[fallthrough]];
+ case Token::Type::RFC822_ADDRESS:
+ [[fallthrough]];
+ case Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL:
+ [[fallthrough]];
+ case Token::Type::RFC822_ADDRESS_COMPONENT_HOST:
+ [[fallthrough]];
+ case Token::Type::RFC822_TOKEN:
+ [[fallthrough]];
case Token::Type::INVALID:
ICING_LOG(WARNING)
<< "Unexpected Token type " << static_cast<int>(token.type)
diff --git a/icing/scoring/bm25f-calculator.cc b/icing/scoring/bm25f-calculator.cc
index 4b426a9..28ee2ba 100644
--- a/icing/scoring/bm25f-calculator.cc
+++ b/icing/scoring/bm25f-calculator.cc
@@ -20,7 +20,6 @@
#include <unordered_set>
#include <vector>
-#include "icing/absl_ports/str_cat.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/store/corpus-associated-scoring-data.h"
@@ -116,9 +115,8 @@ float Bm25fCalculator::ComputeScore(const DocHitInfoIterator* query_it,
score += idf_weight * normalized_tf;
}
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "BM25F: corpus_id:%d docid:%d score:%f\n", data.corpus_id(),
- hit_info.document_id(), score);
+ ICING_VLOG(1) << "BM25F: corpus_id:" << data.corpus_id() << " docid:"
+ << hit_info.document_id() << " score:" << score;
return score;
}
@@ -144,8 +142,7 @@ float Bm25fCalculator::GetCorpusIdfWeightForTerm(std::string_view term,
// First, figure out corpus scoring data.
auto status_or = document_store_->GetCorpusAssociatedScoreData(corpus_id);
if (!status_or.ok()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "No scoring data for corpus [%d]", corpus_id);
+ ICING_LOG(ERROR) << "No scoring data for corpus [" << corpus_id << "]";
return 0;
}
CorpusAssociatedScoreData csdata = status_or.ValueOrDie();
@@ -155,9 +152,8 @@ float Bm25fCalculator::GetCorpusIdfWeightForTerm(std::string_view term,
float idf =
nqi != 0 ? log(1.0f + (num_docs - nqi + 0.5f) / (nqi + 0.5f)) : 0.0f;
corpus_idf_map_.insert({corpus_term_info.value, idf});
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "corpus_id:%d term:%s N:%d nqi:%d idf:%f", corpus_id,
- std::string(term).c_str(), num_docs, nqi, idf);
+  ICING_VLOG(1) << "corpus_id:" << corpus_id << " term:" << term
+      << " N:" << num_docs << " nqi:" << nqi << " idf:" << idf;
return idf;
}
@@ -176,8 +172,7 @@ float Bm25fCalculator::GetCorpusAvgDocLength(CorpusId corpus_id) {
// First, figure out corpus scoring data.
auto status_or = document_store_->GetCorpusAssociatedScoreData(corpus_id);
if (!status_or.ok()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "No scoring data for corpus [%d]", corpus_id);
+ ICING_LOG(ERROR) << "No scoring data for corpus [" << corpus_id << "]";
return 0;
}
CorpusAssociatedScoreData csdata = status_or.ValueOrDie();
@@ -205,9 +200,9 @@ float Bm25fCalculator::ComputedNormalizedTermFrequency(
float normalized_tf =
f_q * (k1_ + 1) / (f_q + k1_ * (1 - b_ + b_ * dl / avgdl));
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "corpus_id:%d docid:%d dl:%d avgdl:%f f_q:%f norm_tf:%f\n",
- data.corpus_id(), hit_info.document_id(), dl, avgdl, f_q, normalized_tf);
+ ICING_VLOG(1) << "corpus_id:" << data.corpus_id() << " docid:"
+ << hit_info.document_id() << " dl:" << dl << " avgdl:" << avgdl << " f_q:"
+ << f_q << " norm_tf:" << normalized_tf;
return normalized_tf;
}
@@ -240,8 +235,8 @@ SchemaTypeId Bm25fCalculator::GetSchemaTypeId(DocumentId document_id) const {
// GetDocumentFilterData is if the document_id is outside of the range of
// allocated document_ids, which shouldn't be possible since we're getting
// this document_id from the posting lists.
- ICING_LOG(WARNING) << IcingStringUtil::StringPrintf(
- "No document filter data for document [%d]", document_id);
+ ICING_LOG(WARNING) << "No document filter data for document ["
+ << document_id << "]";
return kInvalidSchemaTypeId;
}
return filter_data_optional.value().schema_type_id();
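
For reference, the quantities these log statements print are the standard BM25F pieces as computed in the surrounding code (natural log, since C++ log is base e):

\mathrm{idf}(q_i) = \ln\!\left(1 + \frac{N - n_{q_i} + 0.5}{n_{q_i} + 0.5}\right), \qquad
\widetilde{tf}(q_i, D) = \frac{f_q\,(k_1 + 1)}{f_q + k_1\left(1 - b + b\,\frac{dl}{avgdl}\right)}, \qquad
\mathrm{score}(D) = \sum_i \mathrm{idf}(q_i)\,\widetilde{tf}(q_i, D)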
diff --git a/icing/scoring/priority-queue-scored-document-hits-ranker.cc b/icing/scoring/priority-queue-scored-document-hits-ranker.cc
index 13da0ae..691b088 100644
--- a/icing/scoring/priority-queue-scored-document-hits-ranker.cc
+++ b/icing/scoring/priority-queue-scored-document-hits-ranker.cc
@@ -23,11 +23,9 @@ namespace icing {
namespace lib {
PriorityQueueScoredDocumentHitsRanker::PriorityQueueScoredDocumentHitsRanker(
- const std::vector<ScoredDocumentHit>& scored_document_hits,
- bool is_descending)
+ std::vector<ScoredDocumentHit>&& scored_document_hits, bool is_descending)
: comparator_(/*is_ascending=*/!is_descending),
- scored_document_hits_pq_(scored_document_hits.begin(),
- scored_document_hits.end(), comparator_) {}
+ scored_document_hits_pq_(comparator_, std::move(scored_document_hits)) {}
ScoredDocumentHit PriorityQueueScoredDocumentHitsRanker::PopNext() {
ScoredDocumentHit ret = scored_document_hits_pq_.top();
diff --git a/icing/scoring/priority-queue-scored-document-hits-ranker.h b/icing/scoring/priority-queue-scored-document-hits-ranker.h
index c104585..e0ae4b0 100644
--- a/icing/scoring/priority-queue-scored-document-hits-ranker.h
+++ b/icing/scoring/priority-queue-scored-document-hits-ranker.h
@@ -29,7 +29,7 @@ namespace lib {
class PriorityQueueScoredDocumentHitsRanker : public ScoredDocumentHitsRanker {
public:
explicit PriorityQueueScoredDocumentHitsRanker(
- const std::vector<ScoredDocumentHit>& scored_document_hits,
+ std::vector<ScoredDocumentHit>&& scored_document_hits,
bool is_descending = true);
~PriorityQueueScoredDocumentHitsRanker() override = default;
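
This signature change lets std::priority_queue adopt the vector as its underlying container rather than copying the hits range element by element. A caller-side sketch (illustrative; ScoreAllHits is an assumed producer, not an Icing API):

std::vector<ScoredDocumentHit> hits = ScoreAllHits();  // hypothetical producer
// The vector is moved into the underlying container; construction only has
// to heapify the adopted storage instead of copying every ScoredDocumentHit.
auto ranker = std::make_unique<PriorityQueueScoredDocumentHitsRanker>(
    std::move(hits), /*is_descending=*/true);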
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index aa3122b..8a79b6d 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -1489,8 +1489,11 @@ libtextclassifier3::Status DocumentStore::UpdateSchemaStore(
// Update the SchemaTypeId for this entry
ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
schema_store_->GetSchemaTypeId(document.schema()));
- filter_cache_->mutable_array()[document_id].set_schema_type_id(
- schema_type_id);
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<DocumentFilterData>::MutableView
+ doc_filter_data_view,
+ filter_cache_->GetMutable(document_id));
+ doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
} else {
// Document is no longer valid with the new SchemaStore. Mark as
// deleted
@@ -1550,8 +1553,11 @@ libtextclassifier3::Status DocumentStore::OptimizedUpdateSchemaStore(
ICING_ASSIGN_OR_RETURN(
SchemaTypeId schema_type_id,
schema_store_->GetSchemaTypeId(document.schema()));
- filter_cache_->mutable_array()[document_id].set_schema_type_id(
- schema_type_id);
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<DocumentFilterData>::MutableView
+ doc_filter_data_view,
+ filter_cache_->GetMutable(document_id));
+ doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
}
if (revalidate_document) {
delete_document = !document_validator_.Validate(document).ok();
@@ -1576,9 +1582,10 @@ libtextclassifier3::Status DocumentStore::Optimize() {
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status DocumentStore::OptimizeInto(
- const std::string& new_directory, const LanguageSegmenter* lang_segmenter,
- OptimizeStatsProto* stats) {
+libtextclassifier3::StatusOr<std::vector<DocumentId>>
+DocumentStore::OptimizeInto(const std::string& new_directory,
+ const LanguageSegmenter* lang_segmenter,
+ OptimizeStatsProto* stats) {
// Validates directory
if (new_directory == base_dir_) {
return absl_ports::InvalidArgumentError(
@@ -1596,6 +1603,7 @@ libtextclassifier3::Status DocumentStore::OptimizeInto(
int num_deleted = 0;
int num_expired = 0;
UsageStore::UsageScores default_usage;
+ std::vector<DocumentId> document_id_old_to_new(size, kInvalidDocumentId);
for (DocumentId document_id = 0; document_id < size; document_id++) {
auto document_or = Get(document_id, /*clear_internal_fields=*/false);
if (absl_ports::IsNotFound(document_or.status())) {
@@ -1641,6 +1649,8 @@ libtextclassifier3::Status DocumentStore::OptimizeInto(
return new_document_id_or.status();
}
+ document_id_old_to_new[document_id] = new_document_id_or.ValueOrDie();
+
// Copy over usage scores.
ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores,
usage_store_->GetUsageScores(document_id));
@@ -1659,7 +1669,7 @@ libtextclassifier3::Status DocumentStore::OptimizeInto(
stats->set_num_expired_documents(num_expired);
}
ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk(PersistType::FULL));
- return libtextclassifier3::Status::OK;
+ return document_id_old_to_new;
}
libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo>
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index 450b1b9..41dd6a9 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -388,10 +388,10 @@ class DocumentStore {
// method based on device usage.
//
// Returns:
- // OK on success
+ // A vector that maps from old document id to new document id on success
// INVALID_ARGUMENT if new_directory is same as current base directory
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status OptimizeInto(
+ libtextclassifier3::StatusOr<std::vector<DocumentId>> OptimizeInto(
const std::string& new_directory, const LanguageSegmenter* lang_segmenter,
OptimizeStatsProto* stats = nullptr);
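
A sketch of how a caller could consume the returned mapping, for example to translate hits while compacting the index (illustrative only; the real wiring lives elsewhere, and RemapHit is a hypothetical helper):

ICING_ASSIGN_OR_RETURN(
    std::vector<DocumentId> document_id_old_to_new,
    doc_store->OptimizeInto(new_directory, lang_segmenter));
for (DocumentId old_id = 0;
     old_id < static_cast<DocumentId>(document_id_old_to_new.size());
     ++old_id) {
  DocumentId new_id = document_id_old_to_new[old_id];
  if (new_id == kInvalidDocumentId) {
    continue;  // document was deleted or expired during optimize
  }
  RemapHit(old_id, new_id);  // hypothetical: rewrite index entries
}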
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index 59e5d74..6f444cb 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -59,6 +59,7 @@ namespace {
using ::icing::lib::portable_equals_proto::EqualsProto;
using ::testing::_;
+using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::Ge;
using ::testing::Gt;
@@ -1058,8 +1059,8 @@ TEST_F(DocumentStoreTest, OptimizeInto) {
// deleted
ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
- ICING_ASSERT_OK(
- doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+ EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()),
+ IsOkAndHolds(ElementsAre(0, 1, 2)));
int64_t optimized_size1 =
filesystem_.GetFileSize(optimized_document_log.c_str());
EXPECT_EQ(original_size, optimized_size1);
@@ -1069,8 +1070,9 @@ TEST_F(DocumentStoreTest, OptimizeInto) {
ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
ICING_ASSERT_OK(doc_store->Delete("namespace", "uri1"));
- ICING_ASSERT_OK(
- doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+  // DocumentId 0 was removed.
+  EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()),
+              IsOkAndHolds(ElementsAre(kInvalidDocumentId, 0, 1)));
int64_t optimized_size2 =
filesystem_.GetFileSize(optimized_document_log.c_str());
EXPECT_THAT(original_size, Gt(optimized_size2));
@@ -1083,11 +1085,39 @@ TEST_F(DocumentStoreTest, OptimizeInto) {
// expired
ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
- ICING_ASSERT_OK(
- doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+  // DocumentId 0 was removed, and DocumentId 2 is expired.
+ EXPECT_THAT(
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()),
+ IsOkAndHolds(ElementsAre(kInvalidDocumentId, 0, kInvalidDocumentId)));
int64_t optimized_size3 =
filesystem_.GetFileSize(optimized_document_log.c_str());
EXPECT_THAT(optimized_size2, Gt(optimized_size3));
+
+ // Delete the last document
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+ ICING_ASSERT_OK(doc_store->Delete("namespace", "uri2"));
+  // DocumentIds 0 and 1 were removed, and DocumentId 2 is expired.
+ EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()),
+ IsOkAndHolds(ElementsAre(kInvalidDocumentId, kInvalidDocumentId,
+ kInvalidDocumentId)));
+ int64_t optimized_size4 =
+ filesystem_.GetFileSize(optimized_document_log.c_str());
+ EXPECT_THAT(optimized_size3, Gt(optimized_size4));
+}
+
+TEST_F(DocumentStoreTest, OptimizeIntoForEmptyDocumentStore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+ std::string optimized_dir = document_store_dir_ + "_optimize";
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+ EXPECT_THAT(doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()),
+ IsOkAndHolds(IsEmpty()));
}
TEST_F(DocumentStoreTest, ShouldRecoverFromDataLoss) {
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index 4098be5..71e04e2 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -15,12 +15,12 @@
#include <memory>
#include <string_view>
-#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/jni/jni-cache.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/icu-i18n-test-utils.h"
diff --git a/icing/tokenization/language-segmenter-factory.h b/icing/tokenization/language-segmenter-factory.h
index cae3eee..2505a07 100644
--- a/icing/tokenization/language-segmenter-factory.h
+++ b/icing/tokenization/language-segmenter-factory.h
@@ -18,9 +18,8 @@
#include <memory>
#include <string_view>
-#include "icing/jni/jni-cache.h"
-
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/jni/jni-cache.h"
#include "icing/tokenization/language-segmenter.h"
namespace icing {
diff --git a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc
index 8e1e563..dbd7f5a 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc
@@ -21,11 +21,11 @@
#include <cmath>
#include <map>
-#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/text_classifier/lib3/utils/java/jni-base.h"
#include "icing/text_classifier/lib3/utils/java/jni-helper.h"
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/jni/jni-cache.h"
#include "icing/util/status-macros.h"
namespace icing {
diff --git a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h
index 41b470c..537666c 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h
+++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h
@@ -20,8 +20,8 @@
#include <queue>
#include <string>
-#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+#include "icing/jni/jni-cache.h"
namespace icing {
namespace lib {
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
index 0da4c2d..a251f90 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/jni/jni-cache.h"
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/jni/jni-cache.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h"
#include "icing/util/logging.h"
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h
index f06dac9..29df4ee 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h
@@ -21,8 +21,8 @@
#include <string_view>
#include <vector>
-#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/jni/jni-cache.h"
#include "icing/tokenization/language-segmenter.h"
namespace icing {
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
index 8b13cd1..47a01fe 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
@@ -17,11 +17,11 @@
#include <memory>
#include <string_view>
-#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "gmock/gmock.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/jni/jni-cache.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/testing/jni-test-helpers.h"
diff --git a/icing/tokenization/rfc822-tokenizer.cc b/icing/tokenization/rfc822-tokenizer.cc
new file mode 100644
index 0000000..4a96783
--- /dev/null
+++ b/icing/tokenization/rfc822-tokenizer.cc
@@ -0,0 +1,565 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/rfc822-tokenizer.h"
+
+#include <algorithm>
+#include <deque>
+#include <queue>
+#include <string_view>
+#include <utility>
+
+#include "icing/tokenization/token.h"
+#include "icing/tokenization/tokenizer.h"
+#include "icing/util/character-iterator.h"
+#include "icing/util/i18n-utils.h"
+#include "icing/util/status-macros.h"
+#include "unicode/umachine.h"
+
+namespace icing {
+namespace lib {
+
+class Rfc822TokenIterator : public Tokenizer::Iterator {
+ public:
+  // The iterator's cursor is the index into the string_view; text_end_ is the
+  // length.
+ explicit Rfc822TokenIterator(std::string_view text)
+ : term_(std::move(text)),
+ iterator_(text, 0, 0, 0),
+ text_end_(text.length()) {}
+
+ struct NameInfo {
+ NameInfo(const char* at_sign, bool name_found)
+ : at_sign(at_sign), name_found(name_found) {}
+ const char* at_sign;
+ bool name_found;
+ };
+
+ bool Advance() override {
+ // Advance through the queue.
+ if (!token_queue_.empty()) {
+ token_queue_.pop_front();
+ }
+
+ // There is still something left.
+ if (!token_queue_.empty()) {
+ return true;
+ }
+
+ // Done with the entire string_view
+ if (iterator_.utf8_index() >= text_end_) {
+ return false;
+ }
+
+ AdvancePastWhitespace();
+
+ GetNextRfc822Token();
+
+ return true;
+ }
+
+ // Advance until the next email delimiter, generating as many tokens as
+ // necessary.
+ void GetNextRfc822Token() {
+ int token_start = iterator_.utf8_index();
+ const char* at_sign_in_name = nullptr;
+ bool address_found = false;
+ bool name_found = false;
+    // We start in the unquoted state and run until one of ",;\n<( is reached.
+ while (iterator_.utf8_index() < text_end_) {
+ UChar32 c = iterator_.GetCurrentChar();
+ if (c == ',' || c == ';' || c == '\n') {
+ // End of the token, advance cursor past this then quit
+ token_queue_.push_back(Token(
+ Token::Type::RFC822_TOKEN,
+ term_.substr(token_start, iterator_.utf8_index() - token_start)));
+ AdvanceCursor();
+ break;
+ }
+
+ if (c == '"') {
+ NameInfo quoted_result = ConsumeQuotedSection();
+ if (quoted_result.at_sign != nullptr) {
+ at_sign_in_name = quoted_result.at_sign;
+ }
+ if (!name_found) {
+ name_found = quoted_result.name_found;
+ }
+ } else if (c == '(') {
+ ConsumeParenthesizedSection();
+ } else if (c == '<') {
+        // Only set address_found to true if ConsumeAddress returns true.
+        // Otherwise, leave address_found as is, so that a later failed
+        // consume can't reset it to false once it has been set.
+ if (ConsumeAddress()) {
+ address_found = true;
+ }
+ } else {
+ NameInfo unquoted_result = ConsumeUnquotedSection();
+ if (unquoted_result.at_sign != nullptr) {
+ at_sign_in_name = unquoted_result.at_sign;
+ }
+ if (!name_found) {
+ name_found = unquoted_result.name_found;
+ }
+ }
+ }
+ if (iterator_.utf8_index() >= text_end_) {
+ token_queue_.push_back(
+ Token(Token::Type::RFC822_TOKEN,
+ term_.substr(token_start, text_end_ - token_start)));
+ }
+
+ // At this point the token_queue is not empty.
+ // If an address is found, use the tokens we have
+ // If an address isn't found, and a name isn't found, also use the tokens
+ // we have.
+ // If an address isn't found but a name is, convert name Tokens to email
+ // Tokens
+ if (!address_found && name_found) {
+ ConvertNameToEmail(at_sign_in_name);
+ }
+ }
+
+ void ConvertNameToEmail(const char* at_sign_in_name) {
+    // The name tokens will be used as the address now.
+ const char* address_start = nullptr;
+ const char* local_address_end = nullptr;
+ const char* address_end = term_.begin();
+
+    // If we need to transform name tokens into address tokens, we keep the
+    // order in which the name tokens appeared. Name tokens that appear before
+ // an @ sign in the name will become RFC822_ADDRESS_COMPONENT_LOCAL, and
+ // those after will become RFC822_ADDRESS_COMPONENT_HOST. We aren't able
+ // to determine RFC822_ADDRESS and RFC822_LOCAL_ADDRESS before checking
+ // the name tokens, so they will be added after the component tokens.
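+    //
+    // For example, for the un-bracketed input "alex@google.com",
+    // ConsumeUnquotedSection emits the RFC822_NAME tokens "alex", "google"
+    // and "com". This pass rewrites "alex" (before the '@') to
+    // RFC822_ADDRESS_COMPONENT_LOCAL, "google" and "com" (after it) to
+    // RFC822_ADDRESS_COMPONENT_HOST, and appends RFC822_ADDRESS
+    // ("alex@google.com") and RFC822_LOCAL_ADDRESS ("alex") at the end.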
+
+ for (Token& token : token_queue_) {
+ if (token.type == Token::Type::RFC822_NAME) {
+ // Names need to be converted to address tokens
+ std::string_view text = token.text;
+
+ // Find the ADDRESS and LOCAL_ADDRESS.
+ if (address_start == nullptr) {
+ address_start = text.begin();
+ }
+
+ if (at_sign_in_name >= text.end()) {
+ local_address_end = text.end();
+ }
+
+ address_end = text.end();
+
+ if (text.begin() < at_sign_in_name) {
+ token = Token(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, text);
+ } else if (text.begin() > at_sign_in_name) {
+ token = Token(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, text);
+ }
+ }
+ }
+
+ token_queue_.push_back(
+ Token(Token::Type::RFC822_ADDRESS,
+ std::string_view(address_start, address_end - address_start)));
+
+ if (local_address_end != nullptr) {
+ token_queue_.push_back(Token(
+ Token::Type::RFC822_LOCAL_ADDRESS,
+ std::string_view(address_start, local_address_end - address_start)));
+ }
+ }
+
+ // Returns the location of the last at sign in the unquoted section, and if
+ // we have found a name. This is useful in case we do not find an address
+ // and have to use the name. An unquoted section may look like "Alex Sav", or
+ // "alex@google.com". In the absense of a bracketed email address, the
+ // unquoted section will be used as the email address along with the quoted
+ // section.
+ NameInfo ConsumeUnquotedSection() {
+ const char* at_sign_location = nullptr;
+ UChar32 c;
+
+ int token_start = -1;
+ bool name_found = false;
+
+ // Advance to another state or a character marking the end of token, one
+ // of \n,; .
+ while (iterator_.utf8_index() < text_end_) {
+ c = iterator_.GetCurrentChar();
+
+ if (i18n_utils::IsAlphaNumeric(c)) {
+ name_found = true;
+
+ if (token_start == -1) {
+ // Start recording
+ token_start = iterator_.utf8_index();
+ }
+ AdvanceCursor();
+
+ } else {
+ if (token_start != -1) {
+ if (c == '@') {
+ // Mark the last @ sign.
+ at_sign_location = term_.data() + iterator_.utf8_index();
+ }
+
+ // The character is non alphabetic, save a token.
+ token_queue_.push_back(Token(
+ Token::Type::RFC822_NAME,
+ term_.substr(token_start, iterator_.utf8_index() - token_start)));
+ token_start = -1;
+ }
+
+ if (c == '"' || c == '<' || c == '(' || c == '\n' || c == ';' ||
+ c == ',') {
+ // Stay on the token.
+ break;
+ }
+
+ AdvanceCursor();
+ }
+ }
+ if (token_start != -1) {
+ token_queue_.push_back(Token(
+ Token::Type::RFC822_NAME,
+ term_.substr(token_start, iterator_.utf8_index() - token_start)));
+ }
+ return NameInfo(at_sign_location, name_found);
+ }
+
+ // Names that are within quotes should have all characters blindly unescaped.
+ // When a name is made into an address, it isn't re-escaped.
+
+  // Returns the location of the last at sign in the quoted section. This is
+  // useful in case we do not find an address and have to use the name. The
+  // quoted section may contain whitespace.
+ NameInfo ConsumeQuotedSection() {
+ // Get past the first quote.
+ AdvanceCursor();
+ const char* at_sign_location = nullptr;
+
+ bool end_quote_found = false;
+ bool name_found = false;
+ UChar32 c;
+
+ int token_start = -1;
+
+ while (!end_quote_found && (iterator_.utf8_index() < text_end_)) {
+ c = iterator_.GetCurrentChar();
+
+ if (i18n_utils::IsAlphaNumeric(c)) {
+ name_found = true;
+
+ if (token_start == -1) {
+ // Start tracking the token.
+ token_start = iterator_.utf8_index();
+ }
+ AdvanceCursor();
+
+ } else {
+        // Non-alphabetic.
+ if (c == '\\') {
+ // A backslash, let's look at the next character.
+ CharacterIterator temp = iterator_;
+ temp.AdvanceToUtf32(iterator_.utf32_index() + 1);
+ UChar32 n = temp.GetCurrentChar();
+ if (i18n_utils::IsAlphaNumeric(n)) {
+ // The next character is alphabetic, skip the slash and don't end
+ // the last token. For quoted sections, the only things that are
+ // escaped are double quotes and slashes. For example, in "a\lex",
+ // an l appears after the slash. We want to treat this as if it was
+ // just "alex". So we tokenize it as <RFC822_NAME, "a\lex">.
+ AdvanceCursor();
+ } else {
+ // Not alphabetic, so save the last token if necessary.
+ if (token_start != -1) {
+ token_queue_.push_back(
+ Token(Token::Type::RFC822_NAME,
+ term_.substr(token_start,
+ iterator_.utf8_index() - token_start)));
+ token_start = -1;
+ }
+
+ // Skip the backslash.
+ AdvanceCursor();
+
+ if (n == '"' || n == '\\' || n == '@') {
+ // Skip these too if they're next.
+ AdvanceCursor();
+ }
+ }
+
+ } else {
+ // Not a backslash.
+
+ if (c == '@') {
+ // Mark the last @ sign.
+ at_sign_location = term_.data() + iterator_.utf8_index();
+ }
+
+ if (token_start != -1) {
+ token_queue_.push_back(
+ Token(Token::Type::RFC822_NAME,
+ term_.substr(token_start,
+ iterator_.utf8_index() - token_start)));
+ token_start = -1;
+ }
+
+ if (c == '"') {
+ end_quote_found = true;
+ }
+ // Advance one more time to get past the non-alphabetic character.
+ AdvanceCursor();
+ }
+ }
+ }
+ if (token_start != -1) {
+ token_queue_.push_back(Token(
+ Token::Type::RFC822_NAME,
+ term_.substr(token_start, iterator_.utf8_index() - token_start)));
+ }
+ return NameInfo(at_sign_location, name_found);
+ }
+
+ // '(', ')', '\\' chars should be escaped. All other escaped chars should be
+ // unescaped.
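+  //
+  // Illustrative example (mirroring the CommentEscapeTest test): consuming
+  // "(co\)mm\\en\(t)" emits <RFC822_COMMENT, "co">, <RFC822_COMMENT, "mm">,
+  // <RFC822_COMMENT, "en"> and <RFC822_COMMENT, "t">.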
+ void ConsumeParenthesizedSection() {
+ // Skip the initial (
+ AdvanceCursor();
+
+ int paren_layer = 1;
+ UChar32 c;
+
+ int token_start = -1;
+
+ while (paren_layer > 0 && (iterator_.utf8_index() < text_end_)) {
+ c = iterator_.GetCurrentChar();
+
+ if (i18n_utils::IsAlphaNumeric(c)) {
+ if (token_start == -1) {
+ // Start tracking a token.
+ token_start = iterator_.utf8_index();
+ }
+ AdvanceCursor();
+
+ } else {
+        // Not alphanumeric.
+ if (c == '\\') {
+ // A backslash, let's look at the next character.
+ UChar32 n = i18n_utils::GetUChar32At(term_.data(), term_.length(),
+ iterator_.utf8_index() + 1);
+ if (i18n_utils::IsAlphaNumeric(n)) {
+            // Alphanumeric; skip the slash and don't end the last token.
+ AdvanceCursor();
+ } else {
+            // Not alphanumeric; save the last token if necessary.
+ if (token_start != -1) {
+ token_queue_.push_back(
+ Token(Token::Type::RFC822_COMMENT,
+ term_.substr(token_start,
+ iterator_.utf8_index() - token_start)));
+ token_start = -1;
+ }
+
+ // Skip the backslash.
+ AdvanceCursor();
+
+ if (n == ')' || n == '(' || n == '\\') {
+ // Skip these too if they're next.
+ AdvanceCursor();
+ }
+ }
+ } else {
+ // Not a backslash.
+ if (token_start != -1) {
+ token_queue_.push_back(
+ Token(Token::Type::RFC822_COMMENT,
+ term_.substr(token_start,
+ iterator_.utf8_index() - token_start)));
+ token_start = -1;
+ }
+
+ if (c == '(') {
+ paren_layer++;
+ } else if (c == ')') {
+ paren_layer--;
+ }
+ AdvanceCursor();
+ }
+ }
+ }
+
+ if (token_start != -1) {
+ // Ran past the end of term_ without getting the last token.
+
+      // substr returns "a view of the substring [pos, pos + rcount), where
+      // rcount is the smaller of count and size() - pos", so the count
+      // argument can be any value >= iterator_.utf8_index() - token_start.
+      // Therefore, ignoring the mutation warning.
+ token_queue_.push_back(Token(
+ Token::Type::RFC822_COMMENT,
+ term_.substr(token_start, iterator_.utf8_index() - token_start)));
+ }
+ }
+
+ // Returns true if we find an address.
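+  //
+  // Illustrative example (mirroring the Simple test): consuming
+  // "<alex@google.com>" emits <RFC822_LOCAL_ADDRESS, "alex">,
+  // <RFC822_ADDRESS, "alex@google.com">,
+  // <RFC822_ADDRESS_COMPONENT_LOCAL, "alex">,
+  // <RFC822_ADDRESS_COMPONENT_HOST, "google"> and
+  // <RFC822_ADDRESS_COMPONENT_HOST, "com">.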
+ bool ConsumeAddress() {
+ // Skip the first <.
+ AdvanceCursor();
+
+ // Save the start position.
+ CharacterIterator address_start_iterator = iterator_;
+
+ int at_sign = -1;
+ int address_end = -1;
+
+ UChar32 c = iterator_.GetCurrentChar();
+ // Quick scan for @ and > signs.
+ while (c != '>' && iterator_.utf8_index() < text_end_) {
+ AdvanceCursor();
+ c = iterator_.GetCurrentChar();
+ if (c == '@') {
+ at_sign = iterator_.utf8_index();
+ }
+ }
+
+ if (iterator_.utf8_index() <= address_start_iterator.utf8_index()) {
+      // There is nothing between the brackets; either we have "<" or "<>".
+ return false;
+ }
+
+    // Either we found a '>' or we ran to the end; either way, this is the
+    // end of the address. The ending bracket will be handled by
+    // ConsumeUnquotedSection.
+ address_end = iterator_.utf8_index();
+
+ // Reset to the start.
+ iterator_ = address_start_iterator;
+
+ int address_start = address_start_iterator.utf8_index();
+
+ Token::Type type = Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL;
+
+ // Create a local address token.
+ if (at_sign != -1) {
+ token_queue_.push_back(
+ Token(Token::Type::RFC822_LOCAL_ADDRESS,
+ term_.substr(address_start, at_sign - address_start)));
+ } else {
+ // All the tokens in the address are host components.
+ type = Token::Type::RFC822_ADDRESS_COMPONENT_HOST;
+ }
+
+ token_queue_.push_back(
+ Token(Token::Type::RFC822_ADDRESS,
+ term_.substr(address_start, address_end - address_start)));
+
+ int token_start = -1;
+
+ while (iterator_.utf8_index() < address_end) {
+ c = iterator_.GetCurrentChar();
+
+ if (i18n_utils::IsAlphaNumeric(c)) {
+ if (token_start == -1) {
+ token_start = iterator_.utf8_index();
+ }
+
+ } else {
+      // Not alphanumeric.
+ if (c == '\\') {
+ // A backslash, let's look at the next character.
+ CharacterIterator temp = iterator_;
+ temp.AdvanceToUtf32(iterator_.utf32_index() + 1);
+ UChar32 n = temp.GetCurrentChar();
+ if (!i18n_utils::IsAlphaNumeric(n)) {
+          // Not alphanumeric; end the last token if necessary.
+ if (token_start != -1) {
+ token_queue_.push_back(Token(
+ type, term_.substr(token_start,
+ iterator_.utf8_index() - token_start)));
+ token_start = -1;
+ }
+ }
+ } else {
+          // Not a backslash.
+ if (token_start != -1) {
+ token_queue_.push_back(Token(
+ type, term_.substr(token_start,
+ iterator_.utf8_index() - token_start)));
+ token_start = -1;
+ }
+ // Switch to host component tokens.
+ if (iterator_.utf8_index() == at_sign) {
+ type = Token::Type::RFC822_ADDRESS_COMPONENT_HOST;
+ }
+ }
+ }
+ AdvanceCursor();
+ }
+ if (token_start != -1) {
+ token_queue_.push_back(Token(
+ type,
+ term_.substr(token_start, iterator_.utf8_index() - token_start)));
+ }
+    // ConsumeUnquotedSection will handle the closing bracket '>' if there is
+    // one.
+ return true;
+ }
+
+ Token GetToken() const override {
+ if (token_queue_.empty()) {
+ return Token(Token::Type::INVALID, term_);
+ }
+ return token_queue_.front();
+ }
+
+ private:
+ void AdvanceCursor() {
+ iterator_.AdvanceToUtf32(iterator_.utf32_index() + 1);
+ }
+
+ void AdvancePastWhitespace() {
+ while (i18n_utils::IsWhitespaceAt(term_, iterator_.utf8_index())) {
+ AdvanceCursor();
+ }
+ }
+
+ std::string_view term_;
+ CharacterIterator iterator_;
+ int text_end_;
+
+ // A temporary store of Tokens. As we advance through the provided string, we
+ // parse entire addresses at a time rather than one token at a time. However,
+ // since we call the tokenizer with Advance() alternating with GetToken(), we
+ // need to store tokens for subsequent GetToken calls if Advance generates
+ // multiple tokens (it usually does). A queue is used as we want the first
+ // token generated to be the first token returned from GetToken.
+ std::deque<Token> token_queue_;
+};
+
+libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
+Rfc822Tokenizer::Tokenize(std::string_view text) const {
+ return std::make_unique<Rfc822TokenIterator>(text);
+}
+
+libtextclassifier3::StatusOr<std::vector<Token>> Rfc822Tokenizer::TokenizeAll(
+ std::string_view text) const {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
+ Tokenize(text));
+ std::vector<Token> tokens;
+ while (iterator->Advance()) {
+ tokens.push_back(iterator->GetToken());
+ }
+ return tokens;
+}
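+
+// Illustrative usage (token order matches the Simple test in
+// rfc822-tokenizer_test.cc):
+//   Rfc822Tokenizer tokenizer;
+//   auto tokens_or = tokenizer.TokenizeAll("<alex@google.com>");
+//   // On success, *tokens_or holds RFC822_LOCAL_ADDRESS("alex"),
+//   // RFC822_ADDRESS("alex@google.com"), the address components, and
+//   // finally RFC822_TOKEN("<alex@google.com>").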
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/absl_ports/status_imports.h b/icing/tokenization/rfc822-tokenizer.h
index 3a97fd6..09e4624 100644
--- a/icing/absl_ports/status_imports.h
+++ b/icing/tokenization/rfc822-tokenizer.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Google LLC
+// Copyright (C) 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,21 +12,27 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_ABSL_PORTS_STATUS_IMPORTS_H_
-#define ICING_ABSL_PORTS_STATUS_IMPORTS_H_
+#ifndef ICING_TOKENIZATION_RFC822_TOKENIZER_H_
+#define ICING_TOKENIZATION_RFC822_TOKENIZER_H_
-#include "icing/text_classifier/lib3/utils/base/status.h"
+#include <vector>
+
+#include "icing/tokenization/tokenizer.h"
namespace icing {
namespace lib {
-namespace absl_ports {
-// TODO(b/144458732) Delete this file once visibility on TC3 Status has been
-// granted to the sample app.
-using Status = libtextclassifier3::Status;
+class Rfc822Tokenizer : public Tokenizer {
+ public:
+ libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize(
+ std::string_view text) const override;
+
+ libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
+ std::string_view text) const override;
+};
-} // namespace absl_ports
} // namespace lib
} // namespace icing
-#endif // ICING_ABSL_PORTS_STATUS_IMPORTS_H_
+#endif // ICING_TOKENIZATION_RFC822_TOKENIZER_H_
diff --git a/icing/tokenization/rfc822-tokenizer_test.cc b/icing/tokenization/rfc822-tokenizer_test.cc
new file mode 100644
index 0000000..e3c6da6
--- /dev/null
+++ b/icing/tokenization/rfc822-tokenizer_test.cc
@@ -0,0 +1,797 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/rfc822-tokenizer.h"
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/jni-test-helpers.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::ElementsAre;
+
+class Rfc822TokenizerTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ jni_cache_ = GetTestJniCache();
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+ }
+ std::unique_ptr<const JniCache> jni_cache_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
+};
+
+TEST_F(Rfc822TokenizerTest, Simple) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string_view s("<你alex@google.com>");
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "你alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "你alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "你alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "<你alex@google.com>"))));
+}
+
+TEST_F(Rfc822TokenizerTest, Small) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string_view s("\"a\"");
+
+ EXPECT_THAT(rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "a"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "\"a\""),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "a"))));
+
+ s = "\"a\", \"b\"";
+
+ EXPECT_THAT(rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "a"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "\"a\""),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "a"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "b"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "\"b\""),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "b"))));
+
+ s = "(a)";
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::RFC822_COMMENT, "a"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "(a)"))));
+}
+
+TEST_F(Rfc822TokenizerTest, PB) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string_view s("peanut (comment) butter, <alex@google.com>");
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "peanut"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "butter"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "peanut (comment) butter"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "peanut (comment) butter"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "<alex@google.com>"))));
+}
+
+TEST_F(Rfc822TokenizerTest, NoBrackets) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string_view s("alex@google.com");
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex@google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alex"))));
+}
+
+TEST_F(Rfc822TokenizerTest, TwoAddresses) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string_view s("<你alex@google.com>; <alexsav@gmail.com>");
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "你alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "你alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "你alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "<你alex@google.com>"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alexsav"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alexsav@gmail.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alexsav"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "gmail"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "<alexsav@gmail.com>"))));
+}
+
+TEST_F(Rfc822TokenizerTest, CommentB) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string_view s("(a comment) <alex@google.com>");
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_COMMENT, "a"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN,
+ "(a comment) <alex@google.com>"))));
+}
+
+TEST_F(Rfc822TokenizerTest, NameAndComment) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string_view s("\"a name\" also a name <alex@google.com>");
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_NAME, "a"),
+ EqualsToken(Token::Type::RFC822_NAME, "name"),
+ EqualsToken(Token::Type::RFC822_NAME, "also"),
+ EqualsToken(Token::Type::RFC822_NAME, "a"),
+ EqualsToken(Token::Type::RFC822_NAME, "name"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN,
+ "\"a name\" also a name <alex@google.com>"))));
+}
+
+// Test from tokenizer_test.cc.
+TEST_F(Rfc822TokenizerTest, Rfc822SanityCheck) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string addr1("A name (A comment) <address@domain.com>");
+ std::string addr2(
+ "\"(Another name)\" (A different comment) "
+ "<bob-loblaw@foo.bar.com>");
+ std::string addr3("<no.at.sign.present>");
+ std::string addr4("<double@at@signs.present>");
+ std::string rfc822 = addr1 + ", " + addr2 + ", " + addr3 + ", " + addr4;
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(rfc822),
+ IsOkAndHolds(ElementsAre(
+
+ EqualsToken(Token::Type::RFC822_NAME, "A"),
+ EqualsToken(Token::Type::RFC822_NAME, "name"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "A"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "address@domain.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "domain"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, addr1),
+
+ EqualsToken(Token::Type::RFC822_NAME, "Another"),
+ EqualsToken(Token::Type::RFC822_NAME, "name"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "A"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "different"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "bob-loblaw"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "bob-loblaw@foo.bar.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "bob"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "loblaw"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "bar"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, addr2),
+
+ EqualsToken(Token::Type::RFC822_ADDRESS, "no.at.sign.present"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "no"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "at"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "sign"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "present"),
+ EqualsToken(Token::Type::RFC822_TOKEN, addr3),
+
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "double@at"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "double@at@signs.present"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "double"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "at"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "signs"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "present"),
+ EqualsToken(Token::Type::RFC822_TOKEN, addr4))));
+}
+
+// Tests from rfc822 converter.
+TEST_F(Rfc822TokenizerTest, SimpleRfcText) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string test_string =
+ "foo@google.com,bar@google.com,baz@google.com,foo+hello@google.com,baz@"
+ "corp.google.com";
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(test_string),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "foo@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "foo@google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "bar"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "bar@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "bar@google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "bar"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "baz"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "baz@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "baz@google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "baz"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "hello"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "foo+hello@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "foo+hello@google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "foo+hello"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "baz"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "corp"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "baz@corp.google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "baz@corp.google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "baz"))));
+}
+
+TEST_F(Rfc822TokenizerTest, ComplicatedRfcText) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string test_string =
+ R"raw("Weird, But&(Also)\\Valid" Name (!With, "an" \\odd\\ cmt too¡) <Foo B(a)r,Baz@g.co>
+ <easy@google.com>)raw";
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(test_string),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_NAME, "Weird"),
+ EqualsToken(Token::Type::RFC822_NAME, "But"),
+ EqualsToken(Token::Type::RFC822_NAME, "Also"),
+ EqualsToken(Token::Type::RFC822_NAME, "Valid"),
+ EqualsToken(Token::Type::RFC822_NAME, "Name"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "With"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "an"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "odd"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "cmt"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "too"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "Foo B(a)r,Baz"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "Foo B(a)r,Baz@g.co"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "Foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "B"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "a"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "r"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "Baz"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "g"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "co"),
+ EqualsToken(
+ Token::Type::RFC822_TOKEN,
+ R"raw("Weird, But&(Also)\\Valid" Name (!With, "an" \\odd\\ cmt too¡) <Foo B(a)r,Baz@g.co>)raw"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "easy"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "easy@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "easy"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "<easy@google.com>"))));
+}
+
+TEST_F(Rfc822TokenizerTest, FromHtmlBugs) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+  // This input used to cause an HTML parsing exception. We don't do HTML
+  // parsing any more (b/8388100), so we are just checking that it does not
+  // crash and that it retains the input.
+
+  // http://b/8988210. Put the crashing string "&\r" x 100 into the name and
+  // comment fields of an rfc822 token.
+
+ std::string s("\"");
+ for (int i = 0; i < 100; i++) {
+ s.append("&\r");
+ }
+ s.append("\" (");
+ for (int i = 0; i < 100; i++) {
+ s.append("&\r");
+ }
+ s.append(") <foo@google.com>");
+
+ // It shouldn't change anything
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "foo@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, s))));
+}
+
+TEST_F(Rfc822TokenizerTest, EmptyComponentsTest) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ EXPECT_THAT(rfc822_tokenizer.TokenizeAll(""),
+ IsOkAndHolds(testing::IsEmpty()));
+
+ // Name is considered the address if address is empty.
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll("name<>"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "name<>"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"))));
+
+ // Empty name and address means that there is no token.
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll("(a long comment with nothing else)"),
+ IsOkAndHolds(
+ ElementsAre(EqualsToken(Token::Type::RFC822_COMMENT, "a"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "long"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "with"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "nothing"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "else"),
+ EqualsToken(Token::Type::RFC822_TOKEN,
+ "(a long comment with nothing else)"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll("name ()"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "name ()"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"))));
+
+ EXPECT_THAT(rfc822_tokenizer.TokenizeAll(R"((comment) "")"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "(comment) \"\""))));
+}
+
+TEST_F(Rfc822TokenizerTest, NameTest) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+  // Name spread around an address or comment.
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll("peanut <address> butter"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_NAME, "peanut"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "address"),
+ EqualsToken(Token::Type::RFC822_NAME, "butter"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "peanut <address> butter"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll("peanut (comment) butter"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "peanut"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "butter"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "peanut (comment) butter"),
+ EqualsToken(Token::Type::RFC822_ADDRESS,
+ "peanut (comment) butter"))));
+
+ // Dropping quotes when they're not needed.
+ std::string s = R"(peanut <address> "butter")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_NAME, "peanut"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "address"),
+ EqualsToken(Token::Type::RFC822_NAME, "butter"),
+ EqualsToken(Token::Type::RFC822_TOKEN, s))));
+
+ s = R"(peanut "butter")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "peanut"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "butter"),
+ EqualsToken(Token::Type::RFC822_TOKEN, s),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "peanut \"butter"))));
+ // Adding quotes when they are needed.
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll("ple@se quote this <addr>"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_NAME, "ple"),
+ EqualsToken(Token::Type::RFC822_NAME, "se"),
+ EqualsToken(Token::Type::RFC822_NAME, "quote"),
+ EqualsToken(Token::Type::RFC822_NAME, "this"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "addr"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "addr"),
+
+ EqualsToken(Token::Type::RFC822_TOKEN, "ple@se quote this <addr>"))));
+}
+
+TEST_F(Rfc822TokenizerTest, CommentEscapeTest) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ // '(', ')', '\\' chars should be escaped. All other escaped chars should be
+ // unescaped.
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"((co\)mm\\en\(t))"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_COMMENT, "co"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "mm"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "en"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "t"),
+ EqualsToken(Token::Type::RFC822_TOKEN, R"((co\)mm\\en\(t))"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"((c\om\ment) name)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_COMMENT, R"(c\om\ment)"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_TOKEN, R"((c\om\ment) name)"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"((co(m\))ment) name)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_COMMENT, "co"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "m"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "ment"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_TOKEN, R"((co(m\))ment) name)"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"))));
+}
+
+TEST_F(Rfc822TokenizerTest, QuoteEscapeTest) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ // All names that include non-alphanumeric chars must be quoted and have '\\'
+ // and '"' chars escaped.
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"(n\\a\me <addr>)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_NAME, "n"),
+ EqualsToken(Token::Type::RFC822_NAME, "a"),
+ EqualsToken(Token::Type::RFC822_NAME, "me"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "addr"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "addr"),
+ EqualsToken(Token::Type::RFC822_TOKEN, R"(n\\a\me <addr>)"))));
+
+ // Names that are within quotes should have all characters blindly unescaped.
+ // When a name is made into an address, it isn't re-escaped.
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"("n\\a\m\"e")"),
+ // <n\am"e>
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "n"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "a\\m"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "e"),
+ EqualsToken(Token::Type::RFC822_TOKEN, R"("n\\a\m\"e")"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, R"(n\\a\m\"e)"))));
+}
+
+TEST_F(Rfc822TokenizerTest, UnterminatedComponentTest) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll("name (comment"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "name (comment"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"(half of "the name)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "half"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "of"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "the"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "half of \"the name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "half of \"the name"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"("name\)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "\"name\\"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"(name (comment\)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "name (comment\\"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"(<addr> "name\)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS, "addr"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "addr"),
+ EqualsToken(Token::Type::RFC822_NAME, "name"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "<addr> \"name\\"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"(name (comment\))"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_TOKEN, R"(name (comment\))"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"))));
+}
+
+TEST_F(Rfc822TokenizerTest, Tokenize) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string text =
+ R"raw("Berg" (home) <berg\@google.com>, tom\@google.com (work))raw";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_NAME, "Berg"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "home"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "berg\\"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "berg\\@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "berg"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN,
+ R"("Berg" (home) <berg\@google.com>)"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "tom"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "work"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "tom\\@google.com (work)"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "tom\\@google.com"))));
+
+ text = R"raw(Foo Bar (something) <foo\@google.com>, )raw"
+ R"raw(blah\@google.com (something))raw";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_NAME, "Foo"),
+ EqualsToken(Token::Type::RFC822_NAME, "Bar"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "something"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "foo\\"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "foo\\@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN,
+ "Foo Bar (something) <foo\\@google.com>"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "blah"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "something"),
+ EqualsToken(Token::Type::RFC822_TOKEN,
+ "blah\\@google.com (something)"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "blah\\@google.com"))));
+}
+
+TEST_F(Rfc822TokenizerTest, EdgeCases) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+  // Text to trigger the scenario where a non-alphanumeric character is
+  // followed by a backslash and then another non-alphanumeric character,
+  // ending an in-address token.
+ std::string text = R"raw(<be.\&rg@google.com>)raw";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "be.\\&rg"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "be.\\&rg@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "be"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "rg"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN,
+ R"raw(<be.\&rg@google.com>)raw"))));
+
+  // A backslash followed by an alphanumeric character shouldn't end the
+  // token.
+ text = "<a\\lex@google.com>";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "a\\lex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "a\\lex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "a\\lex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "<a\\lex@google.com>"))));
+
+ // \\ or \" in a quoted section.
+ text = R"("al\\ex@goo\"<idk>gle.com")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "al"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "ex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "goo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "idk"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "gle"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN,
+ R"("al\\ex@goo\"<idk>gle.com")"),
+ EqualsToken(Token::Type::RFC822_ADDRESS,
+ R"(al\\ex@goo\"<idk>gle.com)"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "al\\\\ex"))));
+
+ text = "<alex@google.com";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "<alex@google.com"))));
+}
+
+TEST_F(Rfc822TokenizerTest, NumberInAddress) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "<3alex@google.com>";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "3alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "3alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "3alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "<3alex@google.com>"))));
+}
+
+TEST_F(Rfc822TokenizerTest, DoubleQuoteDoubleSlash) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = R"("alex\"")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "alex"),
+ EqualsToken(Token::Type::RFC822_TOKEN, text),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex"))));
+
+ text = R"("alex\\\a")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "a"),
+ EqualsToken(Token::Type::RFC822_TOKEN, text),
+ EqualsToken(Token::Type::RFC822_ADDRESS, R"(alex\\\a)"))));
+}
+
+TEST_F(Rfc822TokenizerTest, TwoEmails) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "tjbarron@google.com alexsav@google.com";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "tjbarron"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alexsav"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, text),
+ EqualsToken(Token::Type::RFC822_ADDRESS, text),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS,
+ "tjbarron@google.com alexsav"))));
+}
+
+TEST_F(Rfc822TokenizerTest, BackSlashes) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = R"("\name")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "\"\\name\""),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"))));
+
+ text = R"("name@foo\@gmail")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "gmail"),
+ EqualsToken(Token::Type::RFC822_TOKEN, text),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name@foo\\@gmail"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "name"))));
+}
+
+TEST_F(Rfc822TokenizerTest, BigWhitespace) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "\"quoted\" <address>";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_NAME, "quoted"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "address"),
+ EqualsToken(Token::Type::RFC822_TOKEN, text))));
+}
+
+TEST_F(Rfc822TokenizerTest, AtSignFirst) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "\"@foo\"";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "foo"),
+ EqualsToken(Token::Type::RFC822_TOKEN, text),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "foo"))));
+}
+
+TEST_F(Rfc822TokenizerTest, SlashThenUnicode) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = R"("quoted\你cjk")";
+ EXPECT_THAT(rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST,
+ "quoted\\你cjk"),
+ EqualsToken(Token::Type::RFC822_TOKEN, text),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "quoted\\你cjk"))));
+}
+
+TEST_F(Rfc822TokenizerTest, AddressEmptyAddress) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "<address> <> Name";
+ EXPECT_THAT(rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_ADDRESS, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST,
+ "address"),
+ EqualsToken(Token::Type::RFC822_NAME, "Name"),
+ EqualsToken(Token::Type::RFC822_TOKEN, text))));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/token.h b/icing/tokenization/token.h
index 0c268be..24f567b 100644
--- a/icing/tokenization/token.h
+++ b/icing/tokenization/token.h
@@ -29,6 +29,15 @@ struct Token {
VERBATIM, // A token that should be indexed and searched without any
// modifications to the raw text
+  // An RFC822 section, whose full content is the RFC822_TOKEN, tokenizes as
+  // follows:
+ RFC822_NAME, // "User", "Johnsson"
+ RFC822_COMMENT, // "A", "comment", "here"
+ RFC822_LOCAL_ADDRESS, // "user.name"
+ RFC822_ADDRESS, // "user.name@domain.name.com"
+ RFC822_ADDRESS_COMPONENT_LOCAL, // "user", "name",
+ RFC822_ADDRESS_COMPONENT_HOST, // "domain", "name", "com"
+ RFC822_TOKEN, // "User Johnsson (A comment) <user.name@domain.name.com>"
+
// Types only used in raw query
QUERY_OR, // Indicates OR logic between its left and right tokens
QUERY_EXCLUSION, // Indicates exclusion operation on next token
@@ -45,10 +54,10 @@ struct Token {
: type(type_in), text(text_in) {}
// The type of token
- const Type type;
+ Type type;
// The content of token
- const std::string_view text;
+ std::string_view text;
};
} // namespace lib
diff --git a/icing/util/clock.h b/icing/util/clock.h
index 2bb7818..9e57854 100644
--- a/icing/util/clock.h
+++ b/icing/util/clock.h
@@ -16,6 +16,7 @@
#define ICING_UTIL_CLOCK_H_
#include <cstdint>
+#include <functional>
#include <memory>
namespace icing {
@@ -69,6 +70,32 @@ class Clock {
virtual std::unique_ptr<Timer> GetNewTimer() const;
};
+// A convenient RAII timer class that receives a callback. Upon destruction, the
+// callback will be called with the elapsed milliseconds or nanoseconds passed
+// as a parameter, depending on which Unit was passed in the constructor.
+class ScopedTimer {
+ public:
+ enum class Unit { kMillisecond, kNanosecond };
+
+ ScopedTimer(std::unique_ptr<Timer> timer,
+ std::function<void(int64_t)> callback,
+ Unit unit = Unit::kMillisecond)
+ : timer_(std::move(timer)), callback_(std::move(callback)), unit_(unit) {}
+
+ ~ScopedTimer() {
+ if (unit_ == Unit::kMillisecond) {
+ callback_(timer_->GetElapsedMilliseconds());
+ } else {
+ callback_(timer_->GetElapsedNanoseconds());
+ }
+ }
+
+ private:
+ std::unique_ptr<Timer> timer_;
+ std::function<void(int64_t)> callback_;
+ Unit unit_;
+};
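+
+// Illustrative usage (a sketch; `clock` and `DoWork` are hypothetical):
+//   int64_t latency_ms = 0;
+//   {
+//     ScopedTimer timer(clock.GetNewTimer(),
+//                       [&latency_ms](int64_t ms) { latency_ms = ms; });
+//     DoWork();
+//   }  // The callback fires here with the elapsed milliseconds.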
+
} // namespace lib
} // namespace icing
diff --git a/icing/util/crc32.h b/icing/util/crc32.h
index 5befe44..207a80a 100644
--- a/icing/util/crc32.h
+++ b/icing/util/crc32.h
@@ -35,6 +35,8 @@ class Crc32 {
explicit Crc32(uint32_t init_crc) : crc_(init_crc) {}
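+  // Convenience constructor: computes the checksum of `str` starting from an
+  // initial crc of 0, e.g. Crc32("foo").Get().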
+ explicit Crc32(std::string_view str) : crc_(0) { Append(str); }
+
inline bool operator==(const Crc32& other) const {
return crc_ == other.Get();
}
diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
index c690990..b55cfd1 100644
--- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
+++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
@@ -17,6 +17,7 @@ package com.google.android.icing;
import static com.google.common.truth.Truth.assertThat;
import static com.google.common.truth.Truth.assertWithMessage;
+import com.google.android.icing.IcingSearchEngine;
import com.google.android.icing.proto.DebugInfoResultProto;
import com.google.android.icing.proto.DebugInfoVerbosity;
import com.google.android.icing.proto.DeleteByNamespaceResultProto;
@@ -60,7 +61,6 @@ import com.google.android.icing.proto.SuggestionSpecProto.SuggestionScoringSpecP
import com.google.android.icing.proto.TermMatchType;
import com.google.android.icing.proto.TermMatchType.Code;
import com.google.android.icing.proto.UsageReport;
-import com.google.android.icing.IcingSearchEngine;
import java.io.File;
import java.util.HashMap;
import java.util.Map;
diff --git a/proto/icing/proto/optimize.proto b/proto/icing/proto/optimize.proto
index 42290f3..0accb9a 100644
--- a/proto/icing/proto/optimize.proto
+++ b/proto/icing/proto/optimize.proto
@@ -63,7 +63,7 @@ message GetOptimizeInfoResultProto {
optional int64 time_since_last_optimize_ms = 4;
}
-// Next tag: 10
+// Next tag: 11
message OptimizeStatsProto {
// Overall time used for the function call.
optional int32 latency_ms = 1;
@@ -91,4 +91,15 @@ message OptimizeStatsProto {
// The amount of time since the last optimize ran.
optional int64 time_since_last_optimize_ms = 9;
+
+ enum IndexRestorationMode {
+ // The index has been translated in place to match the optimized document
+ // store.
+ INDEX_TRANSLATION = 0;
+    // The index has been rebuilt from scratch during optimization. This could
+    // happen when we receive a DATA_LOSS error from OptimizeDocumentStore,
+    // when Index::Optimize fails, or when rebuilding would be faster.
+ FULL_INDEX_REBUILD = 1;
+ }
+ optional IndexRestorationMode index_restoration_mode = 10;
}
diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto
index f005c76..7a361d3 100644
--- a/proto/icing/proto/search.proto
+++ b/proto/icing/proto/search.proto
@@ -65,7 +65,7 @@ message SearchSpecProto {
// Client-supplied specifications on what to include/how to format the search
// results.
-// Next tag: 6
+// Next tag: 7
message ResultSpecProto {
// The results will be returned in pages, and num_per_page specifies the
// number of documents in one page.
@@ -133,6 +133,15 @@ message ResultSpecProto {
// ["ns0doc0", "ns0doc1", "ns1doc0", "ns3doc0", "ns3doc1", "ns2doc1",
// "ns3doc2"].
repeated ResultGrouping result_groupings = 5;
+
+  // The threshold of total bytes across all documents in a page, used to
+  // limit the number of bytes in a single page.
+  // Note that this doesn't guarantee the resulting number of bytes will be
+  // smaller than, equal to, or larger than the threshold. It is just a
+  // cutoff, and only guarantees that the total bytes of the search results
+  // will exceed the threshold by less than the size of the final result.
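+  // For example, with a threshold of 100 bytes and 60-byte documents, a page
+  // may contain two documents (120 bytes): the threshold is crossed by the
+  // second document, and the 20-byte overshoot is smaller than that final
+  // 60-byte result.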
+ optional int32 num_total_bytes_per_page_threshold = 6
+ [default = 2147483647]; // INT_MAX
}
// The representation of a single match within a DocumentProto property.
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 305f410..cd00254 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=455217954)
+set(synced_AOSP_CL_number=466546985)