aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--icing/file/memory-mapped-file-leak_test.cc78
-rw-r--r--icing/file/posting_list/flash-index-storage-header.h (renamed from icing/index/main/flash-index-storage-header.h)6
-rw-r--r--icing/file/posting_list/flash-index-storage.cc (renamed from icing/index/main/flash-index-storage.cc)56
-rw-r--r--icing/file/posting_list/flash-index-storage.h (renamed from icing/index/main/flash-index-storage.h)32
-rw-r--r--icing/file/posting_list/flash-index-storage_test.cc (renamed from icing/index/main/flash-index-storage_test.cc)125
-rw-r--r--icing/file/posting_list/index-block.cc (renamed from icing/index/main/index-block.cc)78
-rw-r--r--icing/file/posting_list/index-block.h (renamed from icing/index/main/index-block.h)47
-rw-r--r--icing/file/posting_list/index-block_test.cc (renamed from icing/index/main/index-block_test.cc)179
-rw-r--r--icing/file/posting_list/posting-list-common.h35
-rw-r--r--icing/file/posting_list/posting-list-free.h (renamed from icing/index/main/posting-list-free.h)54
-rw-r--r--icing/file/posting_list/posting-list-free_test.cc (renamed from icing/index/main/posting-list-free_test.cc)86
-rw-r--r--icing/file/posting_list/posting-list-identifier.cc (renamed from icing/index/main/posting-list-identifier.cc)4
-rw-r--r--icing/file/posting_list/posting-list-identifier.h (renamed from icing/index/main/posting-list-identifier.h)17
-rw-r--r--icing/file/posting_list/posting-list-used.cc56
-rw-r--r--icing/file/posting_list/posting-list-used.h143
-rw-r--r--icing/file/posting_list/posting-list-utils.cc (renamed from icing/index/main/posting-list-utils.cc)23
-rw-r--r--icing/file/posting_list/posting-list-utils.h (renamed from icing/index/main/posting-list-utils.h)22
-rw-r--r--icing/icing-search-engine-with-icu-file_test.cc13
-rw-r--r--icing/icing-search-engine_backwards_compatibility_test.cc46
-rw-r--r--icing/icing-search-engine_fuzz_test.cc10
-rw-r--r--icing/icing-search-engine_test.cc361
-rw-r--r--icing/index/hit/hit.cc36
-rw-r--r--icing/index/hit/hit.h73
-rw-r--r--icing/index/hit/hit_test.cc69
-rw-r--r--icing/index/index-processor.cc2
-rw-r--r--icing/index/index-processor_test.cc291
-rw-r--r--icing/index/iterator/doc-hit-info-iterator-and.cc7
-rw-r--r--icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc10
-rw-r--r--icing/index/main/doc-hit-info-iterator-term-main.cc3
-rw-r--r--icing/index/main/main-index-merger.cc3
-rw-r--r--icing/index/main/main-index.cc67
-rw-r--r--icing/index/main/main-index.h10
-rw-r--r--icing/index/main/main-index_test.cc1
-rw-r--r--icing/index/main/posting-list-accessor.cc58
-rw-r--r--icing/index/main/posting-list-accessor.h18
-rw-r--r--icing/index/main/posting-list-accessor_test.cc236
-rw-r--r--icing/index/main/posting-list-used-hit-serializer.cc (renamed from icing/index/main/posting-list-used.cc)489
-rw-r--r--icing/index/main/posting-list-used-hit-serializer.h (renamed from icing/index/main/posting-list-used.h)237
-rw-r--r--icing/index/main/posting-list-used-hit-serializer_test.cc (renamed from icing/index/main/posting-list-used_test.cc)484
-rw-r--r--icing/monkey_test/icing-monkey-test-runner.cc442
-rw-r--r--icing/monkey_test/icing-monkey-test-runner.h71
-rw-r--r--icing/monkey_test/icing-search-engine_monkey_test.cc30
-rw-r--r--icing/monkey_test/in-memory-icing-search-engine.cc113
-rw-r--r--icing/monkey_test/in-memory-icing-search-engine.h39
-rw-r--r--icing/monkey_test/monkey-test-generators.cc5
-rw-r--r--icing/monkey_test/monkey-test-generators.h2
-rw-r--r--icing/monkey_test/monkey-tokenized-document.h2
-rw-r--r--icing/query/query-processor.cc15
-rw-r--r--icing/query/query-processor_test.cc57
-rw-r--r--icing/result/result-retriever-v2_projection_test.cc33
-rw-r--r--icing/result/result-retriever-v2_snippet_test.cc33
-rw-r--r--icing/result/result-retriever-v2_test.cc33
-rw-r--r--icing/result/result-retriever_test.cc33
-rw-r--r--icing/result/snippet-retriever.cc42
-rw-r--r--icing/result/snippet-retriever_test.cc414
-rw-r--r--icing/schema-builder.h50
-rw-r--r--icing/schema/schema-store.cc8
-rw-r--r--icing/schema/schema-store.h16
-rw-r--r--icing/schema/schema-store_test.cc172
-rw-r--r--icing/schema/schema-util_test.cc220
-rw-r--r--icing/schema/section-manager.cc246
-rw-r--r--icing/schema/section-manager.h43
-rw-r--r--icing/schema/section-manager_test.cc615
-rw-r--r--icing/schema/section.h67
-rw-r--r--icing/scoring/priority-queue-scored-document-hits-ranker.h9
-rw-r--r--icing/scoring/scored-document-hit.h9
-rw-r--r--icing/scoring/scorer.cc4
-rw-r--r--icing/scoring/scorer_test.cc6
-rw-r--r--icing/scoring/scoring-processor_test.cc6
-rw-r--r--icing/store/document-id.h3
-rw-r--r--icing/store/document-store_benchmark.cc31
-rw-r--r--icing/store/document-store_test.cc121
-rw-r--r--icing/tokenization/token.h2
-rw-r--r--icing/tokenization/tokenizer-factory.cc11
-rw-r--r--icing/util/document-validator_test.cc11
-rw-r--r--icing/util/tokenized-document.cc6
-rw-r--r--proto/icing/proto/schema.proto42
-rw-r--r--proto/icing/proto/scoring.proto3
-rw-r--r--proto/icing/proto/search.proto58
-rw-r--r--synced_AOSP_CL_number.txt2
80 files changed, 4321 insertions, 2369 deletions
diff --git a/icing/file/memory-mapped-file-leak_test.cc b/icing/file/memory-mapped-file-leak_test.cc
deleted file mode 100644
index ff031df..0000000
--- a/icing/file/memory-mapped-file-leak_test.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "perftools/profiles/collector/heap/alloc_recorder.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/file/filesystem.h"
-#include "icing/file/memory-mapped-file.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/recorder-test-utils.h"
-#include "icing/testing/tmp-directory.h"
-
-namespace icing {
-namespace lib {
-namespace {
-
-namespace heap_profile = ::perftools::profiles::collector::heap;
-
-using testing::Le;
-
-TEST(MemoryMappedFileTest, MMapMemoryLeak) {
- std::string test_dir = GetTestTempDir();
- std::string recorder_dir = test_dir + "/recorder";
- Filesystem filesystem;
- ASSERT_TRUE(filesystem.CreateDirectoryRecursively(recorder_dir.c_str()));
-
- ASSERT_TRUE(heap_profile::AllocRecorderStartWithMmapTracking(recorder_dir));
- {
- std::string mmfile_dir = test_dir + "/file";
- ASSERT_TRUE(filesystem.CreateDirectoryRecursively(mmfile_dir.c_str()));
-
- // Don't use ICING_ASSERT_OK_AND_ASSIGN or matcher IsOk to prevent
- // unnecessary implicit heap memory allocation in these macros.
- libtextclassifier3::StatusOr<MemoryMappedFile> mmfile_or =
- MemoryMappedFile::Create(filesystem, mmfile_dir + "/mmfile",
- MemoryMappedFile::READ_WRITE_AUTO_SYNC);
- ASSERT_TRUE(mmfile_or.ok());
- MemoryMappedFile mmfile = std::move(mmfile_or).ValueOrDie();
-
- // How this works:
- // We request a 500-byte mapping starting at the 101st byte of the file.
- // But(!), mmap only accepts offsets that are multiples of page size. So
- // instead mmfile will create a 600-byte mapping starting at the 1st byte of
- // file and then return the address of the 101st byte within that mapping.
- // For this reason, total bytes and peak bytes will be 600 bytes.
- //
- // When mmfile goes out of scope it needs to munmap the mapping that it
- // created. But, remember that the mapping is larger (600 bytes) than what
- // we requested (500 bytes)! So mmfile needs to remember the actual size of
- // the mapping, NOT the requested size. Calling munmap with the correct size
- // will ensure that total_inuse_bytes is 0 after mmfile goes out of scope.
- // Calling munmap with the requested size would still keep 100 bytes of the
- // mapping around!
- mmfile.Remap(100, 500);
- }
- heap_profile::AllocRecorderStop();
-
- // Mmap only affects bytes measurements.
- ProfileInfo profile_info = SummarizeProfileProto(recorder_dir + ".0.pb.gz");
- EXPECT_THAT(profile_info.total_alloc_bytes, Le(600));
- EXPECT_THAT(profile_info.peak_bytes, Le(600));
- EXPECT_THAT(profile_info.inuse_bytes, Le(0));
-}
-
-} // namespace
-} // namespace lib
-} // namespace icing
diff --git a/icing/index/main/flash-index-storage-header.h b/icing/file/posting_list/flash-index-storage-header.h
index 71ec816..6bbf1ba 100644
--- a/icing/index/main/flash-index-storage-header.h
+++ b/icing/file/posting_list/flash-index-storage-header.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_
-#define ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_
+#ifndef ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_HEADER_H_
+#define ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_HEADER_H_
#include <cstdint>
#include <memory>
@@ -119,4 +119,4 @@ static_assert(16 == sizeof(HeaderBlock::Header),
} // namespace lib
} // namespace icing
-#endif // ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_
+#endif // ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_HEADER_H_
diff --git a/icing/index/main/flash-index-storage.cc b/icing/file/posting_list/flash-index-storage.cc
index 33dacf9..f74bc55 100644
--- a/icing/index/main/flash-index-storage.cc
+++ b/icing/file/posting_list/flash-index-storage.cc
@@ -12,23 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/main/flash-index-storage.h"
+#include "icing/file/posting_list/flash-index-storage.h"
#include <sys/types.h>
#include <algorithm>
#include <cerrno>
-#include <cinttypes>
#include <cstdint>
#include <memory>
-#include <unordered_set>
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
-#include "icing/file/memory-mapped-file.h"
-#include "icing/index/main/index-block.h"
-#include "icing/index/main/posting-list-free.h"
-#include "icing/index/main/posting-list-utils.h"
+#include "icing/file/posting_list/index-block.h"
+#include "icing/file/posting_list/posting-list-common.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/util/logging.h"
#include "icing/util/math-util.h"
@@ -55,9 +51,9 @@ uint32_t SelectBlockSize() {
libtextclassifier3::StatusOr<FlashIndexStorage> FlashIndexStorage::Create(
const std::string& index_filename, const Filesystem* filesystem,
- bool in_memory) {
+ PostingListUsedSerializer* serializer, bool in_memory) {
ICING_RETURN_ERROR_IF_NULL(filesystem);
- FlashIndexStorage storage(index_filename, filesystem, in_memory);
+ FlashIndexStorage storage(index_filename, filesystem, serializer, in_memory);
if (!storage.Init()) {
return absl_ports::InternalError(
"Unable to successfully read header block!");
@@ -67,10 +63,12 @@ libtextclassifier3::StatusOr<FlashIndexStorage> FlashIndexStorage::Create(
FlashIndexStorage::FlashIndexStorage(const std::string& index_filename,
const Filesystem* filesystem,
+ PostingListUsedSerializer* serializer,
bool has_in_memory_freelists)
: index_filename_(index_filename),
num_blocks_(0),
filesystem_(filesystem),
+ serializer_(serializer),
has_in_memory_freelists_(has_in_memory_freelists) {}
FlashIndexStorage::~FlashIndexStorage() {
@@ -127,13 +125,16 @@ bool FlashIndexStorage::CreateHeader() {
// Work down from the largest posting list that fits in
// block_size. We don't care about locality of blocks because this
// is a flash index.
- for (uint32_t posting_list_bytes =
- IndexBlock::CalculateMaxPostingListBytes(block_size);
- posting_list_bytes >= posting_list_utils::min_posting_list_size();
+ for (uint32_t posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
+ block_size, serializer_->GetDataTypeBytes());
+ posting_list_bytes >= serializer_->GetMinPostingListSize();
posting_list_bytes /= 2) {
uint32_t aligned_posting_list_bytes =
- (posting_list_bytes / sizeof(Hit) * sizeof(Hit));
- ICING_VLOG(1) << "Block size " << header_block_->header()->num_index_block_infos << ": " << aligned_posting_list_bytes;
+ (posting_list_bytes / serializer_->GetDataTypeBytes()) *
+ serializer_->GetDataTypeBytes();
+ ICING_VLOG(1) << "Block size "
+ << header_block_->header()->num_index_block_infos << ": "
+ << aligned_posting_list_bytes;
// Initialize free list to empty.
HeaderBlock::Header::IndexBlockInfo* block_info =
@@ -167,18 +168,22 @@ bool FlashIndexStorage::OpenHeader(int64_t file_size) {
return false;
}
if (file_size % read_header.header()->block_size != 0) {
- ICING_LOG(ERROR) << "Index size " << file_size << " not a multiple of block size " << read_header.header()->block_size;
+ ICING_LOG(ERROR) << "Index size " << file_size
+ << " not a multiple of block size "
+ << read_header.header()->block_size;
return false;
}
if (file_size < static_cast<int64_t>(read_header.header()->block_size)) {
- ICING_LOG(ERROR) << "Index size " << file_size << " shorter than block size " << read_header.header()->block_size;
+ ICING_LOG(ERROR) << "Index size " << file_size
+ << " shorter than block size "
+ << read_header.header()->block_size;
return false;
}
if (read_header.header()->block_size % getpagesize() != 0) {
ICING_LOG(ERROR) << "Block size " << read_header.header()->block_size
- << " is not a multiple of page size " << getpagesize();
+ << " is not a multiple of page size " << getpagesize();
return false;
}
num_blocks_ = file_size / read_header.header()->block_size;
@@ -207,11 +212,12 @@ bool FlashIndexStorage::OpenHeader(int64_t file_size) {
for (int i = 0; i < header_block_->header()->num_index_block_infos; ++i) {
int posting_list_bytes =
header_block_->header()->index_block_infos[i].posting_list_bytes;
- if (posting_list_bytes % sizeof(Hit) != 0) {
- ICING_LOG(ERROR) << "Posting list size misaligned, index " << i
- << ", size "
+ if (posting_list_bytes % serializer_->GetDataTypeBytes() != 0) {
+ ICING_LOG(ERROR)
+ << "Posting list size misaligned, index " << i << ", size "
<< header_block_->header()->index_block_infos[i].posting_list_bytes
- << ", hit " << sizeof(Hit) << ", file_size " << file_size;
+ << ", data_type_bytes " << serializer_->GetDataTypeBytes()
+ << ", file_size " << file_size;
return false;
}
}
@@ -270,7 +276,7 @@ libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::GetIndexBlock(
}
off_t offset = static_cast<off_t>(block_index) * block_size();
return IndexBlock::CreateFromPreexistingIndexBlockRegion(
- *filesystem_, index_filename_, offset, block_size());
+ *filesystem_, index_filename_, serializer_, offset, block_size());
}
libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::CreateIndexBlock(
@@ -283,7 +289,8 @@ libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::CreateIndexBlock(
}
off_t offset = static_cast<off_t>(block_index) * block_size();
return IndexBlock::CreateFromUninitializedRegion(
- *filesystem_, index_filename_, offset, block_size(), posting_list_size);
+ *filesystem_, index_filename_, serializer_, offset, block_size(),
+ posting_list_size);
}
int FlashIndexStorage::FindBestIndexBlockInfo(
@@ -381,7 +388,8 @@ FlashIndexStorage::AllocateNewPostingList(int block_info_index) {
libtextclassifier3::StatusOr<PostingListHolder>
FlashIndexStorage::AllocatePostingList(uint32_t min_posting_list_bytes) {
- int max_block_size = IndexBlock::CalculateMaxPostingListBytes(block_size());
+ int max_block_size = IndexBlock::CalculateMaxPostingListBytes(
+ block_size(), serializer_->GetDataTypeBytes());
if (min_posting_list_bytes > max_block_size) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Requested posting list size %d exceeds max posting list size %d",
diff --git a/icing/index/main/flash-index-storage.h b/icing/file/posting_list/flash-index-storage.h
index fceb26f..032bfd2 100644
--- a/icing/index/main/flash-index-storage.h
+++ b/icing/file/posting_list/flash-index-storage.h
@@ -12,22 +12,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_INDEX_FLASH_INDEX_STORAGE_H_
-#define ICING_INDEX_FLASH_INDEX_STORAGE_H_
+#ifndef ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_
+#define ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_
#include <cstdint>
#include <memory>
#include <string>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/filesystem.h"
-#include "icing/index/main/flash-index-storage-header.h"
-#include "icing/index/main/index-block.h"
-#include "icing/index/main/posting-list-free.h"
-#include "icing/index/main/posting-list-identifier.h"
-#include "icing/index/main/posting-list-used.h"
+#include "icing/file/posting_list/flash-index-storage-header.h"
+#include "icing/file/posting_list/index-block.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/file/posting_list/posting-list-used.h"
#include "icing/legacy/core/icing-packed-pod.h"
+#include "icing/proto/debug.pb.h"
#include "icing/store/document-id.h"
namespace icing {
@@ -84,7 +85,7 @@ class FlashIndexStorage {
// one from disk.
static libtextclassifier3::StatusOr<FlashIndexStorage> Create(
const std::string& index_filename, const Filesystem* filesystem,
- bool in_memory = true);
+ PostingListUsedSerializer* serializer, bool in_memory = true);
// Retrieve the PostingList referred to by PostingListIdentifier. This posting
// list must have been previously allocated by a prior call to
@@ -136,7 +137,7 @@ class FlashIndexStorage {
return filesystem_->GetDiskUsage(block_fd_.get());
}
- // Returns the size of the index file used to contains hits.
+ // Returns the size of the index file used to contains data.
uint64_t GetElementsSize() const {
// Element size is the same as disk size excluding the header block.
return GetDiskUsage() - block_size();
@@ -157,14 +158,19 @@ class FlashIndexStorage {
return 1.0 - static_cast<double>(num_blocks_) / kMaxBlockIndex;
}
+ const PostingListUsedSerializer* serializer() const { return serializer_; }
+ PostingListUsedSerializer* serializer() { return serializer_; }
+
libtextclassifier3::Status Reset();
// TODO(b/222349894) Convert the string output to a protocol buffer instead.
void GetDebugInfo(DebugInfoVerbosity::Code verbosity, std::string* out) const;
private:
- FlashIndexStorage(const std::string& index_filename,
- const Filesystem* filesystem, bool has_in_memory_freelists);
+ explicit FlashIndexStorage(const std::string& index_filename,
+ const Filesystem* filesystem,
+ PostingListUsedSerializer* serializer,
+ bool has_in_memory_freelists);
// Init the index from persistence. Create if file does not exist. We do not
// erase corrupt files.
@@ -281,10 +287,12 @@ class FlashIndexStorage {
const Filesystem* filesystem_; // not owned; can't be null
+ PostingListUsedSerializer* serializer_; // not owned; can't be null
+
bool has_in_memory_freelists_;
};
} // namespace lib
} // namespace icing
-#endif // ICING_INDEX_FLASH_INDEX_STORAGE_H_
+#endif // ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_
diff --git a/icing/index/main/flash-index-storage_test.cc b/icing/file/posting_list/flash-index-storage_test.cc
index 25fcaad..50f21f3 100644
--- a/icing/index/main/flash-index-storage_test.cc
+++ b/icing/file/posting_list/flash-index-storage_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/main/flash-index-storage.h"
+#include "icing/file/posting_list/flash-index-storage.h"
#include <unistd.h>
@@ -27,6 +27,7 @@
#include "gtest/gtest.h"
#include "icing/file/filesystem.h"
#include "icing/index/hit/hit.h"
+#include "icing/index/main/posting-list-used-hit-serializer.h"
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
@@ -49,9 +50,13 @@ class FlashIndexStorageTest : public testing::Test {
test_dir_ = GetTestTempDir() + "/test_dir";
file_name_ = test_dir_ + "/test_file.idx.index";
ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()));
+
+ // TODO(b/249829533): test different serializers
+ serializer_ = std::make_unique<PostingListUsedHitSerializer>();
}
void TearDown() override {
+ serializer_.reset();
ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
}
@@ -59,6 +64,7 @@ class FlashIndexStorageTest : public testing::Test {
std::string test_dir_;
std::string file_name_;
Filesystem filesystem_;
+ std::unique_ptr<PostingListUsedHitSerializer> serializer_;
};
TEST_F(FlashIndexStorageTest, CorruptHeader) {
@@ -66,13 +72,13 @@ TEST_F(FlashIndexStorageTest, CorruptHeader) {
// Create the header file
ICING_ASSERT_OK_AND_ASSIGN(
FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name_, &filesystem_));
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
}
{
// Read the valid header - should pass
ICING_ASSERT_OK_AND_ASSIGN(
FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name_, &filesystem_));
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
}
{
// Corrupt the header file by changing pl_bytes
@@ -84,8 +90,9 @@ TEST_F(FlashIndexStorageTest, CorruptHeader) {
{
// Read the header file - should fail because pl_bytes is not divisible
// by sizeof(Hit), which is 5 as of writing
- ASSERT_THAT(FlashIndexStorage::Create(file_name_, &filesystem_),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ ASSERT_THAT(
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
}
{
// Correct the pl_bytes header alignment
@@ -98,7 +105,7 @@ TEST_F(FlashIndexStorageTest, CorruptHeader) {
// Read the valid header - should pass
ICING_ASSERT_OK_AND_ASSIGN(
FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name_, &filesystem_));
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
}
// Delete the file
@@ -110,7 +117,7 @@ TEST_F(FlashIndexStorageTest, EmptyStorage) {
// Create the header file
ICING_ASSERT_OK_AND_ASSIGN(
FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name_, &filesystem_));
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
// An 'empty' FlashIndexStorage should have:
// 1. One block allocated for the header
EXPECT_THAT(flash_index_storage.num_blocks(), Eq(1));
@@ -126,7 +133,7 @@ TEST_F(FlashIndexStorageTest, EmptyStorage) {
// Read the valid header. All functions should return the same values.
ICING_ASSERT_OK_AND_ASSIGN(
FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name_, &filesystem_));
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
EXPECT_THAT(flash_index_storage.num_blocks(), Eq(1));
EXPECT_THAT(flash_index_storage.empty(), IsTrue());
EXPECT_THAT(flash_index_storage.get_last_indexed_docid(),
@@ -140,7 +147,7 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) {
// Create the header file
ICING_ASSERT_OK_AND_ASSIGN(
FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name_, &filesystem_));
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
{
// 1. Request a PL that is 1/2 block size. Remember that block size also
// includes the BlockHeader. The BlockHeader isn't publicly visible, so we
@@ -165,9 +172,10 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) {
Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100),
Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)};
for (const Hit& hit : hits1) {
- ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder1.posting_list, hit));
}
- EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder1.posting_list),
IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
// 2. Get another PL. This should be on the same flash block. There should
@@ -188,9 +196,10 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) {
Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100),
Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)};
for (const Hit& hit : hits2) {
- ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder2.posting_list, hit));
}
- EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder2.posting_list),
IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
// 3. Now, free the first posting list. This should add it to the free list
@@ -214,7 +223,7 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) {
EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
// Make sure this pl is empty. The hits that used to be there should be
// gone.
- EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
IsOkAndHolds(IsEmpty()));
std::vector<Hit> hits3 = {
Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62),
@@ -222,9 +231,10 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) {
Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12),
Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)};
for (const Hit& hit : hits3) {
- ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit));
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder3.posting_list, hit));
}
- EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
}
EXPECT_THAT(flash_index_storage.GetDiskUsage(),
@@ -235,7 +245,8 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) {
// Create the header file
ICING_ASSERT_OK_AND_ASSIGN(
FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name_, &filesystem_, /*in_memory=*/false));
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get(),
+ /*in_memory=*/false));
{
// 1. Request a PL that is 1/2 block size. Remember that block size also
@@ -261,9 +272,10 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) {
Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100),
Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)};
for (const Hit& hit : hits1) {
- ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder1.posting_list, hit));
}
- EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder1.posting_list),
IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
// 2. Get another PL. This should be on the same flash block. There should
@@ -284,9 +296,10 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) {
Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100),
Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)};
for (const Hit& hit : hits2) {
- ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder2.posting_list, hit));
}
- EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder2.posting_list),
IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
// 3. Now, free the first posting list. This should add it to the free list
@@ -310,7 +323,7 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) {
EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
// Make sure this pl is empty. The hits that used to be there should be
// gone.
- EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
IsOkAndHolds(IsEmpty()));
std::vector<Hit> hits3 = {
Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62),
@@ -318,9 +331,10 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) {
Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12),
Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)};
for (const Hit& hit : hits3) {
- ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit));
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder3.posting_list, hit));
}
- EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
}
EXPECT_THAT(flash_index_storage.GetDiskUsage(),
@@ -334,17 +348,18 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) {
// Create the header file
ICING_ASSERT_OK_AND_ASSIGN(
FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name_, &filesystem_));
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
{
// 1. Request a PL that is 1/2 block size. Remember that block size also
// includes the BlockHeader. The BlockHeader isn't publicly visible, so we
// subtract 100 bytes to be sure. AllocatePostingList will round up from
// kHalfBlockPostingListSize to whatever the correct size is.
- half_block_posting_list_size = (flash_index_storage.block_size() - 100) / 2;
- ICING_ASSERT_OK_AND_ASSIGN(
- PostingListHolder posting_list_holder1,
- flash_index_storage.AllocatePostingList(half_block_posting_list_size));
+ half_block_posting_list_size =
+ (flash_index_storage.block_size() - 100) / 2;
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(
+ half_block_posting_list_size));
// We expect:
// 1. FlashIndexStorage will return a valid id.
id1 = posting_list_holder1.id;
@@ -359,16 +374,17 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) {
Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100),
Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)};
for (const Hit& hit : hits1) {
- ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder1.posting_list, hit));
}
- EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder1.posting_list),
IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
// 2. Get another PL. This should be on the same flash block. There should
// be no allocation.
- ICING_ASSERT_OK_AND_ASSIGN(
- PostingListHolder posting_list_holder2,
- flash_index_storage.AllocatePostingList(half_block_posting_list_size));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(
+ half_block_posting_list_size));
// We expect:
// 1. FlashIndexStorage will return a valid id.
EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
@@ -382,17 +398,19 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) {
Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100),
Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)};
for (const Hit& hit : hits2) {
- ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder2.posting_list, hit));
}
- EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder2.posting_list),
IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
- // 3. Now, free the first posting list. This should add it to the free list
+ // 3. Now, free the first posting list. This should add it to the free
+ // list
flash_index_storage.FreePostingList(std::move(posting_list_holder1));
}
EXPECT_THAT(flash_index_storage.GetDiskUsage(),
- Eq(2 * flash_index_storage.block_size()));
+ Eq(2 * flash_index_storage.block_size()));
// 4. The FlashIndexStorage should go out of scope and flush the in-memory
// posting list to disk
}
@@ -401,14 +419,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) {
// Recreate the flash index.
ICING_ASSERT_OK_AND_ASSIGN(
FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name_, &filesystem_));
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
{
// 5. Request another posting list. This should NOT grow the index because
// the first posting list is free.
- ICING_ASSERT_OK_AND_ASSIGN(
- PostingListHolder posting_list_holder3,
- flash_index_storage.AllocatePostingList(half_block_posting_list_size));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(
+ half_block_posting_list_size));
// We expect:
// 1. FlashIndexStorage will return a valid id.
EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
@@ -422,7 +440,7 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) {
EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
// Make sure this pl is empty. The hits that used to be there should be
// gone.
- EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
IsOkAndHolds(IsEmpty()));
std::vector<Hit> hits3 = {
Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62),
@@ -430,13 +448,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) {
Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12),
Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)};
for (const Hit& hit : hits3) {
- ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit));
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder3.posting_list, hit));
}
- EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
}
EXPECT_THAT(flash_index_storage.GetDiskUsage(),
- Eq(2 * flash_index_storage.block_size()));
+ Eq(2 * flash_index_storage.block_size()));
}
}
@@ -444,7 +463,7 @@ TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) {
// Create the header file
ICING_ASSERT_OK_AND_ASSIGN(
FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name_, &filesystem_));
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
{
// 1. Request a PL that is 1/2 block size. Remember that block size also
// includes the BlockHeader. The BlockHeader isn't publicly visible, so we
@@ -471,9 +490,10 @@ TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) {
Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100),
Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)};
for (const Hit& hit : hits1) {
- ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder1.posting_list, hit));
}
- EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder1.posting_list),
IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
// 2. Get a PL that is 1/4 block size. Even though a 1/4 block PL could
@@ -497,9 +517,10 @@ TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) {
Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100),
Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)};
for (const Hit& hit : hits2) {
- ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder2.posting_list, hit));
}
- EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder2.posting_list),
IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
// 3. Request another 1/4 block-size posting list. This should NOT grow the
@@ -526,7 +547,7 @@ TEST_F(FlashIndexStorageTest, AllocateTooLargePostingList) {
// Create the header file
ICING_ASSERT_OK_AND_ASSIGN(
FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name_, &filesystem_));
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
// Request a PL that is 2x block size.
const int kDoubleBlockSize = flash_index_storage.block_size() * 2;
diff --git a/icing/index/main/index-block.cc b/icing/file/posting_list/index-block.cc
index fe989c7..1b9982e 100644
--- a/icing/index/main/index-block.cc
+++ b/icing/file/posting_list/index-block.cc
@@ -12,19 +12,22 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/main/index-block.h"
+#include "icing/file/posting_list/index-block.h"
-#include <algorithm>
-#include <cinttypes>
-#include <limits>
+#include <sys/types.h>
+
+#include <cstdint>
+#include <memory>
+#include <string_view>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/memory-mapped-file.h"
-#include "icing/index/main/posting-list-free.h"
-#include "icing/index/main/posting-list-utils.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-free.h"
+#include "icing/file/posting_list/posting-list-utils.h"
#include "icing/legacy/core/icing-string-util.h"
-#include "icing/util/math-util.h"
+#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -32,35 +35,30 @@ namespace lib {
namespace {
-libtextclassifier3::Status ValidatePostingListBytes(uint32_t posting_list_bytes,
- uint32_t block_size) {
- if (posting_list_bytes >
- IndexBlock::CalculateMaxPostingListBytes(block_size) ||
- !posting_list_utils::IsValidPostingListSize(posting_list_bytes)) {
+libtextclassifier3::Status ValidatePostingListBytes(
+ PostingListUsedSerializer* serializer, uint32_t posting_list_bytes,
+ uint32_t block_size) {
+ if (posting_list_bytes > IndexBlock::CalculateMaxPostingListBytes(
+ block_size, serializer->GetDataTypeBytes()) ||
+ !posting_list_utils::IsValidPostingListSize(
+ posting_list_bytes, serializer->GetDataTypeBytes(),
+ serializer->GetMinPostingListSize())) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Requested posting list size %d is illegal for a flash block with max "
"posting list size of %d",
posting_list_bytes,
- IndexBlock::CalculateMaxPostingListBytes(block_size)));
+ IndexBlock::CalculateMaxPostingListBytes(
+ block_size, serializer->GetDataTypeBytes())));
}
return libtextclassifier3::Status::OK;
}
} // namespace
-uint32_t IndexBlock::ApproximateFullPostingListHitsForBlock(
- uint32_t block_size, int posting_list_index_bits) {
- // Assume 50% compressed and most don't have term frequencies.
- uint32_t bytes_per_hit = sizeof(Hit::Value) / 2;
- return (block_size - sizeof(BlockHeader)) /
- ((1u << posting_list_index_bits) * bytes_per_hit);
-}
-
libtextclassifier3::StatusOr<IndexBlock>
-IndexBlock::CreateFromPreexistingIndexBlockRegion(const Filesystem& filesystem,
- std::string_view file_path,
- off_t offset,
- uint32_t block_size) {
+IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ const Filesystem& filesystem, std::string_view file_path,
+ PostingListUsedSerializer* serializer, off_t offset, uint32_t block_size) {
if (block_size < sizeof(BlockHeader)) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Provided block_size %d is too small to fit even the BlockHeader!",
@@ -71,15 +69,16 @@ IndexBlock::CreateFromPreexistingIndexBlockRegion(const Filesystem& filesystem,
filesystem, file_path,
MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
ICING_RETURN_IF_ERROR(mmapped_file.Remap(offset, block_size));
- IndexBlock block(std::move(mmapped_file));
- ICING_RETURN_IF_ERROR(
- ValidatePostingListBytes(block.get_posting_list_bytes(), block_size));
+ IndexBlock block(serializer, std::move(mmapped_file));
+ ICING_RETURN_IF_ERROR(ValidatePostingListBytes(
+ serializer, block.get_posting_list_bytes(), block_size));
return block;
}
libtextclassifier3::StatusOr<IndexBlock>
IndexBlock::CreateFromUninitializedRegion(const Filesystem& filesystem,
std::string_view file_path,
+ PostingListUsedSerializer* serializer,
off_t offset, uint32_t block_size,
uint32_t posting_list_bytes) {
if (block_size < sizeof(BlockHeader)) {
@@ -88,13 +87,13 @@ IndexBlock::CreateFromUninitializedRegion(const Filesystem& filesystem,
block_size));
}
ICING_RETURN_IF_ERROR(
- ValidatePostingListBytes(posting_list_bytes, block_size));
+ ValidatePostingListBytes(serializer, posting_list_bytes, block_size));
ICING_ASSIGN_OR_RETURN(MemoryMappedFile mmapped_file,
MemoryMappedFile::Create(
filesystem, file_path,
MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
ICING_RETURN_IF_ERROR(mmapped_file.Remap(offset, block_size));
- IndexBlock block(std::move(mmapped_file));
+ IndexBlock block(serializer, std::move(mmapped_file));
// Safe to ignore the return value of Reset. Reset returns an error if
// posting_list_bytes is invalid, but this function ensures that
// posting_list_bytes is valid thanks to the call to ValidatePostingListBytes
@@ -103,17 +102,19 @@ IndexBlock::CreateFromUninitializedRegion(const Filesystem& filesystem,
return block;
}
-IndexBlock::IndexBlock(MemoryMappedFile&& mmapped_block)
+IndexBlock::IndexBlock(PostingListUsedSerializer* serializer,
+ MemoryMappedFile&& mmapped_block)
: header_(reinterpret_cast<BlockHeader*>(mmapped_block.mutable_region())),
posting_lists_start_ptr_(mmapped_block.mutable_region() +
sizeof(BlockHeader)),
block_size_in_bytes_(mmapped_block.region_size()),
+ serializer_(serializer),
mmapped_block_(
std::make_unique<MemoryMappedFile>(std::move(mmapped_block))) {}
libtextclassifier3::Status IndexBlock::Reset(int posting_list_bytes) {
ICING_RETURN_IF_ERROR(ValidatePostingListBytes(
- posting_list_bytes, mmapped_block_->region_size()));
+ serializer_, posting_list_bytes, mmapped_block_->region_size()));
header_->free_list_posting_list_index = kInvalidPostingListIndex;
header_->next_block_index = kInvalidBlockIndex;
header_->posting_list_bytes = posting_list_bytes;
@@ -140,7 +141,8 @@ IndexBlock::GetAllocatedPostingList(PostingListIndex posting_list_index) {
posting_list_index, max_num_posting_lists()));
}
return PostingListUsed::CreateFromPreexistingPostingListUsedRegion(
- get_posting_list_ptr(posting_list_index), get_posting_list_bytes());
+ serializer_, get_posting_list_ptr(posting_list_index),
+ get_posting_list_bytes());
}
libtextclassifier3::StatusOr<PostingListIndex>
@@ -159,7 +161,9 @@ IndexBlock::AllocatePostingList() {
// always return OK and ValueOrDie is safe to call.
auto posting_list_or =
PostingListFree::CreateFromPreexistingPostingListFreeRegion(
- get_posting_list_ptr(posting_list_index), get_posting_list_bytes());
+ get_posting_list_ptr(posting_list_index), get_posting_list_bytes(),
+ serializer_->GetDataTypeBytes(),
+ serializer_->GetMinPostingListSize());
PostingListFree plfree = std::move(posting_list_or).ValueOrDie();
header_->free_list_posting_list_index = plfree.get_next_posting_list_index();
@@ -172,7 +176,8 @@ IndexBlock::AllocatePostingList() {
// Make it a used posting list.
PostingListUsed::CreateFromUnitializedRegion(
- get_posting_list_ptr(posting_list_index), get_posting_list_bytes());
+ serializer_, get_posting_list_ptr(posting_list_index),
+ get_posting_list_bytes());
return posting_list_index;
}
@@ -188,7 +193,8 @@ void IndexBlock::FreePostingList(PostingListIndex posting_list_index) {
// So CreateFromUninitializedRegion will always return OK and ValueOrDie is
// safe to call.
auto posting_list_or = PostingListFree::CreateFromUnitializedRegion(
- get_posting_list_ptr(posting_list_index), get_posting_list_bytes());
+ get_posting_list_ptr(posting_list_index), get_posting_list_bytes(),
+ serializer_->GetDataTypeBytes(), serializer_->GetMinPostingListSize());
PostingListFree plfree = std::move(posting_list_or).ValueOrDie();
// Put at the head of the list.
diff --git a/icing/index/main/index-block.h b/icing/file/posting_list/index-block.h
index 8a7aa16..589f155 100644
--- a/icing/index/main/index-block.h
+++ b/icing/file/posting_list/index-block.h
@@ -12,30 +12,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_INDEX_MAIN_INDEX_BLOCK_H_
-#define ICING_INDEX_MAIN_INDEX_BLOCK_H_
+#ifndef ICING_FILE_POSTING_LIST_INDEX_BLOCK_H_
+#define ICING_FILE_POSTING_LIST_INDEX_BLOCK_H_
-#include <sys/mman.h>
+#include <sys/types.h>
-#include <algorithm>
-#include <cstring>
-#include <limits>
+#include <cstdint>
#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
+#include <string_view>
#include "icing/file/memory-mapped-file.h"
-#include "icing/index/hit/hit.h"
-#include "icing/index/main/posting-list-free.h"
-#include "icing/index/main/posting-list-used.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-used.h"
#include "icing/legacy/index/icing-bit-util.h"
namespace icing {
namespace lib {
-inline constexpr uint32_t kInvalidBlockIndex = 0;
-
// This class is used to manage I/O to a single flash block and to manage the
// division of that flash block into PostingLists. It provides an interface to
// allocate, free and read posting lists.
@@ -51,17 +44,12 @@ class IndexBlock {
public:
// What is the maximum posting list size in bytes that can be stored in this
// block.
- static uint32_t CalculateMaxPostingListBytes(uint32_t block_size_in_bytes) {
- return (block_size_in_bytes - sizeof(BlockHeader)) / sizeof(Hit) *
- sizeof(Hit);
+ static uint32_t CalculateMaxPostingListBytes(uint32_t block_size_in_bytes,
+ uint32_t data_type_bytes) {
+ return (block_size_in_bytes - sizeof(BlockHeader)) / data_type_bytes *
+ data_type_bytes;
}
- // For a given min number of bits needed to store PostingListIndex for a
- // block of "block_size", return the approximate number of hits that a full
- // posting list in this block could accomodate.
- static uint32_t ApproximateFullPostingListHitsForBlock(
- uint32_t block_size, int posting_list_index_bits);
-
// Create an IndexBlock to reference the previously used region of the
// mmapped_file starting at offset with size block_size
//
@@ -74,6 +62,7 @@ class IndexBlock {
static libtextclassifier3::StatusOr<IndexBlock>
CreateFromPreexistingIndexBlockRegion(const Filesystem& filesystem,
std::string_view file_path,
+ PostingListUsedSerializer* serializer,
off_t offset, uint32_t block_size);
// Create an IndexBlock to reference an uninitialized region of the
@@ -88,8 +77,9 @@ class IndexBlock {
// max_posting_list_bytes(size).
// - INTERNAL_ERROR if unable to mmap the region [offset, offset+block_size)
static libtextclassifier3::StatusOr<IndexBlock> CreateFromUninitializedRegion(
- const Filesystem& filesystem, std::string_view file_path, off_t offset,
- uint32_t block_size, uint32_t posting_list_bytes);
+ const Filesystem& filesystem, std::string_view file_path,
+ PostingListUsedSerializer* serializer, off_t offset, uint32_t block_size,
+ uint32_t posting_list_bytes);
IndexBlock(const IndexBlock&) = delete;
IndexBlock& operator=(const IndexBlock&) = delete;
@@ -174,7 +164,8 @@ class IndexBlock {
private:
// Assumes that mmapped_file already has established a valid mapping to the
// requested block.
- explicit IndexBlock(MemoryMappedFile&& mmapped_block);
+ explicit IndexBlock(PostingListUsedSerializer* serializer,
+ MemoryMappedFile&& mmapped_block);
// Resets IndexBlock to hold posting lists of posting_list_bytes size and adds
// all posting lists to the free list.
@@ -212,6 +203,8 @@ class IndexBlock {
char* posting_lists_start_ptr_;
uint32_t block_size_in_bytes_;
+ PostingListUsedSerializer* serializer_; // Does not own.
+
// MemoryMappedFile used to interact with the underlying flash block.
std::unique_ptr<MemoryMappedFile> mmapped_block_;
};
@@ -219,4 +212,4 @@ class IndexBlock {
} // namespace lib
} // namespace icing
-#endif // ICING_INDEX_MAIN_INDEX_BLOCK_H_
+#endif // ICING_FILE_POSTING_LIST_INDEX_BLOCK_H_
diff --git a/icing/index/main/index-block_test.cc b/icing/file/posting_list/index-block_test.cc
index 322918d..775858d 100644
--- a/icing/index/main/index-block_test.cc
+++ b/icing/file/posting_list/index-block_test.cc
@@ -12,14 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/main/index-block.h"
+#include "icing/file/posting_list/index-block.h"
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/file/filesystem.h"
-#include "icing/file/memory-mapped-file.h"
-#include "icing/index/main/posting-list-used.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/index/main/posting-list-used-hit-serializer.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
@@ -28,53 +28,57 @@ namespace lib {
namespace {
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+
static constexpr int kBlockSize = 4096;
-bool CreateFileWithSize(const Filesystem& filesystem, const std::string& file,
- int size) {
- size_t parent_dir_end = file.find_last_of('/');
- if (parent_dir_end == std::string::npos) {
- return false;
+class IndexBlockTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/flash";
+ flash_file_ = test_dir_ + "/0";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()));
+
+ // Grow the file by one block for the IndexBlock to use.
+ ASSERT_TRUE(filesystem_.Grow(flash_file_.c_str(), kBlockSize));
+
+ // TODO: test different serializers
+ serializer_ = std::make_unique<PostingListUsedHitSerializer>();
}
- std::string file_dir = file.substr(0, parent_dir_end);
- return filesystem.CreateDirectoryRecursively(file_dir.c_str()) &&
- filesystem.Grow(file.c_str(), size);
-}
-using ::testing::ElementsAreArray;
-using ::testing::Eq;
+ void TearDown() override {
+ serializer_.reset();
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ }
-TEST(IndexBlockTest, CreateFromUninitializedRegionProducesEmptyBlock) {
- constexpr int kPostingListBytes = 20;
+ std::string test_dir_;
+ std::string flash_file_;
+ Filesystem filesystem_;
+ std::unique_ptr<PostingListUsedHitSerializer> serializer_;
+};
- Filesystem filesystem;
- std::string flash_file = GetTestTempDir() + "/flash/0";
- // Grow the file by one block for the IndexBlock to use.
- ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize));
+TEST_F(IndexBlockTest, CreateFromUninitializedRegionProducesEmptyBlock) {
+ constexpr int kPostingListBytes = 20;
{
// Create an IndexBlock from this newly allocated file block.
ICING_ASSERT_OK_AND_ASSIGN(
IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
- filesystem, flash_file, /*offset=*/0, kBlockSize,
- kPostingListBytes));
+ filesystem_, flash_file_, serializer_.get(),
+ /*offset=*/0, kBlockSize, kPostingListBytes));
EXPECT_TRUE(block.has_free_posting_lists());
}
}
-TEST(IndexBlockTest, SizeAccessorsWorkCorrectly) {
+TEST_F(IndexBlockTest, SizeAccessorsWorkCorrectly) {
constexpr int kPostingListBytes1 = 20;
- Filesystem filesystem;
- std::string flash_file = GetTestTempDir() + "/flash/0";
- // Grow the file by one block for the IndexBlock to use.
- ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize));
-
// Create an IndexBlock from this newly allocated file block.
- ICING_ASSERT_OK_AND_ASSIGN(
- IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
- filesystem, flash_file, /*offset=*/0, kBlockSize,
- kPostingListBytes1));
+ ICING_ASSERT_OK_AND_ASSIGN(IndexBlock block,
+ IndexBlock::CreateFromUninitializedRegion(
+ filesystem_, flash_file_, serializer_.get(),
+ /*offset=*/0, kBlockSize, kPostingListBytes1));
EXPECT_THAT(block.get_posting_list_bytes(), Eq(kPostingListBytes1));
// There should be (4096 - 12) / 20 = 204 posting lists
// (sizeof(BlockHeader)==12). We can store a PostingListIndex of 203 in only 8
@@ -85,9 +89,10 @@ TEST(IndexBlockTest, SizeAccessorsWorkCorrectly) {
constexpr int kPostingListBytes2 = 200;
// Create an IndexBlock from this newly allocated file block.
- ICING_ASSERT_OK_AND_ASSIGN(block, IndexBlock::CreateFromUninitializedRegion(
- filesystem, flash_file, /*offset=*/0,
- kBlockSize, kPostingListBytes2));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ block, IndexBlock::CreateFromUninitializedRegion(
+ filesystem_, flash_file_, serializer_.get(), /*offset=*/0,
+ kBlockSize, kPostingListBytes2));
EXPECT_THAT(block.get_posting_list_bytes(), Eq(kPostingListBytes2));
// There should be (4096 - 12) / 200 = 20 posting lists
// (sizeof(BlockHeader)==12). We can store a PostingListIndex of 19 in only 5
@@ -96,14 +101,9 @@ TEST(IndexBlockTest, SizeAccessorsWorkCorrectly) {
EXPECT_THAT(block.posting_list_index_bits(), Eq(5));
}
-TEST(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) {
+TEST_F(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) {
constexpr int kPostingListBytes = 2000;
- Filesystem filesystem;
- std::string flash_file = GetTestTempDir() + "/flash/0";
- // Grow the file by one block for the IndexBlock to use.
- ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize));
-
std::vector<Hit> test_hits{
Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency),
Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency),
@@ -116,7 +116,7 @@ TEST(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) {
// Create an IndexBlock from this newly allocated file block.
ICING_ASSERT_OK_AND_ASSIGN(
IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
- filesystem, flash_file,
+ filesystem_, flash_file_, serializer_.get(),
/*offset=*/0,
/*block_size=*/kBlockSize, kPostingListBytes));
// Add hits to the first posting list.
@@ -124,33 +124,30 @@ TEST(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) {
ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
block.GetAllocatedPostingList(allocated_index));
for (const Hit& hit : test_hits) {
- ICING_ASSERT_OK(pl_used.PrependHit(hit));
+ ICING_ASSERT_OK(serializer_->PrependHit(&pl_used, hit));
}
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(
- test_hits.rbegin(), test_hits.rend())));
+ EXPECT_THAT(
+ serializer_->GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(test_hits.rbegin(), test_hits.rend())));
}
{
// Create an IndexBlock from the previously allocated file block.
ICING_ASSERT_OK_AND_ASSIGN(
- IndexBlock block,
- IndexBlock::CreateFromPreexistingIndexBlockRegion(
- filesystem, flash_file, /*offset=*/0, kBlockSize));
+ IndexBlock block, IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ filesystem_, flash_file_, serializer_.get(),
+ /*offset=*/0, kBlockSize));
ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
block.GetAllocatedPostingList(allocated_index));
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(
- test_hits.rbegin(), test_hits.rend())));
+ EXPECT_THAT(
+ serializer_->GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(test_hits.rbegin(), test_hits.rend())));
EXPECT_TRUE(block.has_free_posting_lists());
}
}
-TEST(IndexBlockTest, IndexBlockMultiplePostingLists) {
+TEST_F(IndexBlockTest, IndexBlockMultiplePostingLists) {
constexpr int kPostingListBytes = 2000;
- Filesystem filesystem;
- std::string flash_file = GetTestTempDir() + "/flash/0";
- // Grow the file by one block for the IndexBlock to use.
- ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize));
-
std::vector<Hit> hits_in_posting_list1{
Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency),
Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency),
@@ -171,8 +168,8 @@ TEST(IndexBlockTest, IndexBlockMultiplePostingLists) {
// Create an IndexBlock from this newly allocated file block.
ICING_ASSERT_OK_AND_ASSIGN(
IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
- filesystem, flash_file, /*offset=*/0, kBlockSize,
- kPostingListBytes));
+ filesystem_, flash_file_, serializer_.get(),
+ /*offset=*/0, kBlockSize, kPostingListBytes));
// Add hits to the first posting list.
ICING_ASSERT_OK_AND_ASSIGN(allocated_index_1, block.AllocatePostingList());
@@ -180,9 +177,9 @@ TEST(IndexBlockTest, IndexBlockMultiplePostingLists) {
PostingListUsed pl_used_1,
block.GetAllocatedPostingList(allocated_index_1));
for (const Hit& hit : hits_in_posting_list1) {
- ICING_ASSERT_OK(pl_used_1.PrependHit(hit));
+ ICING_ASSERT_OK(serializer_->PrependHit(&pl_used_1, hit));
}
- EXPECT_THAT(pl_used_1.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&pl_used_1),
IsOkAndHolds(ElementsAreArray(hits_in_posting_list1.rbegin(),
hits_in_posting_list1.rend())));
@@ -192,9 +189,9 @@ TEST(IndexBlockTest, IndexBlockMultiplePostingLists) {
PostingListUsed pl_used_2,
block.GetAllocatedPostingList(allocated_index_2));
for (const Hit& hit : hits_in_posting_list2) {
- ICING_ASSERT_OK(pl_used_2.PrependHit(hit));
+ ICING_ASSERT_OK(serializer_->PrependHit(&pl_used_2, hit));
}
- EXPECT_THAT(pl_used_2.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&pl_used_2),
IsOkAndHolds(ElementsAreArray(hits_in_posting_list2.rbegin(),
hits_in_posting_list2.rend())));
@@ -205,19 +202,19 @@ TEST(IndexBlockTest, IndexBlockMultiplePostingLists) {
{
// Create an IndexBlock from the previously allocated file block.
ICING_ASSERT_OK_AND_ASSIGN(
- IndexBlock block,
- IndexBlock::CreateFromPreexistingIndexBlockRegion(
- filesystem, flash_file, /*offset=*/0, kBlockSize));
+ IndexBlock block, IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ filesystem_, flash_file_, serializer_.get(),
+ /*offset=*/0, kBlockSize));
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used_1,
block.GetAllocatedPostingList(allocated_index_1));
- EXPECT_THAT(pl_used_1.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&pl_used_1),
IsOkAndHolds(ElementsAreArray(hits_in_posting_list1.rbegin(),
hits_in_posting_list1.rend())));
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used_2,
block.GetAllocatedPostingList(allocated_index_2));
- EXPECT_THAT(pl_used_2.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&pl_used_2),
IsOkAndHolds(ElementsAreArray(hits_in_posting_list2.rbegin(),
hits_in_posting_list2.rend())));
EXPECT_THAT(block.AllocatePostingList(),
@@ -226,19 +223,14 @@ TEST(IndexBlockTest, IndexBlockMultiplePostingLists) {
}
}
-TEST(IndexBlockTest, IndexBlockReallocatingPostingLists) {
+TEST_F(IndexBlockTest, IndexBlockReallocatingPostingLists) {
constexpr int kPostingListBytes = 2000;
- Filesystem filesystem;
- std::string flash_file = GetTestTempDir() + "/flash/0";
- // Grow the file by one block for the IndexBlock to use.
- ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize));
-
// Create an IndexBlock from this newly allocated file block.
- ICING_ASSERT_OK_AND_ASSIGN(
- IndexBlock block,
- IndexBlock::CreateFromUninitializedRegion(
- filesystem, flash_file, /*offset=*/0, kBlockSize, kPostingListBytes));
+ ICING_ASSERT_OK_AND_ASSIGN(IndexBlock block,
+ IndexBlock::CreateFromUninitializedRegion(
+ filesystem_, flash_file_, serializer_.get(),
+ /*offset=*/0, kBlockSize, kPostingListBytes));
// Add hits to the first posting list.
std::vector<Hit> hits_in_posting_list1{
@@ -253,9 +245,9 @@ TEST(IndexBlockTest, IndexBlockReallocatingPostingLists) {
ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used_1,
block.GetAllocatedPostingList(allocated_index_1));
for (const Hit& hit : hits_in_posting_list1) {
- ICING_ASSERT_OK(pl_used_1.PrependHit(hit));
+ ICING_ASSERT_OK(serializer_->PrependHit(&pl_used_1, hit));
}
- EXPECT_THAT(pl_used_1.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&pl_used_1),
IsOkAndHolds(ElementsAreArray(hits_in_posting_list1.rbegin(),
hits_in_posting_list1.rend())));
@@ -272,9 +264,9 @@ TEST(IndexBlockTest, IndexBlockReallocatingPostingLists) {
ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used_2,
block.GetAllocatedPostingList(allocated_index_2));
for (const Hit& hit : hits_in_posting_list2) {
- ICING_ASSERT_OK(pl_used_2.PrependHit(hit));
+ ICING_ASSERT_OK(serializer_->PrependHit(&pl_used_2, hit));
}
- EXPECT_THAT(pl_used_2.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&pl_used_2),
IsOkAndHolds(ElementsAreArray(hits_in_posting_list2.rbegin(),
hits_in_posting_list2.rend())));
@@ -298,9 +290,9 @@ TEST(IndexBlockTest, IndexBlockReallocatingPostingLists) {
ICING_ASSERT_OK_AND_ASSIGN(pl_used_1,
block.GetAllocatedPostingList(allocated_index_3));
for (const Hit& hit : hits_in_posting_list3) {
- ICING_ASSERT_OK(pl_used_1.PrependHit(hit));
+ ICING_ASSERT_OK(serializer_->PrependHit(&pl_used_1, hit));
}
- EXPECT_THAT(pl_used_1.GetHits(),
+ EXPECT_THAT(serializer_->GetHits(&pl_used_1),
IsOkAndHolds(ElementsAreArray(hits_in_posting_list3.rbegin(),
hits_in_posting_list3.rend())));
EXPECT_THAT(block.AllocatePostingList(),
@@ -308,22 +300,17 @@ TEST(IndexBlockTest, IndexBlockReallocatingPostingLists) {
EXPECT_FALSE(block.has_free_posting_lists());
}
-TEST(IndexBlockTest, IndexBlockNextBlockIndex) {
+TEST_F(IndexBlockTest, IndexBlockNextBlockIndex) {
constexpr int kPostingListBytes = 2000;
constexpr int kSomeBlockIndex = 22;
- Filesystem filesystem;
- std::string flash_file = GetTestTempDir() + "/flash/0";
- // Grow the file by one block for the IndexBlock to use.
- ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize));
-
{
// Create an IndexBlock from this newly allocated file block and set the
// next block index.
ICING_ASSERT_OK_AND_ASSIGN(
IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
- filesystem, flash_file, /*offset=*/0, kBlockSize,
- kPostingListBytes));
+ filesystem_, flash_file_, serializer_.get(),
+ /*offset=*/0, kBlockSize, kPostingListBytes));
EXPECT_THAT(block.next_block_index(), Eq(kInvalidBlockIndex));
block.set_next_block_index(kSomeBlockIndex);
EXPECT_THAT(block.next_block_index(), Eq(kSomeBlockIndex));
@@ -332,9 +319,9 @@ TEST(IndexBlockTest, IndexBlockNextBlockIndex) {
// Create an IndexBlock from this previously allocated file block and make
// sure that next_block_index is still set properly.
ICING_ASSERT_OK_AND_ASSIGN(
- IndexBlock block,
- IndexBlock::CreateFromPreexistingIndexBlockRegion(
- filesystem, flash_file, /*offset=*/0, kBlockSize));
+ IndexBlock block, IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ filesystem_, flash_file_, serializer_.get(),
+ /*offset=*/0, kBlockSize));
EXPECT_THAT(block.next_block_index(), Eq(kSomeBlockIndex));
}
{
@@ -342,8 +329,8 @@ TEST(IndexBlockTest, IndexBlockNextBlockIndex) {
// reset the next_block_index to kInvalidBlockIndex.
ICING_ASSERT_OK_AND_ASSIGN(
IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
- filesystem, flash_file, /*offset=*/0, kBlockSize,
- kPostingListBytes));
+ filesystem_, flash_file_, serializer_.get(),
+ /*offset=*/0, kBlockSize, kPostingListBytes));
EXPECT_THAT(block.next_block_index(), Eq(kInvalidBlockIndex));
}
}
diff --git a/icing/file/posting_list/posting-list-common.h b/icing/file/posting_list/posting-list-common.h
new file mode 100644
index 0000000..cbe2ddf
--- /dev/null
+++ b/icing/file/posting_list/posting-list-common.h
@@ -0,0 +1,35 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_COMMON_H_
+#define ICING_FILE_POSTING_LIST_POSTING_LIST_COMMON_H_
+
+#include <cstdint>
+
+namespace icing {
+namespace lib {
+
+// A FlashIndexBlock can contain multiple posting lists. This specifies which
+// PostingList in the FlashIndexBlock we want to refer to.
+using PostingListIndex = int32_t;
+inline constexpr PostingListIndex kInvalidPostingListIndex = ~0U;
+
+inline constexpr uint32_t kNumSpecialData = 2;
+
+inline constexpr uint32_t kInvalidBlockIndex = 0;
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_COMMON_H_
diff --git a/icing/index/main/posting-list-free.h b/icing/file/posting_list/posting-list-free.h
index 75b99d7..073e344 100644
--- a/icing/index/main/posting-list-free.h
+++ b/icing/file/posting_list/posting-list-free.h
@@ -12,30 +12,22 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_INDEX_MAIN_POSTING_LIST_FREE_H_
-#define ICING_INDEX_MAIN_POSTING_LIST_FREE_H_
-
-#include <sys/mman.h>
+#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_FREE_H_
+#define ICING_FILE_POSTING_LIST_POSTING_LIST_FREE_H_
#include <cstdint>
#include <cstring>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
-#include "icing/index/hit/hit.h"
-#include "icing/index/main/posting-list-utils.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-utils.h"
#include "icing/legacy/core/icing-string-util.h"
-#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
-// A FlashIndexBlock can contain multiple posting lists. This specifies which
-// PostingList in the FlashIndexBlock we want to refer to.
-using PostingListIndex = int32_t;
-inline constexpr PostingListIndex kInvalidPostingListIndex = ~0U;
-
// A posting list in the index block's free list.
//
// We re-use the first sizeof(PostingListIndex) bytes of the posting list
@@ -51,14 +43,17 @@ class PostingListFree {
//
// RETURNS:
// - A valid PostingListFree on success
- // - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size()
- // || size_in_bytes % sizeof(Hit) != 0.
+ // - INVALID_ARGUMENT if posting_list_utils::IsValidPostingListSize check
+ // fails
// - FAILED_PRECONDITION if posting_list_buffer is null
static libtextclassifier3::StatusOr<PostingListFree>
- CreateFromPreexistingPostingListFreeRegion(void *posting_list_buffer,
- uint32_t size_in_bytes) {
+ CreateFromPreexistingPostingListFreeRegion(void* posting_list_buffer,
+ uint32_t size_in_bytes,
+ uint32_t data_type_bytes,
+ uint32_t min_posting_list_size) {
ICING_RETURN_ERROR_IF_NULL(posting_list_buffer);
- if (!posting_list_utils::IsValidPostingListSize(size_in_bytes)) {
+ if (!posting_list_utils::IsValidPostingListSize(
+ size_in_bytes, data_type_bytes, min_posting_list_size)) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Requested posting list size %d is invalid!", size_in_bytes));
}
@@ -74,15 +69,17 @@ class PostingListFree {
//
// RETURNS:
// - A valid PostingListFree on success
- // - INVALID_ARGUMENT if size_in_bytes < min_size() || size_in_bytes %
- // sizeof(Hit) != 0.
+ // - INVALID_ARGUMENT if posting_list_utils::IsValidPostingListSize check
+ // fails
// - FAILED_PRECONDITION if posting_list_buffer is null
static libtextclassifier3::StatusOr<PostingListFree>
- CreateFromUnitializedRegion(void *posting_list_buffer,
- uint32_t size_in_bytes) {
+ CreateFromUnitializedRegion(void* posting_list_buffer, uint32_t size_in_bytes,
+ uint32_t data_type_bytes,
+ uint32_t min_posting_list_size) {
ICING_ASSIGN_OR_RETURN(PostingListFree posting_list_free,
CreateFromPreexistingPostingListFreeRegion(
- posting_list_buffer, size_in_bytes));
+ posting_list_buffer, size_in_bytes,
+ data_type_bytes, min_posting_list_size));
posting_list_free.Clear();
return posting_list_free;
}
@@ -101,8 +98,8 @@ class PostingListFree {
}
private:
- PostingListFree(void *posting_list_buffer, uint32_t size_in_bytes)
- : posting_list_buffer_(static_cast<uint8_t *>(posting_list_buffer)),
+ explicit PostingListFree(void* posting_list_buffer, uint32_t size_in_bytes)
+ : posting_list_buffer_(static_cast<uint8_t*>(posting_list_buffer)),
size_in_bytes_(size_in_bytes) {}
// Reset the current free posting list as unchained free posting list so that
@@ -114,16 +111,11 @@ class PostingListFree {
// A byte array of size size_in_bytes_. The first sizeof(PostingListIndex)
// bytes which will store the next posting list index, the rest are unused and
// can be anything.
- uint8_t *posting_list_buffer_;
+ uint8_t* posting_list_buffer_;
[[maybe_unused]] uint32_t size_in_bytes_;
-
- static_assert(sizeof(PostingListIndex) <=
- posting_list_utils::min_posting_list_size(),
- "PostingListIndex must be small enough to fit in a "
- "minimum-sized Posting List.");
};
} // namespace lib
} // namespace icing
-#endif // ICING_INDEX_MAIN_POSTING_LIST_FREE_H_
+#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_FREE_H_
diff --git a/icing/index/main/posting-list-free_test.cc b/icing/file/posting_list/posting-list-free_test.cc
index a152934..99e3cf5 100644
--- a/icing/index/main/posting-list-free_test.cc
+++ b/icing/file/posting_list/posting-list-free_test.cc
@@ -12,14 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/main/posting-list-free.h"
+#include "icing/file/posting_list/posting-list-free.h"
#include <cstdint>
#include <memory>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gtest/gtest.h"
-#include "icing/index/main/posting-list-utils.h"
+#include "icing/index/main/posting-list-used-hit-serializer.h"
#include "icing/testing/common-matchers.h"
namespace icing {
@@ -27,55 +27,76 @@ namespace lib {
namespace {
+// TODO(b/249829533): test different serializers
+
TEST(PostingListTest, PostingListFree) {
+ PostingListUsedHitSerializer serializer;
static const size_t kHitsSize = 2551 * sizeof(Hit);
std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize);
ICING_ASSERT_OK_AND_ASSIGN(
PostingListFree pl_free,
PostingListFree::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
+ static_cast<void *>(hits_buf.get()), kHitsSize,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()));
EXPECT_EQ(pl_free.get_next_posting_list_index(), kInvalidPostingListIndex);
}
TEST(PostingListTest, PostingListTooSmallInvalidArgument) {
- static const size_t kHitSizeTooSmall =
- posting_list_utils::min_posting_list_size() - sizeof(Hit);
+ PostingListUsedHitSerializer serializer;
+ const size_t kHitSizeTooSmall =
+ serializer.GetMinPostingListSize() - sizeof(Hit);
std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitSizeTooSmall);
- EXPECT_THAT(PostingListFree::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitSizeTooSmall),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(PostingListFree::CreateFromPreexistingPostingListFreeRegion(
- static_cast<void *>(hits_buf.get()), kHitSizeTooSmall),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(
+ PostingListFree::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), kHitSizeTooSmall,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(
+ PostingListFree::CreateFromPreexistingPostingListFreeRegion(
+ static_cast<void *>(hits_buf.get()), kHitSizeTooSmall,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
TEST(PostingListTest, PostingListNotAlignedInvalidArgument) {
- static const size_t kHitSizeNotAligned =
- posting_list_utils::min_posting_list_size() + 1;
+ PostingListUsedHitSerializer serializer;
+ const size_t kHitSizeNotAligned = serializer.GetMinPostingListSize() + 1;
std::unique_ptr<char[]> hits_buf =
std::make_unique<char[]>(kHitSizeNotAligned);
- EXPECT_THAT(PostingListFree::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitSizeNotAligned),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(PostingListFree::CreateFromPreexistingPostingListFreeRegion(
- static_cast<void *>(hits_buf.get()), kHitSizeNotAligned),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(
+ PostingListFree::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), kHitSizeNotAligned,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(
+ PostingListFree::CreateFromPreexistingPostingListFreeRegion(
+ static_cast<void *>(hits_buf.get()), kHitSizeNotAligned,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
TEST(PostingListTest, PostingListNullBufferFailedPrecondition) {
- static const size_t kHitSize = posting_list_utils::min_posting_list_size();
- EXPECT_THAT(PostingListFree::CreateFromUnitializedRegion(
- /*posting_list_buffer=*/nullptr, kHitSize),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
- EXPECT_THAT(PostingListFree::CreateFromPreexistingPostingListFreeRegion(
- /*posting_list_buffer=*/nullptr, kHitSize),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ PostingListUsedHitSerializer serializer;
+ const size_t kHitSize = serializer.GetMinPostingListSize();
+
+ // nullptr posting_list_buffer
+ EXPECT_THAT(
+ PostingListFree::CreateFromUnitializedRegion(
+ /*posting_list_buffer=*/nullptr, kHitSize,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ PostingListFree::CreateFromPreexistingPostingListFreeRegion(
+ /*posting_list_buffer=*/nullptr, kHitSize,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
TEST(PostingListTest, PostingListFreePreexistingRegion) {
+ PostingListUsedHitSerializer serializer;
constexpr PostingListIndex kOtherPostingListIndex = 12;
static const size_t kHitsSize = 2551 * sizeof(Hit);
@@ -85,7 +106,8 @@ TEST(PostingListTest, PostingListFreePreexistingRegion) {
ICING_ASSERT_OK_AND_ASSIGN(
PostingListFree pl_free,
PostingListFree::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
+ static_cast<void *>(hits_buf.get()), kHitsSize,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()));
pl_free.set_next_posting_list_index(kOtherPostingListIndex);
EXPECT_EQ(pl_free.get_next_posting_list_index(), kOtherPostingListIndex);
}
@@ -95,12 +117,14 @@ TEST(PostingListTest, PostingListFreePreexistingRegion) {
ICING_ASSERT_OK_AND_ASSIGN(
PostingListFree pl_free,
PostingListFree::CreateFromPreexistingPostingListFreeRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
+ static_cast<void *>(hits_buf.get()), kHitsSize,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()));
EXPECT_EQ(pl_free.get_next_posting_list_index(), kOtherPostingListIndex);
}
}
TEST(PostingListTest, PostingListFreeUninitializedRegion) {
+ PostingListUsedHitSerializer serializer;
constexpr PostingListIndex kOtherPostingListIndex = 12;
static const size_t kHitsSize = 2551 * sizeof(Hit);
@@ -110,7 +134,8 @@ TEST(PostingListTest, PostingListFreeUninitializedRegion) {
ICING_ASSERT_OK_AND_ASSIGN(
PostingListFree pl_free,
PostingListFree::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
+ static_cast<void *>(hits_buf.get()), kHitsSize,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()));
pl_free.set_next_posting_list_index(kOtherPostingListIndex);
EXPECT_EQ(pl_free.get_next_posting_list_index(), kOtherPostingListIndex);
}
@@ -120,7 +145,8 @@ TEST(PostingListTest, PostingListFreeUninitializedRegion) {
ICING_ASSERT_OK_AND_ASSIGN(
PostingListFree pl_free,
PostingListFree::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
+ static_cast<void *>(hits_buf.get()), kHitsSize,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()));
EXPECT_EQ(pl_free.get_next_posting_list_index(), kInvalidPostingListIndex);
}
}
diff --git a/icing/index/main/posting-list-identifier.cc b/icing/file/posting_list/posting-list-identifier.cc
index 1cdac65..4491c38 100644
--- a/icing/index/main/posting-list-identifier.cc
+++ b/icing/file/posting_list/posting-list-identifier.cc
@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/main/posting-list-identifier.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+
+#include "icing/file/posting_list/posting-list-common.h"
namespace icing {
namespace lib {
diff --git a/icing/index/main/posting-list-identifier.h b/icing/file/posting_list/posting-list-identifier.h
index 4953865..05c7ce5 100644
--- a/icing/index/main/posting-list-identifier.h
+++ b/icing/file/posting_list/posting-list-identifier.h
@@ -12,11 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_INDEX_POSTING_LIST_IDENTIFIER_H_
-#define ICING_INDEX_POSTING_LIST_IDENTIFIER_H_
+#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_IDENTIFIER_H_
+#define ICING_FILE_POSTING_LIST_POSTING_LIST_IDENTIFIER_H_
-#include "icing/index/main/index-block.h"
-#include "icing/index/main/posting-list-free.h"
+#include <cstdint>
+
+#include "icing/file/posting_list/posting-list-common.h"
#include "icing/legacy/index/icing-bit-util.h"
namespace icing {
@@ -62,9 +63,9 @@ class PostingListIdentifier {
// 2. posting_list_index - the index of this posting list within the block
// 3. posting_list_index_bits - the number of bits needed to encode the
// largest posting_list_index that this block can have.
- PostingListIdentifier(uint32_t block_index,
- PostingListIndex posting_list_index,
- int posting_list_index_bits) {
+ explicit PostingListIdentifier(uint32_t block_index,
+ PostingListIndex posting_list_index,
+ int posting_list_index_bits) {
val_ = 0;
BITFIELD_OR(val_, /*offset=*/0, /*len=*/posting_list_index_bits,
/*val=*/static_cast<uint64_t>(posting_list_index));
@@ -113,4 +114,4 @@ class PostingListIdentifier {
} // namespace lib
} // namespace icing
-#endif // ICING_INDEX_POSTING_LIST_IDENTIFIER_H_
+#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_IDENTIFIER_H_
diff --git a/icing/file/posting_list/posting-list-used.cc b/icing/file/posting_list/posting-list-used.cc
new file mode 100644
index 0000000..370b9c7
--- /dev/null
+++ b/icing/file/posting_list/posting-list-used.cc
@@ -0,0 +1,56 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/posting_list/posting-list-used.h"
+
+#include <cstdint>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/posting_list/posting-list-utils.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::StatusOr<PostingListUsed>
+PostingListUsed::CreateFromPreexistingPostingListUsedRegion(
+ PostingListUsedSerializer* serializer, void* posting_list_buffer,
+ uint32_t size_in_bytes) {
+ ICING_RETURN_ERROR_IF_NULL(serializer);
+ ICING_RETURN_ERROR_IF_NULL(posting_list_buffer);
+
+ if (!posting_list_utils::IsValidPostingListSize(
+ size_in_bytes, serializer->GetDataTypeBytes(),
+ serializer->GetMinPostingListSize())) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Requested posting list size %d is invalid!", size_in_bytes));
+ }
+ return PostingListUsed(posting_list_buffer, size_in_bytes);
+}
+
+libtextclassifier3::StatusOr<PostingListUsed>
+PostingListUsed::CreateFromUnitializedRegion(
+ PostingListUsedSerializer* serializer, void* posting_list_buffer,
+ uint32_t size_in_bytes) {
+ ICING_ASSIGN_OR_RETURN(PostingListUsed posting_list_used,
+ CreateFromPreexistingPostingListUsedRegion(
+ serializer, posting_list_buffer, size_in_bytes));
+ serializer->Clear(&posting_list_used);
+ return posting_list_used;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/posting_list/posting-list-used.h b/icing/file/posting_list/posting-list-used.h
new file mode 100644
index 0000000..ec4b067
--- /dev/null
+++ b/icing/file/posting_list/posting-list-used.h
@@ -0,0 +1,143 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_USED_H_
+#define ICING_FILE_POSTING_LIST_POSTING_LIST_USED_H_
+
+#include <cstdint>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+
+namespace icing {
+namespace lib {
+
+class PostingListUsed;
+
+// Interface for PostingListUsed data serialization and deserialization.
+// - It contains several common methods used by lower level of posting list
+// management related classes (e.g. FlashIndexStorage, IndexBlock,
+// PostingListUsed, etc).
+// - Higher level classes (e.g. MainIndex) create their desired serializers
+// according to the data type they're dealing with, and pass the instance down
+// to all posting list management related classes.
+// - Data specific methods can also be implemented in each serializer. They
+// won't be used by posting list management related classes, but higher level
+// classes are able to call it and deal with the specific data type.
+//
+// E.g. main index stores 'Hit' data into posting lists.
+// - MainIndex creates PostingListUsedHitSerializer instance and uses hit data
+// related methods to serialize/deserialize Hit data to/from posting lists.
+// - FlashIndexStorage, IndexBlock, PostingListUsed use the serializer created
+// by MainIndex, but hold the reference/pointer in the interface format
+// (PostingListUsedSerializer) and only use common interface methods to manage
+// posting list.
+class PostingListUsedSerializer {
+ public:
+ virtual ~PostingListUsedSerializer() = default;
+
+ // Returns byte size of the data type.
+ virtual uint32_t GetDataTypeBytes() const = 0;
+
+ // Returns minimum posting list size allowed.
+ //
+ // Note that min posting list size should also be large enough to store a
+ // single PostingListIndex (for posting list management usage), so we have to
+ // add static_assert in each serializer implementation.
+ // E.g.
+ // static constexpr uint32_t kMinPostingListSize = kSpecialHitsSize;
+ // static_assert(sizeof(PostingListIndex) <= kMinPostingListSize, "");
+ virtual uint32_t GetMinPostingListSize() const = 0;
+
+ // Returns minimum size of posting list that can fit these used bytes
+ // (see MoveFrom).
+ virtual uint32_t GetMinPostingListSizeToFit(
+ const PostingListUsed* posting_list_used) const = 0;
+
+ // Returns bytes used by actual data.
+ virtual uint32_t GetBytesUsed(
+ const PostingListUsed* posting_list_used) const = 0;
+
+ // Clears the posting list. It is usually used for initializing a newly
+ // allocated (or reclaimed from the free posting list chain) posting list.
+ virtual void Clear(PostingListUsed* posting_list_used) const = 0;
+
+ // Moves contents from posting list 'src' to 'dst'. Clears 'src'.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INVALID_ARGUMENT if 'src' is not valid or 'src' is too large to fit in
+ // 'dst'.
+ // - FAILED_PRECONDITION if 'dst' posting list is in a corrupted state.
+ virtual libtextclassifier3::Status MoveFrom(PostingListUsed* dst,
+ PostingListUsed* src) const = 0;
+};
+
+// A posting list with data in it. Layout depends on the serializer.
+class PostingListUsed {
+ public:
+ // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes.
+ // 'Preexisting' means that posting_list_buffer was previously modified by
+ // another instance of PostingListUsed.
+ //
+ // Caller owns the data buffer and must not free it while using a
+ // PostingListUsed.
+ //
+ // RETURNS:
+ // - A valid PostingListUsed if successful
+ // - INVALID_ARGUMENT if posting_list_utils::IsValidPostingListSize check
+ // fails
+ // - FAILED_PRECONDITION if serializer or posting_list_buffer is null
+ static libtextclassifier3::StatusOr<PostingListUsed>
+ CreateFromPreexistingPostingListUsedRegion(
+ PostingListUsedSerializer* serializer, void* posting_list_buffer,
+ uint32_t size_in_bytes);
+
+ // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes
+ // and initializes the content of the buffer so that the returned
+ // PostingListUsed is empty.
+ //
+ // Caller owns the posting_list_buffer buffer and must not free it while using
+ // a PostingListUsed.
+ //
+ // RETURNS:
+ // - A valid PostingListUsed if successful
+ // - INVALID_ARGUMENT if posting_list_utils::IsValidPostingListSize check
+ // fails
+ // - FAILED_PRECONDITION if serializer or posting_list_buffer is null
+ static libtextclassifier3::StatusOr<PostingListUsed>
+ CreateFromUnitializedRegion(PostingListUsedSerializer* serializer,
+ void* posting_list_buffer,
+ uint32_t size_in_bytes);
+
+ uint8_t* posting_list_buffer() { return posting_list_buffer_; }
+ const uint8_t* posting_list_buffer() const { return posting_list_buffer_; }
+
+ uint32_t size_in_bytes() const { return size_in_bytes_; }
+
+ private:
+ explicit PostingListUsed(void* posting_list_buffer, uint32_t size_in_bytes)
+ : posting_list_buffer_(static_cast<uint8_t*>(posting_list_buffer)),
+ size_in_bytes_(size_in_bytes) {}
+
+ // A byte array of size size_in_bytes_ containing encoded data for this
+ // posting list.
+ uint8_t* posting_list_buffer_; // does not own!
+ uint32_t size_in_bytes_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_USED_H_
diff --git a/icing/index/main/posting-list-utils.cc b/icing/file/posting_list/posting-list-utils.cc
index b734767..2adbc26 100644
--- a/icing/index/main/posting-list-utils.cc
+++ b/icing/file/posting_list/posting-list-utils.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/main/posting-list-utils.h"
+#include "icing/file/posting_list/posting-list-utils.h"
#include "icing/legacy/index/icing-bit-util.h"
#include "icing/util/logging.h"
@@ -22,27 +22,28 @@ namespace lib {
namespace posting_list_utils {
-bool IsValidPostingListSize(uint32_t size_in_bytes) {
- // size must be sizeof(Hit) aligned. Otherwise, we can have serious
+bool IsValidPostingListSize(uint32_t size_in_bytes, uint32_t data_type_bytes,
+ uint32_t min_posting_list_size) {
+ // size must be data_type_bytes aligned. Otherwise, we can have serious
// wasted space in the worst case.
- if (size_in_bytes % sizeof(Hit) != 0) {
- ICING_LOG(ERROR) << "Size " << size_in_bytes << " hit " << sizeof(Hit);
+ if (size_in_bytes % data_type_bytes != 0) {
+ ICING_LOG(ERROR) << "Size " << size_in_bytes << " data " << data_type_bytes;
return false;
}
// Must be able to store the min information.
- if (size_in_bytes < min_posting_list_size()) {
+ if (size_in_bytes < min_posting_list_size) {
ICING_LOG(ERROR) << "Size " << size_in_bytes << " is less than min size "
- << min_posting_list_size();
+ << min_posting_list_size;
return false;
}
- // We re-use the first two hits as pointers into the posting list
- // so the posting list size must fit in sizeof(Hit).
- if (BitsToStore(size_in_bytes) > sizeof(Hit::Value) * 8) {
+ // We re-use the first two data elements as pointers into the posting list,
+ // so the posting list size must fit in data_type_bytes.
+ if (BitsToStore(size_in_bytes) > data_type_bytes * 8) {
ICING_LOG(ERROR)
<< "Posting list size must be small enough to store the offset in "
- << sizeof(Hit::Value) * 8 << " bytes.";
+ << data_type_bytes << " bytes.";
return false;
}
diff --git a/icing/index/main/posting-list-utils.h b/icing/file/posting_list/posting-list-utils.h
index 77537a7..6a1e28c 100644
--- a/icing/index/main/posting-list-utils.h
+++ b/icing/file/posting_list/posting-list-utils.h
@@ -12,34 +12,26 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_INDEX_MAIN_POSTING_LIST_UTILS_H_
-#define ICING_INDEX_MAIN_POSTING_LIST_UTILS_H_
+#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_UTILS_H_
+#define ICING_FILE_POSTING_LIST_POSTING_LIST_UTILS_H_
#include <cstdint>
-#include "icing/index/hit/hit.h"
-
namespace icing {
namespace lib {
namespace posting_list_utils {
-// Represents the byte length of the two special hits described
-// in the private section of posting-list-used.h.
-inline constexpr uint32_t kNumSpecialHits = 2;
-inline constexpr uint32_t kSpecialHitsSize = sizeof(Hit) * kNumSpecialHits;
-
-constexpr uint32_t min_posting_list_size() { return kSpecialHitsSize; }
-
// For a posting list size to be valid, it must:
-// 1) be sizeof(Hit) aligned
+// 1) be data_type_bytes aligned
// 2) be equal to or larger than min_posting_list_size
-// 3) be small enough to be encoded within a single Hit (5 bytes)
-bool IsValidPostingListSize(uint32_t size_in_bytes);
+// 3) be small enough to be encoded within a single data (data_type_bytes)
+bool IsValidPostingListSize(uint32_t size_in_bytes, uint32_t data_type_bytes,
+ uint32_t min_posting_list_size);
} // namespace posting_list_utils
} // namespace lib
} // namespace icing
-#endif // ICING_INDEX_MAIN_POSTING_LIST_UTILS_H_
+#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_UTILS_H_
diff --git a/icing/icing-search-engine-with-icu-file_test.cc b/icing/icing-search-engine-with-icu-file_test.cc
index 1012b47..39f9df0 100644
--- a/icing/icing-search-engine-with-icu-file_test.cc
+++ b/icing/icing-search-engine-with-icu-file_test.cc
@@ -34,17 +34,10 @@
namespace icing {
namespace lib {
namespace {
+
using ::icing::lib::portable_equals_proto::EqualsProto;
using ::testing::Eq;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
- PropertyConfigProto::Cardinality::REQUIRED;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
-
std::string GetTestBaseDir() {
return GetTestTempDir() + "/icing_with_icu_files";
}
@@ -79,7 +72,7 @@ TEST(IcingSearchEngineWithIcuFileTest, ShouldInitialize) {
.AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REQUIRED)))
.Build();
EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
@@ -94,7 +87,7 @@ TEST(IcingSearchEngineWithIcuFileTest, ShouldIndexAndSearch) {
.AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REQUIRED)))
.Build();
ASSERT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
diff --git a/icing/icing-search-engine_backwards_compatibility_test.cc b/icing/icing-search-engine_backwards_compatibility_test.cc
index 2574313..b9233cb 100644
--- a/icing/icing-search-engine_backwards_compatibility_test.cc
+++ b/icing/icing-search-engine_backwards_compatibility_test.cc
@@ -41,12 +41,6 @@ namespace {
using ::icing::lib::portable_equals_proto::EqualsProto;
using ::testing::Eq;
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-
// For mocking purpose, we allow tests to provide a custom Filesystem.
class TestIcingSearchEngine : public IcingSearchEngine {
public:
@@ -141,16 +135,16 @@ TEST_F(IcingSearchEngineBackwardsCompatibilityTest,
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType("email")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
// Make sure our schema is still the same as we expect. If not, there's
@@ -281,16 +275,16 @@ TEST_F(IcingSearchEngineBackwardsCompatibilityTest, MigrateToLargerScale) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType("email")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
// Make sure our schema is still the same as we expect. If not, there's
diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc
index 6448ee1..2cf19ad 100644
--- a/icing/icing-search-engine_fuzz_test.cc
+++ b/icing/icing-search-engine_fuzz_test.cc
@@ -34,14 +34,6 @@ namespace icing {
namespace lib {
namespace {
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
- PropertyConfigProto::Cardinality::REQUIRED;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
-
IcingSearchEngineOptions Setup() {
IcingSearchEngineOptions icing_options;
icing_options.set_base_dir(GetTestTempDir() + "/icing");
@@ -86,7 +78,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
.AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REQUIRED)))
.Build();
icing.SetSchema(schema_proto);
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index bca83dc..7a60101 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -97,21 +97,6 @@ constexpr std::string_view kIpsumText =
"vehicula posuere vitae, convallis eu lorem. Donec semper augue eu nibh "
"placerat semper.";
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
- PropertyConfigProto::Cardinality::REQUIRED;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
- PropertyConfigProto::Cardinality::REPEATED;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE =
- StringIndexingConfig::TokenizerType::NONE;
-
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
-constexpr TermMatchType::Code MATCH_NONE = TermMatchType::UNKNOWN;
-
PortableFileBackedProtoLog<DocumentWrapper>::Header ReadDocumentLogHeader(
Filesystem filesystem, const std::string& file_path) {
PortableFileBackedProtoLog<DocumentWrapper>::Header header;
@@ -219,51 +204,55 @@ SchemaProto CreateMessageSchema() {
.AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REQUIRED)))
.Build();
}
SchemaProto CreateEmailSchema() {
return SchemaBuilder()
- .AddType(
- SchemaTypeConfigBuilder()
- .SetType("Email")
- .AddProperty(PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REQUIRED))
- .AddProperty(PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
.Build();
}
SchemaProto CreatePersonAndEmailSchema() {
return SchemaBuilder()
- .AddType(
- SchemaTypeConfigBuilder()
- .SetType("Person")
- .AddProperty(PropertyConfigBuilder()
- .SetName("name")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(PropertyConfigBuilder()
- .SetName("emailAddress")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(
SchemaTypeConfigBuilder()
.SetType("Email")
- .AddProperty(PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(PropertyConfigBuilder()
.SetName("sender")
.SetDataTypeDocument(
@@ -1285,10 +1274,11 @@ TEST_F(IcingSearchEngineTest,
SchemaTypeConfigProto person_proto =
SchemaTypeConfigBuilder()
.SetType("Person")
- .AddProperty(PropertyConfigBuilder()
- .SetName("name")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
.Build();
SchemaProto nested_schema =
SchemaBuilder()
@@ -1301,11 +1291,11 @@ TEST_F(IcingSearchEngineTest,
"Person",
/*index_nested_properties=*/true)
.SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
SetSchemaResultProto set_schema_result = icing.SetSchema(nested_schema);
@@ -1373,11 +1363,11 @@ TEST_F(IcingSearchEngineTest,
"Person",
/*index_nested_properties=*/false)
.SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
set_schema_result = icing.SetSchema(no_nested_schema);
@@ -1415,16 +1405,16 @@ TEST_F(IcingSearchEngineTest,
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType("Email")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
SetSchemaResultProto set_schema_result =
@@ -1472,7 +1462,7 @@ TEST_F(IcingSearchEngineTest,
.AddType(SchemaTypeConfigBuilder().SetType("Email").AddProperty(
PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
@@ -1507,16 +1497,16 @@ TEST_F(
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType("Email")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
SetSchemaResultProto set_schema_result =
@@ -1564,16 +1554,16 @@ TEST_F(
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType("Email")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("to")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("to")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
set_schema_result = icing.SetSchema(
@@ -1608,25 +1598,26 @@ TEST_F(IcingSearchEngineTest, ForceSetSchemaIncompatibleNestedDocsAreDeleted) {
.SetDataTypeDocument("Person",
/*index_nested_properties=*/true)
.SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
.Build();
SchemaProto nested_schema =
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType("Person")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("name")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("company")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("company")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(email_schema_type)
.Build();
@@ -1681,7 +1672,7 @@ TEST_F(IcingSearchEngineTest, ForceSetSchemaIncompatibleNestedDocsAreDeleted) {
.AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
PropertyConfigBuilder()
.SetName("name")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(email_schema_type)
.Build();
@@ -6833,7 +6824,7 @@ TEST_F(IcingSearchEngineTest,
.AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
PropertyConfigBuilder()
.SetName("unindexedField")
- .SetDataTypeString(MATCH_NONE, TOKENIZER_NONE)
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)
.SetCardinality(CARDINALITY_REQUIRED)))
.Build();
ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
@@ -7294,16 +7285,16 @@ TEST_F(IcingSearchEngineTest,
.AddType(
SchemaTypeConfigBuilder()
.SetType("Message")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REQUIRED))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
// Write the marker file
std::string marker_filepath =
@@ -8148,7 +8139,7 @@ TEST_F(IcingSearchEngineTest, SnippetErrorTest) {
.AddType(SchemaTypeConfigBuilder().SetType("Generic").AddProperty(
PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REPEATED)))
.Build();
ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
@@ -8257,7 +8248,7 @@ TEST_F(IcingSearchEngineTest, CJKSnippetTest) {
// Search and request snippet matching but no windowing.
SearchSpecProto search_spec;
search_spec.set_query("走");
- search_spec.set_term_match_type(MATCH_PREFIX);
+ search_spec.set_term_match_type(TERM_MATCH_PREFIX);
ResultSpecProto result_spec;
result_spec.mutable_snippet_spec()->set_num_to_snippet(
@@ -8328,7 +8319,7 @@ TEST_F(IcingSearchEngineTest, InvalidToEmptyQueryTest) {
// Search and request snippet matching but no windowing.
SearchSpecProto search_spec;
search_spec.set_query("?");
- search_spec.set_term_match_type(MATCH_PREFIX);
+ search_spec.set_term_match_type(TERM_MATCH_PREFIX);
ScoringSpecProto scoring_spec;
ResultSpecProto result_spec;
@@ -8394,7 +8385,7 @@ TEST_F(IcingSearchEngineTest, EmojiSnippetTest) {
// Search and request snippet matching but no windowing.
SearchSpecProto search_spec;
search_spec.set_query("🐟");
- search_spec.set_term_match_type(MATCH_PREFIX);
+ search_spec.set_term_match_type(TERM_MATCH_PREFIX);
ResultSpecProto result_spec;
result_spec.mutable_snippet_spec()->set_num_to_snippet(1);
@@ -8455,7 +8446,7 @@ TEST_F(IcingSearchEngineTest, PutDocumentIndexFailureDeletion) {
// Make sure that the document isn't searchable.
SearchSpecProto search_spec;
search_spec.set_query("foo");
- search_spec.set_term_match_type(MATCH_PREFIX);
+ search_spec.set_term_match_type(TERM_MATCH_PREFIX);
SearchResultProto search_results =
icing.Search(search_spec, ScoringSpecProto::default_instance(),
@@ -9103,12 +9094,12 @@ TEST_F(IcingSearchEngineTest,
.AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REQUIRED)))
.AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
PropertyConfigBuilder()
.SetName("name")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(SchemaTypeConfigBuilder()
.SetType("Email")
@@ -9118,11 +9109,11 @@ TEST_F(IcingSearchEngineTest,
"Person",
/*index_nested_properties=*/true)
.SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
@@ -9176,7 +9167,7 @@ TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_SchemaTypeNotFound) {
.AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REQUIRED)))
.Build();
ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
@@ -9312,12 +9303,12 @@ TEST_F(IcingSearchEngineTest,
.AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REQUIRED)))
.AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
PropertyConfigBuilder()
.SetName("name")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(SchemaTypeConfigBuilder()
.SetType("Email")
@@ -9327,11 +9318,11 @@ TEST_F(IcingSearchEngineTest,
"Person",
/*index_nested_properties=*/true)
.SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
@@ -9391,12 +9382,12 @@ TEST_F(IcingSearchEngineTest,
.AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REQUIRED)))
.AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
PropertyConfigBuilder()
.SetName("name")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(SchemaTypeConfigBuilder()
.SetType("Email")
@@ -9406,11 +9397,11 @@ TEST_F(IcingSearchEngineTest,
"Person",
/*index_nested_properties=*/true)
.SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
@@ -9447,7 +9438,7 @@ TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_OrderByTermFrequency) {
.AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REQUIRED)))
.Build();
ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
@@ -9697,49 +9688,49 @@ TEST_F(IcingSearchEngineTest, IcingShouldWorkFor64Sections) {
.AddType(SchemaTypeConfigBuilder()
// Person has 4 sections.
.SetType("Person")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("firstName")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("lastName")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("emailAddress")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("phoneNumber")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("firstName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("lastName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("phoneNumber")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(SchemaTypeConfigBuilder()
// Email has 16 sections.
.SetType("Email")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("date")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("time")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("date")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("time")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(
PropertyConfigBuilder()
.SetName("sender")
diff --git a/icing/index/hit/hit.cc b/icing/index/hit/hit.cc
index ce1c366..493e62b 100644
--- a/icing/index/hit/hit.cc
+++ b/icing/index/hit/hit.cc
@@ -35,9 +35,20 @@ enum FlagOffset {
kHasTermFrequency = 2,
kNumFlags = 3,
};
+
+static_assert(kDocumentIdBits + kSectionIdBits + kNumFlags <
+ sizeof(Hit::Value) * 8,
+ "Hit::kInvalidValue contains risky value and we should have at "
+ "least one unused bit to avoid potential bugs. Please follow the "
+ "process mentioned in hit.h to correct the value of "
+ "Hit::kInvalidValue and remove this static_assert afterwards.");
+
static_assert(kDocumentIdBits + kSectionIdBits + kNumFlags <=
sizeof(Hit::Value) * 8,
"HitOverflow");
+static_assert(kDocumentIdBits == 22, "");
+static_assert(kSectionIdBits == 6, "");
+static_assert(kNumFlags == 3, "");
inline DocumentId InvertDocumentId(DocumentId document_id) {
static_assert(kMaxDocumentId <= (std::numeric_limits<DocumentId>::max() - 1),
@@ -52,6 +63,31 @@ inline DocumentId InvertDocumentId(DocumentId document_id) {
} // namespace
+BasicHit::BasicHit(SectionId section_id, DocumentId document_id) {
+ // Values are stored so that when sorted, they appear in document_id
+ // descending, section_id ascending, order. So inverted document_id appears in
+ // the most significant bits, followed by (uninverted) section_id.
+ Value temp_value = 0;
+ bit_util::BitfieldSet(/*new_value=*/InvertDocumentId(document_id),
+ /*lsb_offset=*/kSectionIdBits, /*len=*/kDocumentIdBits,
+ /*value_out=*/&temp_value);
+ bit_util::BitfieldSet(/*new_value=*/section_id, /*lsb_offset=*/0,
+ /*len=*/kSectionIdBits, /*value_out=*/&temp_value);
+ value_ = temp_value;
+}
+
+DocumentId BasicHit::document_id() const {
+ DocumentId inverted_document_id = bit_util::BitfieldGet(
+ value_, /*lsb_offset=*/kSectionIdBits, /*len=*/kDocumentIdBits);
+ // Undo the document_id inversion.
+ return InvertDocumentId(inverted_document_id);
+}
+
+SectionId BasicHit::section_id() const {
+ return bit_util::BitfieldGet(value_, /*lsb_offset=*/0,
+ /*len=*/kSectionIdBits);
+}
+
Hit::Hit(SectionId section_id, DocumentId document_id,
Hit::TermFrequency term_frequency, bool is_in_prefix_section,
bool is_prefix_hit)
diff --git a/icing/index/hit/hit.h b/icing/index/hit/hit.h
index 35c9238..111b320 100644
--- a/icing/index/hit/hit.h
+++ b/icing/index/hit/hit.h
@@ -26,6 +26,54 @@
namespace icing {
namespace lib {
+// BasicHit is a specific encoding that refers to content within a document. A
+// basic hit consists of:
+// - a DocumentId
+// - a SectionId
+// referring to the document and section that the hit corresponds to.
+//
+// The hit is the most basic unit of the index and, when grouped together by
+// term, can be used to encode what terms appear in what documents.
+//
+// BasicHit is for indices (e.g. numeric index) that don't require term
+// frequency.
+class BasicHit {
+ public:
+ // The datatype used to encode BasicHit information: the document_id and
+ // section_id.
+ using Value = uint32_t;
+
+ // WARNING: Changing this value will invalidate any pre-existing posting lists
+ // on user devices.
+ //
+ // kInvalidValue contains:
+ // - 0 for unused bits. Note that unused bits are always 0 for both valid and
+ // invalid BasicHit values.
+ // - Inverted kInvalidDocumentId
+ // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has
+ // already invalidated the value. In fact, we currently use all 2^6 section
+ // ids and there is no "invalid section id", so it doesn't matter what
+ // SectionId we set for kInvalidValue.
+ static constexpr Value kInvalidValue = 0;
+
+ explicit BasicHit(SectionId section_id, DocumentId document_id);
+
+ explicit BasicHit() : value_(kInvalidValue) {}
+
+ bool is_valid() const { return value_ != kInvalidValue; }
+ Value value() const { return value_; }
+ DocumentId document_id() const;
+ SectionId section_id() const;
+
+ bool operator<(const BasicHit& h2) const { return value_ < h2.value_; }
+ bool operator==(const BasicHit& h2) const { return value_ == h2.value_; }
+
+ private:
+ // Value bits layout: 4 unused + 22 document_id + 6 section id.
+ Value value_;
+} __attribute__((packed));
+static_assert(sizeof(BasicHit) == 4, "");
+
// Hit is a specific encoding that refers to content within a document. A hit
// consists of:
// - a DocumentId
@@ -36,7 +84,8 @@ namespace lib {
// - whether the Hit does not appear exactly in the document, but instead
// represents a term that is a prefix of a term in the document
// - whether the Hit came from a section that has prefix expansion enabled
-// and a term frequency for the hit.
+// and a term frequency for the hit.
+//
// The hit is the most basic unit of the index and, when grouped together by
// term, can be used to encode what terms appear in what documents.
class Hit {
@@ -47,6 +96,26 @@ class Hit {
// WARNING: Changing this value will invalidate any pre-existing posting lists
// on user devices.
+ //
+ // WARNING:
+ // - Hit::kInvalidValue should contain inverted kInvalidDocumentId, which is
+ // b'00...0. However, currently we set it as UINT32_MAX and actually it
+ // contains b'11...1, which is the inverted document_id 0.
+ // - It means Hit::kInvalidValue contains valid (document_id, section_id,
+ // flags), so we potentially cannot distinguish if a Hit is invalid or not.
+ // The invalidity is an essential feature for posting list since we use it
+ // to determine the state of the posting list.
+ // - The reason why it won't break the current posting list is because the
+ // unused bit(s) are set as 1 for Hit::kInvalidValue and 0 for all valid
+ // Hits. In other words, the unused bit(s) are actually serving as "invalid
+ // flag".
+ // - If we want to exhaust all unused bits in the future, then we have to
+ // change Hit::kInvalidValue to set the inverted document_id section
+ // correctly (b'00...0, refer to BasicHit::kInvalidValue as an example).
+ // - Also this problem is guarded by static_assert in hit.cc. If exhausting
+ // all unused bits, then the static_assert will detect and fail. We can
+ // safely remove the static_assert check after following the above process
+ // to resolve the incorrect Hit::kInvalidValue issue.
static constexpr Value kInvalidValue = std::numeric_limits<Value>::max();
// Docs are sorted in reverse, and 0 is never used as the inverted
// DocumentId (because it is the inverse of kInvalidValue), so it is always
@@ -91,7 +160,7 @@ class Hit {
private:
// Value and TermFrequency must be in this order.
- // Value bits layout: 5 unused + 20 document_id + 4 section id + 3 flags.
+ // Value bits layout: 1 unused + 22 document_id + 6 section id + 3 flags.
Value value_;
TermFrequency term_frequency_;
} __attribute__((packed));
diff --git a/icing/index/hit/hit_test.cc b/icing/index/hit/hit_test.cc
index d47ca37..0086d91 100644
--- a/icing/index/hit/hit_test.cc
+++ b/icing/index/hit/hit_test.cc
@@ -26,6 +26,7 @@ namespace {
using ::testing::ElementsAre;
using ::testing::Eq;
+using ::testing::Ge;
using ::testing::IsFalse;
using ::testing::IsTrue;
using ::testing::Lt;
@@ -35,6 +36,63 @@ static constexpr DocumentId kSomeDocumentId = 24;
static constexpr SectionId kSomeSectionid = 5;
static constexpr Hit::TermFrequency kSomeTermFrequency = 57;
+TEST(BasicHitTest, Accessors) {
+ BasicHit h1(kSomeSectionid, kSomeDocumentId);
+ EXPECT_THAT(h1.document_id(), Eq(kSomeDocumentId));
+ EXPECT_THAT(h1.section_id(), Eq(kSomeSectionid));
+}
+
+TEST(BasicHitTest, Invalid) {
+ BasicHit default_invalid;
+ EXPECT_THAT(default_invalid.is_valid(), IsFalse());
+
+ // Also make sure the invalid BasicHit contains an invalid document id.
+ EXPECT_THAT(default_invalid.document_id(), Eq(kInvalidDocumentId));
+ EXPECT_THAT(default_invalid.section_id(), Eq(kMinSectionId));
+}
+
+TEST(BasicHitTest, Valid) {
+ BasicHit maximum_document_id_hit(kSomeSectionid, kMaxDocumentId);
+ EXPECT_THAT(maximum_document_id_hit.is_valid(), IsTrue());
+
+ BasicHit maximum_section_id_hit(kMaxSectionId, kSomeDocumentId);
+ EXPECT_THAT(maximum_section_id_hit.is_valid(), IsTrue());
+
+ BasicHit minimum_document_id_hit(kSomeSectionid, kMinDocumentId);
+ EXPECT_THAT(minimum_document_id_hit.is_valid(), IsTrue());
+
+ BasicHit minimum_section_id_hit(kMinSectionId, kSomeDocumentId);
+ EXPECT_THAT(minimum_section_id_hit.is_valid(), IsTrue());
+
+ BasicHit all_maximum_hit(kMaxSectionId, kMaxDocumentId);
+ EXPECT_THAT(all_maximum_hit.is_valid(), IsTrue());
+
+ BasicHit all_minimum_hit(kMinSectionId, kMinDocumentId);
+ EXPECT_THAT(all_minimum_hit.is_valid(), IsTrue());
+
+ // We use invalid BasicHit for std::lower_bound. Verify that value of the
+ // smallest valid BasicHit (which contains kMinSectionId, kMaxDocumentId) is
+ // >= BasicHit::kInvalidValue.
+ BasicHit smallest_hit(kMinSectionId, kMaxDocumentId);
+ ASSERT_THAT(smallest_hit.is_valid(), IsTrue());
+ EXPECT_THAT(smallest_hit.value(), Ge(BasicHit::kInvalidValue));
+}
+
+TEST(BasicHitTest, Comparison) {
+ BasicHit hit(/*section_id=*/1, /*document_id=*/243);
+ // DocumentIds are sorted in ascending order. So a hit with a lower
+ // document_id should be considered greater than one with a higher
+ // document_id.
+ BasicHit higher_document_id_hit(/*section_id=*/1, /*document_id=*/2409);
+ BasicHit higher_section_id_hit(/*section_id=*/15, /*document_id=*/243);
+
+ std::vector<BasicHit> hits{hit, higher_document_id_hit,
+ higher_section_id_hit};
+ std::sort(hits.begin(), hits.end());
+ EXPECT_THAT(hits,
+ ElementsAre(higher_document_id_hit, hit, higher_section_id_hit));
+}
+
TEST(HitTest, HasTermFrequencyFlag) {
Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency);
EXPECT_THAT(h1.has_term_frequency(), IsFalse());
@@ -101,6 +159,17 @@ TEST(HitTest, Valid) {
Hit minimum_section_id_hit(0, kSomeDocumentId, kSomeTermFrequency);
EXPECT_THAT(minimum_section_id_hit.is_valid(), IsTrue());
+
+ // We use Hit with value Hit::kMaxDocumentIdSortValue for std::lower_bound in
+ // the lite index. Verify that the value of the smallest valid Hit (which
+ // contains kMinSectionId, kMaxDocumentId and 3 flags = false) is >=
+ // Hit::kMaxDocumentIdSortValue.
+ Hit smallest_hit(kMinSectionId, kMaxDocumentId, Hit::kDefaultTermFrequency);
+ ASSERT_THAT(smallest_hit.is_valid(), IsTrue());
+ ASSERT_THAT(smallest_hit.has_term_frequency(), IsFalse());
+ ASSERT_THAT(smallest_hit.is_prefix_hit(), IsFalse());
+ ASSERT_THAT(smallest_hit.is_in_prefix_section(), IsFalse());
+ EXPECT_THAT(smallest_hit.value(), Ge(Hit::kMaxDocumentIdSortValue));
}
TEST(HitTest, Comparison) {
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc
index a1dacde..cfeda31 100644
--- a/icing/index/index-processor.cc
+++ b/icing/index/index-processor.cc
@@ -89,6 +89,8 @@ libtextclassifier3::Status IndexProcessor::IndexDocument(
[[fallthrough]];
case StringIndexingConfig::TokenizerType::RFC822:
[[fallthrough]];
+ case StringIndexingConfig::TokenizerType::URL:
+ [[fallthrough]];
case StringIndexingConfig::TokenizerType::PLAIN:
std::string normalized_term = normalizer_.NormalizeTerm(token);
status = editor.BufferTerm(normalized_term.c_str());
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index e961b0e..3c848d3 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -94,6 +94,12 @@ constexpr std::string_view kNestedProperty = "nested";
constexpr std::string_view kExactVerbatimProperty = "verbatimExact";
constexpr std::string_view kPrefixedVerbatimProperty = "verbatimPrefixed";
constexpr std::string_view kRfc822Property = "rfc822";
+// TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export
+// to Android.
+#ifdef ENABLE_URL_TOKENIZER
+constexpr std::string_view kExactUrlProperty = "urlExact";
+constexpr std::string_view kPrefixedUrlProperty = "urlPrefixed";
+#endif // ENABLE_URL_TOKENIZER
constexpr DocumentId kDocumentId0 = 0;
constexpr DocumentId kDocumentId1 = 1;
@@ -103,8 +109,15 @@ constexpr SectionId kPrefixedSectionId = 1;
constexpr SectionId kRepeatedSectionId = 2;
constexpr SectionId kRfc822SectionId = 3;
constexpr SectionId kNestedSectionId = 4;
+#ifdef ENABLE_URL_TOKENIZER
+constexpr SectionId kUrlExactSectionId = 5;
+constexpr SectionId kUrlPrefixedSectionId = 6;
+constexpr SectionId kExactVerbatimSectionId = 7;
+constexpr SectionId kPrefixedVerbatimSectionId = 8;
+#else // !ENABLE_URL_TOKENIZER
constexpr SectionId kExactVerbatimSectionId = 5;
constexpr SectionId kPrefixedVerbatimSectionId = 6;
+#endif // ENABLE_URL_TOKENIZER
using Cardinality = PropertyConfigProto::Cardinality;
using DataType = PropertyConfigProto::DataType;
@@ -113,25 +126,10 @@ using ::testing::Eq;
using ::testing::IsEmpty;
using ::testing::Test;
-constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
- PropertyConfigProto::DataType::STRING;
-constexpr PropertyConfigProto::DataType::Code TYPE_BYTES =
- PropertyConfigProto::DataType::BYTES;
-
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
- PropertyConfigProto::Cardinality::REPEATED;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM =
- StringIndexingConfig::TokenizerType::VERBATIM;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_RFC822 =
- StringIndexingConfig::TokenizerType::RFC822;
-
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
+#ifdef ENABLE_URL_TOKENIZER
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_URL =
+ StringIndexingConfig::TokenizerType::URL;
+#endif // ENABLE_URL_TOKENIZER
class IndexProcessorTest : public Test {
protected:
@@ -169,16 +167,16 @@ class IndexProcessorTest : public Test {
.AddType(
SchemaTypeConfigBuilder()
.SetType(kFakeType)
- .AddProperty(
- PropertyConfigBuilder()
- .SetName(kExactProperty)
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName(kPrefixedProperty)
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kExactProperty)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPrefixedProperty)
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(PropertyConfigBuilder()
.SetName(kUnindexedProperty1)
.SetDataType(TYPE_STRING)
@@ -187,26 +185,38 @@ class IndexProcessorTest : public Test {
.SetName(kUnindexedProperty2)
.SetDataType(TYPE_BYTES)
.SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kRepeatedProperty)
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kExactVerbatimProperty)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_VERBATIM)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPrefixedVerbatimProperty)
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_VERBATIM)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kRfc822Property)
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_RFC822)
+ .SetCardinality(CARDINALITY_REPEATED))
+#ifdef ENABLE_URL_TOKENIZER
.AddProperty(
PropertyConfigBuilder()
- .SetName(kRepeatedProperty)
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName(kExactVerbatimProperty)
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_VERBATIM)
- .SetCardinality(CARDINALITY_REPEATED))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName(kPrefixedVerbatimProperty)
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_VERBATIM)
+ .SetName(kExactUrlProperty)
+ .SetDataTypeString(MATCH_EXACT, TOKENIZER_URL)
.SetCardinality(CARDINALITY_REPEATED))
.AddProperty(
PropertyConfigBuilder()
- .SetName(kRfc822Property)
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_RFC822)
+ .SetName(kPrefixedUrlProperty)
+ .SetDataTypeString(MATCH_PREFIX, TOKENIZER_URL)
.SetCardinality(CARDINALITY_REPEATED))
+#endif // ENABLE_URL_TOKENIZER
.AddProperty(
PropertyConfigBuilder()
.SetName(kSubProperty)
@@ -216,11 +226,11 @@ class IndexProcessorTest : public Test {
.AddType(
SchemaTypeConfigBuilder()
.SetType(kNestedType)
- .AddProperty(
- PropertyConfigBuilder()
- .SetName(kNestedProperty)
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kNestedProperty)
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
ICING_ASSERT_OK(schema_store_->SetSchema(schema));
@@ -1075,6 +1085,191 @@ TEST_F(IndexProcessorTest, Rfc822PropertyNoMatch) {
EXPECT_THAT(hits, IsEmpty());
}
+#ifdef ENABLE_URL_TOKENIZER
+TEST_F(IndexProcessorTest, ExactUrlProperty) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactUrlProperty),
+ "http://www.google.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_tokens(), 7);
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("google", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
+ {kUrlExactSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(itr,
+ index_->GetIterator("http", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ expected_map = {{kUrlExactSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("www.google.com", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ expected_map = {{kUrlExactSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("http://www.google.com", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ expected_map = {{kUrlExactSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+}
+
+TEST_F(IndexProcessorTest, ExactUrlPropertyDoesNotMatchPrefix) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactUrlProperty),
+ "https://mail.google.com/calendar/render")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_tokens(), 8);
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("co", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+
+ ICING_ASSERT_OK_AND_ASSIGN(itr,
+ index_->GetIterator("mail.go", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("mail.google.com", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+}
+
+TEST_F(IndexProcessorTest, PrefixUrlProperty) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPrefixedUrlProperty),
+ "http://www.google.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_tokens(), 7);
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ // "goo" is a prefix of "google" and "google.com"
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("goo", kSectionIdMaskAll, TermMatchType::PREFIX));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
+ {kUrlPrefixedSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+
+ // "http" is a prefix of "http" and "http://www.google.com"
+ ICING_ASSERT_OK_AND_ASSIGN(itr, index_->GetIterator("http", kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ expected_map = {{kUrlPrefixedSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+
+ // "www.go" is a prefix of "www.google.com"
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr,
+ index_->GetIterator("www.go", kSectionIdMaskAll, TermMatchType::PREFIX));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ expected_map = {{kUrlPrefixedSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+}
+
+TEST_F(IndexProcessorTest, PrefixUrlPropertyNoMatch) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPrefixedUrlProperty),
+ "https://mail.google.com/calendar/render")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_tokens(), 8);
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ // no token starts with "gle", so we should have no hits
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("gle", kSectionIdMaskAll, TermMatchType::PREFIX));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr,
+ index_->GetIterator("w.goo", kSectionIdMaskAll, TermMatchType::PREFIX));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+
+ // tokens have separators removed, so no hits here
+ ICING_ASSERT_OK_AND_ASSIGN(itr, index_->GetIterator(".com", kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("calendar/render", kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+}
+#endif // ENABLE_URL_TOKENIZER
+
} // namespace
} // namespace lib
diff --git a/icing/index/iterator/doc-hit-info-iterator-and.cc b/icing/index/iterator/doc-hit-info-iterator-and.cc
index 6bde8e6..3b7ede9 100644
--- a/icing/index/iterator/doc-hit-info-iterator-and.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-and.cc
@@ -55,11 +55,12 @@ std::unique_ptr<DocHitInfoIterator> CreateAndIterator(
if (iterators.size() <= kBinaryAndIteratorPerformanceThreshold &&
iterators.size() >= kMinBinaryIterators) {
// Accumulate the iterators that need to be ANDed together.
- iterator = std::move(iterators.at(0));
- for (size_t i = 1; i < iterators.size(); ++i) {
+ iterator = std::move(iterators.at(iterators.size() - 1));
+ for (int i = iterators.size() - 2; i >= 0; --i) {
std::unique_ptr<DocHitInfoIterator> temp_iterator = std::move(iterator);
iterator = std::make_unique<DocHitInfoIteratorAnd>(
- std::move(temp_iterator), std::move(iterators[i]));
+ /*short_it=*/std::move(iterators[i]),
+ /*long_it=*/std::move(temp_iterator));
}
} else {
// If the vector is too small, the AndNary iterator can handle it and return
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
index 485f85b..e80d8f0 100644
--- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
@@ -48,14 +48,6 @@ using ::testing::ElementsAreArray;
using ::testing::Eq;
using ::testing::IsEmpty;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-
class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test {
protected:
DocHitInfoIteratorSectionRestrictTest()
@@ -74,7 +66,7 @@ class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test {
.AddProperty(
PropertyConfigBuilder()
.SetName(indexed_property_)
- .SetDataTypeString(MATCH_EXACT,
+ .SetDataTypeString(TERM_MATCH_EXACT,
TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
diff --git a/icing/index/main/doc-hit-info-iterator-term-main.cc b/icing/index/main/doc-hit-info-iterator-term-main.cc
index 4bd87aa..098a450 100644
--- a/icing/index/main/doc-hit-info-iterator-term-main.cc
+++ b/icing/index/main/doc-hit-info-iterator-term-main.cc
@@ -20,12 +20,13 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/main/posting-list-accessor.h"
-#include "icing/index/main/posting-list-identifier.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
+#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
namespace icing {
diff --git a/icing/index/main/main-index-merger.cc b/icing/index/main/main-index-merger.cc
index f49dc74..c26a6d7 100644
--- a/icing/index/main/main-index-merger.cc
+++ b/icing/index/main/main-index-merger.cc
@@ -20,10 +20,11 @@
#include <unordered_map>
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/posting_list/index-block.h"
#include "icing/index/lite/term-id-hit-pair.h"
-#include "icing/index/main/index-block.h"
#include "icing/index/term-id-codec.h"
#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
namespace icing {
diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc
index 5de92d0..1c61bfa 100644
--- a/icing/index/main/main-index.cc
+++ b/icing/index/main/main-index.cc
@@ -22,13 +22,16 @@
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/destructible-directory.h"
-#include "icing/index/main/index-block.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/index/main/posting-list-used-hit-serializer.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-property-id.h"
+#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
+#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -94,7 +97,9 @@ MainIndex::MainIndex(const std::string& index_directory,
const IcingFilesystem* icing_filesystem)
: base_dir_(index_directory),
filesystem_(filesystem),
- icing_filesystem_(icing_filesystem) {}
+ icing_filesystem_(icing_filesystem),
+ posting_list_used_hit_serializer_(
+ std::make_unique<PostingListUsedHitSerializer>()) {}
libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> MainIndex::Create(
const std::string& index_directory, const Filesystem* filesystem,
@@ -115,7 +120,8 @@ libtextclassifier3::Status MainIndex::Init() {
std::string flash_index_file = base_dir_ + "/main_index";
ICING_ASSIGN_OR_RETURN(
FlashIndexStorage flash_index,
- FlashIndexStorage::Create(flash_index_file, filesystem_));
+ FlashIndexStorage::Create(flash_index_file, filesystem_,
+ posting_list_used_hit_serializer_.get()));
flash_index_storage_ =
std::make_unique<FlashIndexStorage>(std::move(flash_index));
@@ -161,9 +167,11 @@ MainIndex::GetAccessorForExactTerm(const std::string& term) {
return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
"Term %s is not present in main lexicon.", term.c_str()));
}
- ICING_ASSIGN_OR_RETURN(PostingListAccessor accessor,
- PostingListAccessor::CreateFromExisting(
- flash_index_storage_.get(), posting_list_id));
+ ICING_ASSIGN_OR_RETURN(
+ PostingListAccessor accessor,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_used_hit_serializer_.get(),
+ posting_list_id));
return std::make_unique<PostingListAccessor>(std::move(accessor));
}
@@ -193,9 +201,11 @@ MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) {
}
PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id));
- ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor,
- PostingListAccessor::CreateFromExisting(
- flash_index_storage_.get(), posting_list_id));
+ ICING_ASSIGN_OR_RETURN(
+ PostingListAccessor pl_accessor,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_used_hit_serializer_.get(),
+ posting_list_id));
GetPrefixAccessorResult result = {
std::make_unique<PostingListAccessor>(std::move(pl_accessor)), exact};
return result;
@@ -234,9 +244,11 @@ MainIndex::FindTermsByPrefix(
PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
memcpy(&posting_list_id, term_iterator.GetValue(), sizeof(posting_list_id));
- ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor,
- PostingListAccessor::CreateFromExisting(
- flash_index_storage_.get(), posting_list_id));
+ ICING_ASSIGN_OR_RETURN(
+ PostingListAccessor pl_accessor,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_used_hit_serializer_.get(),
+ posting_list_id));
ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
pl_accessor.GetNextHitsBatch());
while (!hits.empty()) {
@@ -549,7 +561,8 @@ libtextclassifier3::Status MainIndex::AddHits(
sizeof(backfill_posting_list_id));
ICING_ASSIGN_OR_RETURN(
PostingListAccessor hit_accum,
- PostingListAccessor::Create(flash_index_storage_.get()));
+ PostingListAccessor::Create(flash_index_storage_.get(),
+ posting_list_used_hit_serializer_.get()));
ICING_RETURN_IF_ERROR(
AddPrefixBackfillHits(backfill_posting_list_id, &hit_accum));
PostingListAccessor::FinalizeResult result =
@@ -583,15 +596,18 @@ libtextclassifier3::Status MainIndex::AddHitsForTerm(
return absl_ports::InternalError(
"Valid posting list has an invalid block index!");
}
- ICING_ASSIGN_OR_RETURN(PostingListAccessor tmp,
- PostingListAccessor::CreateFromExisting(
- flash_index_storage_.get(), posting_list_id));
+ ICING_ASSIGN_OR_RETURN(
+ PostingListAccessor tmp,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_used_hit_serializer_.get(),
+ posting_list_id));
pl_accessor = std::make_unique<PostingListAccessor>(std::move(tmp));
} else {
// New posting list.
ICING_ASSIGN_OR_RETURN(
PostingListAccessor tmp,
- PostingListAccessor::Create(flash_index_storage_.get()));
+ PostingListAccessor::Create(flash_index_storage_.get(),
+ posting_list_used_hit_serializer_.get()));
pl_accessor = std::make_unique<PostingListAccessor>(std::move(tmp));
}
@@ -621,8 +637,9 @@ libtextclassifier3::Status MainIndex::AddPrefixBackfillHits(
PostingListAccessor* hit_accum) {
ICING_ASSIGN_OR_RETURN(
PostingListAccessor backfill_accessor,
- PostingListAccessor::CreateFromExisting(flash_index_storage_.get(),
- backfill_posting_list_id));
+ PostingListAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_used_hit_serializer_.get(),
+ backfill_posting_list_id));
std::vector<Hit> backfill_hits;
ICING_ASSIGN_OR_RETURN(std::vector<Hit> tmp,
backfill_accessor.GetNextHitsBatch());
@@ -760,7 +777,9 @@ libtextclassifier3::StatusOr<DocumentId> MainIndex::TransferAndAddHits(
ICING_ASSIGN_OR_RETURN(
PostingListAccessor hit_accum,
- PostingListAccessor::Create(new_index->flash_index_storage_.get()));
+ PostingListAccessor::Create(
+ new_index->flash_index_storage_.get(),
+ new_index->posting_list_used_hit_serializer_.get()));
for (auto itr = new_hits.rbegin(); itr != new_hits.rend(); ++itr) {
ICING_RETURN_IF_ERROR(hit_accum.PrependHit(*itr));
}
@@ -806,9 +825,11 @@ libtextclassifier3::Status MainIndex::TransferIndex(
<< "Got invalid posting_list_id from previous main index";
continue;
}
- ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor,
- PostingListAccessor::CreateFromExisting(
- flash_index_storage_.get(), posting_list_id));
+ ICING_ASSIGN_OR_RETURN(
+ PostingListAccessor pl_accessor,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_used_hit_serializer_.get(),
+ posting_list_id));
ICING_ASSIGN_OR_RETURN(
DocumentId curr_largest_document_id,
TransferAndAddHits(document_id_old_to_new, term_itr.GetKey(),
diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h
index aa3fc38..e257a77 100644
--- a/icing/index/main/main-index.h
+++ b/icing/index/main/main-index.h
@@ -20,9 +20,10 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/file/filesystem.h"
+#include "icing/file/posting_list/flash-index-storage.h"
#include "icing/index/lite/term-id-hit-pair.h"
-#include "icing/index/main/flash-index-storage.h"
#include "icing/index/main/posting-list-accessor.h"
+#include "icing/index/main/posting-list-used-hit-serializer.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
@@ -203,8 +204,9 @@ class MainIndex {
const std::vector<DocumentId>& document_id_old_to_new);
private:
- MainIndex(const std::string& index_directory, const Filesystem* filesystem,
- const IcingFilesystem* icing_filesystem);
+ explicit MainIndex(const std::string& index_directory,
+ const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem);
libtextclassifier3::Status Init();
@@ -323,6 +325,8 @@ class MainIndex {
std::string base_dir_;
const Filesystem* filesystem_;
const IcingFilesystem* icing_filesystem_;
+ std::unique_ptr<PostingListUsedHitSerializer>
+ posting_list_used_hit_serializer_;
std::unique_ptr<FlashIndexStorage> flash_index_storage_;
std::unique_ptr<IcingDynamicTrie> main_lexicon_;
};
diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc
index bfda014..92601e7 100644
--- a/icing/index/main/main-index_test.cc
+++ b/icing/index/main/main-index_test.cc
@@ -22,7 +22,6 @@
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/main/doc-hit-info-iterator-term-main.h"
#include "icing/index/main/main-index-merger.h"
-#include "icing/index/main/main-index.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-property-id.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
diff --git a/icing/index/main/posting-list-accessor.cc b/icing/index/main/posting-list-accessor.cc
index 93b7b0b..06ab0a1 100644
--- a/icing/index/main/posting-list-accessor.cc
+++ b/icing/index/main/posting-list-accessor.cc
@@ -14,38 +14,43 @@
#include "icing/index/main/posting-list-accessor.h"
+#include <cstdint>
#include <memory>
+#include <vector>
#include "icing/absl_ports/canonical_errors.h"
-#include "icing/index/main/flash-index-storage.h"
-#include "icing/index/main/index-block.h"
-#include "icing/index/main/posting-list-identifier.h"
-#include "icing/index/main/posting-list-used.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/index-block.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/index/main/posting-list-used-hit-serializer.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
libtextclassifier3::StatusOr<PostingListAccessor> PostingListAccessor::Create(
- FlashIndexStorage *storage) {
- uint32_t max_posting_list_bytes =
- IndexBlock::CalculateMaxPostingListBytes(storage->block_size());
+ FlashIndexStorage *storage, PostingListUsedHitSerializer *serializer) {
+ uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
+ storage->block_size(), serializer->GetDataTypeBytes());
std::unique_ptr<uint8_t[]> posting_list_buffer_array =
std::make_unique<uint8_t[]>(max_posting_list_bytes);
ICING_ASSIGN_OR_RETURN(
PostingListUsed posting_list_buffer,
PostingListUsed::CreateFromUnitializedRegion(
- posting_list_buffer_array.get(), max_posting_list_bytes));
- return PostingListAccessor(storage, std::move(posting_list_buffer_array),
+ serializer, posting_list_buffer_array.get(), max_posting_list_bytes));
+ return PostingListAccessor(storage, serializer,
+ std::move(posting_list_buffer_array),
std::move(posting_list_buffer));
}
libtextclassifier3::StatusOr<PostingListAccessor>
PostingListAccessor::CreateFromExisting(
- FlashIndexStorage *storage,
+ FlashIndexStorage *storage, PostingListUsedHitSerializer *serializer,
PostingListIdentifier existing_posting_list_id) {
// Our posting_list_buffer_ will start as empty.
- ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor, Create(storage));
+ ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor,
+ Create(storage, serializer));
ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
storage->GetPostingList(existing_posting_list_id));
pl_accessor.preexisting_posting_list_ =
@@ -64,8 +69,9 @@ PostingListAccessor::GetNextHitsBatch() {
"Cannot retrieve hits from a PostingListAccessor that was not created "
"from a preexisting posting list.");
}
- ICING_ASSIGN_OR_RETURN(std::vector<Hit> batch,
- preexisting_posting_list_->posting_list.GetHits());
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<Hit> batch,
+ serializer_->GetHits(&preexisting_posting_list_->posting_list));
uint32_t next_block_index;
// Posting lists will only be chained when they are max-sized, in which case
// block.next_block_index() will point to the next block for the next posting
@@ -95,7 +101,7 @@ libtextclassifier3::Status PostingListAccessor::PrependHit(const Hit &hit) {
PostingListUsed &active_pl = (preexisting_posting_list_ != nullptr)
? preexisting_posting_list_->posting_list
: posting_list_buffer_;
- libtextclassifier3::Status status = active_pl.PrependHit(hit);
+ libtextclassifier3::Status status = serializer_->PrependHit(&active_pl, hit);
if (!absl_ports::IsResourceExhausted(status)) {
return status;
}
@@ -112,7 +118,7 @@ libtextclassifier3::Status PostingListAccessor::PrependHit(const Hit &hit) {
// It's fine to explicitly reference posting_list_buffer_ here because there's
// no way of reaching this line while preexisting_posting_list_ is still in
// use.
- return posting_list_buffer_.PrependHit(hit);
+ return serializer_->PrependHit(&posting_list_buffer_, hit);
}
void PostingListAccessor::FlushPreexistingPostingList() {
@@ -127,7 +133,8 @@ void PostingListAccessor::FlushPreexistingPostingList() {
// and free this posting list.
//
// Move will always succeed since posting_list_buffer_ is max_pl_bytes.
- posting_list_buffer_.MoveFrom(&preexisting_posting_list_->posting_list);
+ serializer_->MoveFrom(/*dst=*/&posting_list_buffer_,
+ /*src=*/&preexisting_posting_list_->posting_list);
// Now that all the contents of this posting list have been copied, there's
// no more use for it. Make it available to be used for another posting
@@ -140,13 +147,14 @@ void PostingListAccessor::FlushPreexistingPostingList() {
libtextclassifier3::Status PostingListAccessor::FlushInMemoryPostingList() {
// We exceeded max_pl_bytes(). Need to flush posting_list_buffer_ and update
// the chain.
- uint32_t max_posting_list_bytes =
- IndexBlock::CalculateMaxPostingListBytes(storage_->block_size());
+ uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
+ storage_->block_size(), serializer_->GetDataTypeBytes());
ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
storage_->AllocatePostingList(max_posting_list_bytes));
holder.block.set_next_block_index(prev_block_identifier_.block_index());
prev_block_identifier_ = holder.id;
- return holder.posting_list.MoveFrom(&posting_list_buffer_);
+ return serializer_->MoveFrom(/*dst=*/&holder.posting_list,
+ /*src=*/&posting_list_buffer_);
}
PostingListAccessor::FinalizeResult PostingListAccessor::Finalize(
@@ -158,7 +166,7 @@ PostingListAccessor::FinalizeResult PostingListAccessor::Finalize(
accessor.preexisting_posting_list_->id};
return result;
}
- if (accessor.posting_list_buffer_.BytesUsed() <= 0) {
+ if (accessor.serializer_->GetBytesUsed(&accessor.posting_list_buffer_) <= 0) {
FinalizeResult result = {absl_ports::InvalidArgumentError(
"Can't finalize an empty PostingListAccessor. "
"There's nothing to Finalize!"),
@@ -166,10 +174,12 @@ PostingListAccessor::FinalizeResult PostingListAccessor::Finalize(
return result;
}
uint32_t posting_list_bytes =
- accessor.posting_list_buffer_.MinPostingListSizeToFit();
+ accessor.serializer_->GetMinPostingListSizeToFit(
+ &accessor.posting_list_buffer_);
if (accessor.prev_block_identifier_.is_valid()) {
posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
- accessor.storage_->block_size());
+ accessor.storage_->block_size(),
+ accessor.serializer_->GetDataTypeBytes());
}
auto holder_or = accessor.storage_->AllocatePostingList(posting_list_bytes);
if (!holder_or.ok()) {
@@ -189,7 +199,9 @@ PostingListAccessor::FinalizeResult PostingListAccessor::Finalize(
// is valid because we created it in-memory. And finally, we know that the
// hits from posting_list_buffer_ will fit in editor.posting_list() because we
// requested it be at at least posting_list_bytes large.
- auto status = holder.posting_list.MoveFrom(&accessor.posting_list_buffer_);
+ auto status =
+ accessor.serializer_->MoveFrom(/*dst=*/&holder.posting_list,
+ /*src=*/&accessor.posting_list_buffer_);
if (!status.ok()) {
FinalizeResult result = {std::move(status),
accessor.prev_block_identifier_};
diff --git a/icing/index/main/posting-list-accessor.h b/icing/index/main/posting-list-accessor.h
index e1bb3c0..3f93c3a 100644
--- a/icing/index/main/posting-list-accessor.h
+++ b/icing/index/main/posting-list-accessor.h
@@ -15,14 +15,17 @@
#ifndef ICING_INDEX_POSTING_LIST_ACCESSOR_H_
#define ICING_INDEX_POSTING_LIST_ACCESSOR_H_
+#include <cstdint>
#include <memory>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/file/posting_list/posting-list-used.h"
#include "icing/index/hit/hit.h"
-#include "icing/index/main/flash-index-storage.h"
-#include "icing/index/main/posting-list-identifier.h"
-#include "icing/index/main/posting-list-used.h"
+#include "icing/index/main/posting-list-used-hit-serializer.h"
namespace icing {
namespace lib {
@@ -48,7 +51,7 @@ class PostingListAccessor {
// - On success, a valid instance of PostingListAccessor
// - INVALID_ARGUMENT error if storage has an invalid block_size.
static libtextclassifier3::StatusOr<PostingListAccessor> Create(
- FlashIndexStorage* storage);
+ FlashIndexStorage* storage, PostingListUsedHitSerializer* serializer);
// Create a PostingListAccessor with an existing posting list identified by
// existing_posting_list_id.
@@ -61,7 +64,7 @@ class PostingListAccessor {
// - On success, a valid instance of PostingListAccessor
// - INVALID_ARGUMENT if storage has an invalid block_size.
static libtextclassifier3::StatusOr<PostingListAccessor> CreateFromExisting(
- FlashIndexStorage* storage,
+ FlashIndexStorage* storage, PostingListUsedHitSerializer* serializer,
PostingListIdentifier existing_posting_list_id);
// Retrieve the next batch of hits for the posting list chain
@@ -109,10 +112,11 @@ class PostingListAccessor {
private:
explicit PostingListAccessor(
- FlashIndexStorage* storage,
+ FlashIndexStorage* storage, PostingListUsedHitSerializer* serializer,
std::unique_ptr<uint8_t[]> posting_list_buffer_array,
PostingListUsed posting_list_buffer)
: storage_(storage),
+ serializer_(serializer),
prev_block_identifier_(PostingListIdentifier::kInvalid),
posting_list_buffer_array_(std::move(posting_list_buffer_array)),
posting_list_buffer_(std::move(posting_list_buffer)),
@@ -137,6 +141,8 @@ class PostingListAccessor {
FlashIndexStorage* storage_; // Does not own.
+ PostingListUsedHitSerializer* serializer_; // Does not own.
+
// The PostingListIdentifier of the first max-sized posting list in the
// posting list chain or PostingListIdentifier::kInvalid if there is no
// posting list chain.
diff --git a/icing/index/main/posting-list-accessor_test.cc b/icing/index/main/posting-list-accessor_test.cc
index a539fe4..3145420 100644
--- a/icing/index/main/posting-list-accessor_test.cc
+++ b/icing/index/main/posting-list-accessor_test.cc
@@ -19,11 +19,12 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/file/filesystem.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/index-block.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/file/posting_list/posting-list-used.h"
#include "icing/index/hit/hit.h"
-#include "icing/index/main/flash-index-storage.h"
-#include "icing/index/main/index-block.h"
-#include "icing/index/main/posting-list-identifier.h"
-#include "icing/index/main/posting-list-used.h"
+#include "icing/index/main/posting-list-used-hit-serializer.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/hit-test-utils.h"
#include "icing/testing/tmp-directory.h"
@@ -39,20 +40,45 @@ using ::testing::Eq;
using ::testing::Lt;
using ::testing::SizeIs;
-TEST(PostingListAccessorStorageTest, HitsAddAndRetrieveProperly) {
- std::string test_dir = GetTestTempDir() + "/test_dir";
- std::string file_name = test_dir + "/test_file.idx.index";
- Filesystem filesystem;
- ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
- ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+class PostingListAccessorTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/test_dir";
+ file_name_ = test_dir_ + "/test_file.idx.index";
- ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name, &filesystem));
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()));
+
+ serializer_ = std::make_unique<PostingListUsedHitSerializer>();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+ flash_index_storage_ =
+ std::make_unique<FlashIndexStorage>(std::move(flash_index_storage));
+ }
+
+ void TearDown() override {
+ flash_index_storage_.reset();
+ serializer_.reset();
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ }
+
+ Filesystem filesystem_;
+ std::string test_dir_;
+ std::string file_name_;
+ std::unique_ptr<PostingListUsedHitSerializer> serializer_;
+ std::unique_ptr<FlashIndexStorage> flash_index_storage_;
+};
+
+TEST_F(PostingListAccessorTest, HitsAddAndRetrieveProperly) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
// Add some hits! Any hits!
std::vector<Hit> hits1 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
- ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
- PostingListAccessor::Create(&flash_index_storage));
for (const Hit& hit : hits1) {
ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
}
@@ -64,23 +90,17 @@ TEST(PostingListAccessorStorageTest, HitsAddAndRetrieveProperly) {
// Retrieve some hits.
ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
- flash_index_storage.GetPostingList(result.id));
- EXPECT_THAT(pl_holder.posting_list.GetHits(),
+ flash_index_storage_->GetPostingList(result.id));
+ EXPECT_THAT(serializer_->GetHits(&pl_holder.posting_list),
IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
EXPECT_THAT(pl_holder.block.next_block_index(), Eq(kInvalidBlockIndex));
}
-TEST(PostingListAccessorStorageTest, PreexistingPLKeepOnSameBlock) {
- std::string test_dir = GetTestTempDir() + "/test_dir";
- std::string file_name = test_dir + "/test_file.idx.index";
- Filesystem filesystem;
- ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
- ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
-
- ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name, &filesystem));
- ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
- PostingListAccessor::Create(&flash_index_storage));
+TEST_F(PostingListAccessorTest, PreexistingPLKeepOnSameBlock) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
// Add a single hit. This will fit in a min-sized posting list.
Hit hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency);
ICING_ASSERT_OK(pl_accessor.PrependHit(hit1));
@@ -95,8 +115,9 @@ TEST(PostingListAccessorStorageTest, PreexistingPLKeepOnSameBlock) {
// at least two hits, so this should NOT cause the previous pl to be
// reallocated.
ICING_ASSERT_OK_AND_ASSIGN(
- pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage,
- result1.id));
+ pl_accessor,
+ PostingListAccessor::CreateFromExisting(flash_index_storage_.get(),
+ serializer_.get(), result1.id));
Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/1);
ICING_ASSERT_OK(pl_accessor.PrependHit(hit2));
PostingListAccessor::FinalizeResult result2 =
@@ -108,22 +129,16 @@ TEST(PostingListAccessorStorageTest, PreexistingPLKeepOnSameBlock) {
// The posting list at result2.id should hold all of the hits that have been
// added.
ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
- flash_index_storage.GetPostingList(result2.id));
- EXPECT_THAT(pl_holder.posting_list.GetHits(),
+ flash_index_storage_->GetPostingList(result2.id));
+ EXPECT_THAT(serializer_->GetHits(&pl_holder.posting_list),
IsOkAndHolds(ElementsAre(hit2, hit1)));
}
-TEST(PostingListAccessorStorageTest, PreexistingPLReallocateToLargerPL) {
- std::string test_dir = GetTestTempDir() + "/test_dir";
- std::string file_name = test_dir + "/test_file.idx.index";
- Filesystem filesystem;
- ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
- ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
-
- ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name, &filesystem));
- ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
- PostingListAccessor::Create(&flash_index_storage));
+TEST_F(PostingListAccessorTest, PreexistingPLReallocateToLargerPL) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
// The smallest posting list size is 15 bytes. The first four hits will be
// compressed to one byte each and will be able to fit in the 5 byte padded
// region. The last hit will fit in one of the special hits. The posting list
@@ -142,8 +157,9 @@ TEST(PostingListAccessorStorageTest, PreexistingPLReallocateToLargerPL) {
// Now let's add some more hits!
ICING_ASSERT_OK_AND_ASSIGN(
- pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage,
- result1.id));
+ pl_accessor,
+ PostingListAccessor::CreateFromExisting(flash_index_storage_.get(),
+ serializer_.get(), result1.id));
// The current posting list can fit at most 2 more hits. Adding 12 more hits
// should result in these hits being moved to a larger posting list.
std::vector<Hit> hits2 = CreateHits(
@@ -167,22 +183,16 @@ TEST(PostingListAccessorStorageTest, PreexistingPLReallocateToLargerPL) {
hits1.push_back(hit);
}
ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
- flash_index_storage.GetPostingList(result2.id));
- EXPECT_THAT(pl_holder.posting_list.GetHits(),
+ flash_index_storage_->GetPostingList(result2.id));
+ EXPECT_THAT(serializer_->GetHits(&pl_holder.posting_list),
IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
}
-TEST(PostingListAccessorStorageTest, MultiBlockChainsBlocksProperly) {
- std::string test_dir = GetTestTempDir() + "/test_dir";
- std::string file_name = test_dir + "/test_file.idx.index";
- Filesystem filesystem;
- ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
- ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
-
- ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name, &filesystem));
- ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
- PostingListAccessor::Create(&flash_index_storage));
+TEST_F(PostingListAccessorTest, MultiBlockChainsBlocksProperly) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
// Add some hits! Any hits!
std::vector<Hit> hits1 =
CreateHits(/*num_hits=*/5000, /*desired_byte_length=*/1);
@@ -202,11 +212,11 @@ TEST(PostingListAccessorStorageTest, MultiBlockChainsBlocksProperly) {
// Now let's retrieve them!
ICING_ASSERT_OK_AND_ASSIGN(
PostingListHolder pl_holder,
- flash_index_storage.GetPostingList(second_block_id));
+ flash_index_storage_->GetPostingList(second_block_id));
// This pl_holder will only hold a posting list with the hits that didn't fit
// on the first block.
ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> second_block_hits,
- pl_holder.posting_list.GetHits());
+ serializer_->GetHits(&pl_holder.posting_list));
ASSERT_THAT(second_block_hits, SizeIs(Lt(hits1.size())));
auto first_block_hits_start = hits1.rbegin() + second_block_hits.size();
EXPECT_THAT(second_block_hits,
@@ -219,24 +229,17 @@ TEST(PostingListAccessorStorageTest, MultiBlockChainsBlocksProperly) {
PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
/*posting_list_index_bits=*/0);
ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
- flash_index_storage.GetPostingList(pl_id));
+ flash_index_storage_->GetPostingList(pl_id));
EXPECT_THAT(
- pl_holder.posting_list.GetHits(),
+ serializer_->GetHits(&pl_holder.posting_list),
IsOkAndHolds(ElementsAreArray(first_block_hits_start, hits1.rend())));
}
-TEST(PostingListAccessorStorageTest,
- PreexistingMultiBlockReusesBlocksProperly) {
- std::string test_dir = GetTestTempDir() + "/test_dir";
- std::string file_name = test_dir + "/test_file.idx.index";
- Filesystem filesystem;
- ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
- ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
-
- ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name, &filesystem));
- ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
- PostingListAccessor::Create(&flash_index_storage));
+TEST_F(PostingListAccessorTest, PreexistingMultiBlockReusesBlocksProperly) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
// Add some hits! Any hits!
std::vector<Hit> hits1 =
CreateHits(/*num_hits=*/5000, /*desired_byte_length=*/1);
@@ -254,8 +257,9 @@ TEST(PostingListAccessorStorageTest,
// Now add a couple more hits. These should fit on the existing, not full
// second block.
ICING_ASSERT_OK_AND_ASSIGN(
- pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage,
- first_add_id));
+ pl_accessor,
+ PostingListAccessor::CreateFromExisting(flash_index_storage_.get(),
+ serializer_.get(), first_add_id));
std::vector<Hit> hits2 = CreateHits(
/*start_docid=*/hits1.back().document_id() + 1, /*num_hits=*/50,
/*desired_byte_length=*/1);
@@ -273,12 +277,13 @@ TEST(PostingListAccessorStorageTest,
for (const Hit& hit : hits2) {
hits1.push_back(hit);
}
- ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
- flash_index_storage.GetPostingList(second_add_id));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(second_add_id));
// This pl_holder will only hold a posting list with the hits that didn't fit
// on the first block.
ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> second_block_hits,
- pl_holder.posting_list.GetHits());
+ serializer_->GetHits(&pl_holder.posting_list));
ASSERT_THAT(second_block_hits, SizeIs(Lt(hits1.size())));
auto first_block_hits_start = hits1.rbegin() + second_block_hits.size();
EXPECT_THAT(second_block_hits,
@@ -291,39 +296,27 @@ TEST(PostingListAccessorStorageTest,
PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
/*posting_list_index_bits=*/0);
ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
- flash_index_storage.GetPostingList(pl_id));
+ flash_index_storage_->GetPostingList(pl_id));
EXPECT_THAT(
- pl_holder.posting_list.GetHits(),
+ serializer_->GetHits(&pl_holder.posting_list),
IsOkAndHolds(ElementsAreArray(first_block_hits_start, hits1.rend())));
}
-TEST(PostingListAccessorStorageTest, InvalidHitReturnsInvalidArgument) {
- std::string test_dir = GetTestTempDir() + "/test_dir";
- std::string file_name = test_dir + "/test_file.idx.index";
- Filesystem filesystem;
- ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
- ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
-
- ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name, &filesystem));
- ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
- PostingListAccessor::Create(&flash_index_storage));
+TEST_F(PostingListAccessorTest, InvalidHitReturnsInvalidArgument) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
Hit invalid_hit;
EXPECT_THAT(pl_accessor.PrependHit(invalid_hit),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST(PostingListAccessorStorageTest, HitsNotDecreasingReturnsInvalidArgument) {
- std::string test_dir = GetTestTempDir() + "/test_dir";
- std::string file_name = test_dir + "/test_file.idx.index";
- Filesystem filesystem;
- ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
- ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
-
- ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name, &filesystem));
- ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
- PostingListAccessor::Create(&flash_index_storage));
+TEST_F(PostingListAccessorTest, HitsNotDecreasingReturnsInvalidArgument) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultTermFrequency);
ICING_ASSERT_OK(pl_accessor.PrependHit(hit1));
@@ -336,43 +329,32 @@ TEST(PostingListAccessorStorageTest, HitsNotDecreasingReturnsInvalidArgument) {
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST(PostingListAccessorStorageTest, NewPostingListNoHitsAdded) {
- std::string test_dir = GetTestTempDir() + "/test_dir";
- std::string file_name = test_dir + "/test_file.idx.index";
- Filesystem filesystem;
- ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
- ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
-
- ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name, &filesystem));
- ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
- PostingListAccessor::Create(&flash_index_storage));
+TEST_F(PostingListAccessorTest, NewPostingListNoHitsAdded) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
PostingListAccessor::FinalizeResult result1 =
PostingListAccessor::Finalize(std::move(pl_accessor));
EXPECT_THAT(result1.status,
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST(PostingListAccessorStorageTest, PreexistingPostingListNoHitsAdded) {
- std::string test_dir = GetTestTempDir() + "/test_dir";
- std::string file_name = test_dir + "/test_file.idx.index";
- Filesystem filesystem;
- ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
- ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
-
- ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
- FlashIndexStorage::Create(file_name, &filesystem));
- ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
- PostingListAccessor::Create(&flash_index_storage));
+TEST_F(PostingListAccessorTest, PreexistingPostingListNoHitsAdded) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultTermFrequency);
ICING_ASSERT_OK(pl_accessor.PrependHit(hit1));
PostingListAccessor::FinalizeResult result1 =
PostingListAccessor::Finalize(std::move(pl_accessor));
ICING_ASSERT_OK(result1.status);
- ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor2,
- PostingListAccessor::CreateFromExisting(
- &flash_index_storage, result1.id));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListAccessor pl_accessor2,
+ PostingListAccessor::CreateFromExisting(flash_index_storage_.get(),
+ serializer_.get(), result1.id));
PostingListAccessor::FinalizeResult result2 =
PostingListAccessor::Finalize(std::move(pl_accessor2));
ICING_ASSERT_OK(result2.status);
diff --git a/icing/index/main/posting-list-used.cc b/icing/index/main/posting-list-used-hit-serializer.cc
index 62e73e5..d45a428 100644
--- a/icing/index/main/posting-list-used.cc
+++ b/icing/index/main/posting-list-used-hit-serializer.cc
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Google LLC
+// Copyright (C) 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,17 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/main/posting-list-used.h"
+#include "icing/index/main/posting-list-used-hit-serializer.h"
-#include <algorithm>
-#include <cinttypes>
#include <cstdint>
+#include <cstring>
#include <limits>
+#include <vector>
#include "icing/absl_ports/canonical_errors.h"
-#include "icing/index/main/posting-list-utils.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-used.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/index/icing-bit-util.h"
+#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -30,97 +32,110 @@ namespace lib {
namespace {
-uint32_t GetTermFrequencyByteSize(const Hit &hit) {
+uint32_t GetTermFrequencyByteSize(const Hit& hit) {
return hit.has_term_frequency() ? sizeof(Hit::TermFrequency) : 0;
}
} // namespace
-libtextclassifier3::StatusOr<PostingListUsed>
-PostingListUsed::CreateFromPreexistingPostingListUsedRegion(
- void *posting_list_buffer, uint32_t size_in_bytes) {
- ICING_RETURN_ERROR_IF_NULL(posting_list_buffer);
- if (!posting_list_utils::IsValidPostingListSize(size_in_bytes)) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Requested posting list size %d is invalid!", size_in_bytes));
- }
- return PostingListUsed(posting_list_buffer, size_in_bytes);
+uint32_t PostingListUsedHitSerializer::GetBytesUsed(
+ const PostingListUsed* posting_list_used) const {
+ // The special hits will be included if they represent actual hits. If they
+ // represent the hit offset or the invalid hit sentinel, they are not
+ // included.
+ return posting_list_used->size_in_bytes() -
+ GetStartByteOffset(posting_list_used);
}
-libtextclassifier3::StatusOr<PostingListUsed>
-PostingListUsed::CreateFromUnitializedRegion(void *posting_list_buffer,
- uint32_t size_in_bytes) {
- ICING_ASSIGN_OR_RETURN(PostingListUsed posting_list_used,
- CreateFromPreexistingPostingListUsedRegion(
- posting_list_buffer, size_in_bytes));
- posting_list_used.Clear();
- return posting_list_used;
+uint32_t PostingListUsedHitSerializer::GetMinPostingListSizeToFit(
+ const PostingListUsed* posting_list_used) const {
+ if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) {
+ // If in either the FULL state or ALMOST_FULL state, this posting list *is*
+ // the minimum size posting list that can fit these hits. So just return the
+ // size of the posting list.
+ return posting_list_used->size_in_bytes();
+ }
+
+ // In NOT_FULL status BytesUsed contains no special hits. The minimum sized
+ // posting list that would be guaranteed to fit these hits would be
+ // ALMOST_FULL, with kInvalidHit in special_hit(0), the uncompressed Hit in
+ // special_hit(1) and the n compressed hits in the compressed region.
+ // BytesUsed contains one uncompressed Hit and n compressed hits. Therefore,
+ // fitting these hits into a posting list would require BytesUsed plus one
+ // extra hit.
+ return GetBytesUsed(posting_list_used) + sizeof(Hit);
}
-void PostingListUsed::Clear() {
- // Safe to ignore return value because size_in_bytes_ a valid argument.
- set_start_byte_offset(size_in_bytes_);
+void PostingListUsedHitSerializer::Clear(
+ PostingListUsed* posting_list_used) const {
+ // Safe to ignore return value because posting_list_used->size_in_bytes() is
+ // a valid argument.
+ SetStartByteOffset(posting_list_used,
+ /*offset=*/posting_list_used->size_in_bytes());
}
-libtextclassifier3::Status PostingListUsed::MoveFrom(PostingListUsed *other) {
- ICING_RETURN_ERROR_IF_NULL(other);
- if (other->MinPostingListSizeToFit() > size_in_bytes_) {
+libtextclassifier3::Status PostingListUsedHitSerializer::MoveFrom(
+ PostingListUsed* dst, PostingListUsed* src) const {
+ ICING_RETURN_ERROR_IF_NULL(dst);
+ ICING_RETURN_ERROR_IF_NULL(src);
+ if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "other->MinPostingListSizeToFit %d must be larger than size %d.",
- other->MinPostingListSizeToFit(), size_in_bytes_));
+ "src MinPostingListSizeToFit %d must be larger than size %d.",
+ GetMinPostingListSizeToFit(src), dst->size_in_bytes()));
}
- if (!IsPostingListValid()) {
+ if (!IsPostingListValid(dst)) {
return absl_ports::FailedPreconditionError(
- "This posting list is in an invalid state and can't be used!");
+ "Dst posting list is in an invalid state and can't be used!");
}
- if (!other->IsPostingListValid()) {
+ if (!IsPostingListValid(src)) {
return absl_ports::InvalidArgumentError(
- "Cannot MoveFrom an invalid posting list!");
+ "Cannot MoveFrom an invalid src posting list!");
}
- // Pop just enough hits that all of other's compressed hits fit in
- // this posting_list's compressed area. Then we can memcpy that area.
+ // Pop just enough hits that all of src's compressed hits fit in
+ // dst posting_list's compressed area. Then we can memcpy that area.
std::vector<Hit> hits;
- while (other->full() || other->almost_full() ||
- (size_in_bytes_ - posting_list_utils::kSpecialHitsSize <
- other->BytesUsed())) {
- if (!other->GetHitsInternal(/*limit=*/1, /*pop=*/true, &hits).ok()) {
+ while (IsFull(src) || IsAlmostFull(src) ||
+ (dst->size_in_bytes() - kSpecialHitsSize < GetBytesUsed(src))) {
+ if (!GetHitsInternal(src, /*limit=*/1, /*pop=*/true, &hits).ok()) {
return absl_ports::AbortedError(
- "Unable to retrieve hits from other posting list.");
+ "Unable to retrieve hits from src posting list.");
}
}
// memcpy the area and set up start byte offset.
- Clear();
- memcpy(posting_list_buffer_ + size_in_bytes_ - other->BytesUsed(),
- other->posting_list_buffer_ + other->get_start_byte_offset(),
- other->BytesUsed());
- // Because we popped all hits from other outside of the compressed area and we
- // guaranteed that other->BytesUsed is less than size_in_bytes_ -
+ Clear(dst);
+ memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src),
+ src->posting_list_buffer() + GetStartByteOffset(src),
+ GetBytesUsed(src));
+ // Because we popped all hits from src outside of the compressed area and we
+ // guaranteed that GetBytesUsed(src) is less than dst->size_in_bytes() -
// kSpecialHitsSize. This is guaranteed to be a valid byte offset for the
// NOT_FULL state, so ignoring the value is safe.
- set_start_byte_offset(size_in_bytes_ - other->BytesUsed());
+ SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src));
// Put back remaining hits.
for (size_t i = 0; i < hits.size(); i++) {
- const Hit &hit = hits[hits.size() - i - 1];
+ const Hit& hit = hits[hits.size() - i - 1];
// PrependHit can return either INVALID_ARGUMENT - if hit is invalid or not
// less than the previous hit - or RESOURCE_EXHAUSTED. RESOURCE_EXHAUSTED
// should be impossible because we've already assured that there is enough
// room above.
- ICING_RETURN_IF_ERROR(PrependHit(hit));
+ ICING_RETURN_IF_ERROR(PrependHit(dst, hit));
}
- other->Clear();
+ Clear(src);
return libtextclassifier3::Status::OK;
}
-uint32_t PostingListUsed::GetPadEnd(uint32_t offset) const {
+uint32_t PostingListUsedHitSerializer::GetPadEnd(
+ const PostingListUsed* posting_list_used, uint32_t offset) const {
Hit::Value pad;
uint32_t pad_end = offset;
- while (pad_end < size_in_bytes_) {
- size_t pad_len = VarInt::Decode(posting_list_buffer_ + pad_end, &pad);
+ while (pad_end < posting_list_used->size_in_bytes()) {
+ size_t pad_len = VarInt::Decode(
+ posting_list_used->posting_list_buffer() + pad_end, &pad);
if (pad != 0) {
// No longer a pad.
break;
@@ -130,22 +145,24 @@ uint32_t PostingListUsed::GetPadEnd(uint32_t offset) const {
return pad_end;
}
-bool PostingListUsed::PadToEnd(uint32_t start, uint32_t end) {
- if (end > size_in_bytes_) {
+bool PostingListUsedHitSerializer::PadToEnd(PostingListUsed* posting_list_used,
+ uint32_t start,
+ uint32_t end) const {
+ if (end > posting_list_used->size_in_bytes()) {
ICING_LOG(ERROR) << "Cannot pad a region that ends after size!";
return false;
}
// In VarInt a value of 0 encodes to 0.
- memset(posting_list_buffer_ + start, 0, end - start);
+ memset(posting_list_used->posting_list_buffer() + start, 0, end - start);
return true;
}
-libtextclassifier3::Status PostingListUsed::PrependHitToAlmostFull(
- const Hit &hit) {
+libtextclassifier3::Status PostingListUsedHitSerializer::PrependHitToAlmostFull(
+ PostingListUsed* posting_list_used, const Hit& hit) const {
// Get delta between first hit and the new hit. Try to fit delta
// in the padded area and put new hit at the special position 1.
- // Calling ValueOrDie is safe here because 1 < kNumSpecialHits.
- Hit cur = get_special_hit(1).ValueOrDie();
+ // Calling ValueOrDie is safe here because 1 < kNumSpecialData.
+ Hit cur = GetSpecialHit(posting_list_used, /*index=*/1).ValueOrDie();
if (cur.value() <= hit.value()) {
return absl_ports::InvalidArgumentError(
"Hit being prepended must be strictly less than the most recent Hit");
@@ -155,58 +172,62 @@ libtextclassifier3::Status PostingListUsed::PrependHitToAlmostFull(
size_t delta_len = VarInt::Encode(delta, delta_buf);
uint32_t cur_term_frequency_bytes = GetTermFrequencyByteSize(cur);
- uint32_t pad_end = GetPadEnd(posting_list_utils::kSpecialHitsSize);
+ uint32_t pad_end = GetPadEnd(posting_list_used,
+ /*offset=*/kSpecialHitsSize);
- if (pad_end >= posting_list_utils::kSpecialHitsSize + delta_len +
- cur_term_frequency_bytes) {
+ if (pad_end >= kSpecialHitsSize + delta_len + cur_term_frequency_bytes) {
// Pad area has enough space for delta and term_frequency of existing hit
// (cur). Write delta at pad_end - delta_len - cur_term_frequency_bytes.
- uint8_t *delta_offset =
- posting_list_buffer_ + pad_end - delta_len - cur_term_frequency_bytes;
+ uint8_t* delta_offset = posting_list_used->posting_list_buffer() + pad_end -
+ delta_len - cur_term_frequency_bytes;
memcpy(delta_offset, delta_buf, delta_len);
// Now copy term_frequency.
Hit::TermFrequency term_frequency = cur.term_frequency();
- uint8_t *term_frequency_offset = delta_offset + delta_len;
+ uint8_t* term_frequency_offset = delta_offset + delta_len;
memcpy(term_frequency_offset, &term_frequency, cur_term_frequency_bytes);
// Now first hit is the new hit, at special position 1. Safe to ignore the
- // return value because 1 < kNumSpecialHits.
- set_special_hit(1, hit);
+ // return value because 1 < kNumSpecialData.
+ SetSpecialHit(posting_list_used, /*index=*/1, hit);
// Safe to ignore the return value because sizeof(Hit) is a valid argument.
- set_start_byte_offset(sizeof(Hit));
+ SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
} else {
// No space for delta. We put the new hit at special position 0
// and go to the full state. Safe to ignore the return value because 1 <
- // kNumSpecialHits.
- set_special_hit(0, hit);
+ // kNumSpecialData.
+ SetSpecialHit(posting_list_used, /*index=*/0, hit);
}
return libtextclassifier3::Status::OK;
}
-void PostingListUsed::PrependHitToEmpty(const Hit &hit) {
+void PostingListUsedHitSerializer::PrependHitToEmpty(
+ PostingListUsed* posting_list_used, const Hit& hit) const {
// First hit to be added. Just add verbatim, no compression.
- if (size_in_bytes_ == posting_list_utils::kSpecialHitsSize) {
- // Safe to ignore the return value because 1 < kNumSpecialHits
- set_special_hit(1, hit);
+ if (posting_list_used->size_in_bytes() == kSpecialHitsSize) {
+ // Safe to ignore the return value because 1 < kNumSpecialData
+ SetSpecialHit(posting_list_used, /*index=*/1, hit);
// Safe to ignore the return value because sizeof(Hit) is a valid argument.
- set_start_byte_offset(sizeof(Hit));
+ SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
} else {
// Since this is the first hit, size != kSpecialHitsSize and
// size % sizeof(Hit) == 0, we know that there is room to fit 'hit' into
// the compressed region, so ValueOrDie is safe.
- uint32_t offset = PrependHitUncompressed(hit, size_in_bytes_).ValueOrDie();
+ uint32_t offset =
+ PrependHitUncompressed(posting_list_used, hit,
+ /*offset=*/posting_list_used->size_in_bytes())
+ .ValueOrDie();
// Safe to ignore the return value because PrependHitUncompressed is
// guaranteed to return a valid offset.
- set_start_byte_offset(offset);
+ SetStartByteOffset(posting_list_used, offset);
}
}
-libtextclassifier3::Status PostingListUsed::PrependHitToNotFull(
- const Hit &hit, uint32_t offset) {
+libtextclassifier3::Status PostingListUsedHitSerializer::PrependHitToNotFull(
+ PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const {
// First hit in compressed area. It is uncompressed. See if delta
// between the first hit and new hit will still fit in the
// compressed area.
- if (offset + sizeof(Hit::Value) > size_in_bytes_) {
+ if (offset + sizeof(Hit::Value) > posting_list_used->size_in_bytes()) {
// The first hit in the compressed region *should* be uncompressed, but
// somehow there isn't enough room between offset and the end of the
// compressed area to fit an uncompressed hit. This should NEVER happen.
@@ -214,7 +235,8 @@ libtextclassifier3::Status PostingListUsed::PrependHitToNotFull(
"Posting list is in an invalid state.");
}
Hit::Value cur_value;
- memcpy(&cur_value, posting_list_buffer_ + offset, sizeof(Hit::Value));
+ memcpy(&cur_value, posting_list_used->posting_list_buffer() + offset,
+ sizeof(Hit::Value));
if (cur_value <= hit.value()) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Hit %d being prepended must be strictly less than the most recent "
@@ -228,45 +250,49 @@ libtextclassifier3::Status PostingListUsed::PrependHitToNotFull(
// offset now points to one past the end of the first hit.
offset += sizeof(Hit::Value);
- if (posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value) + delta_len +
+ if (kSpecialHitsSize + sizeof(Hit::Value) + delta_len +
hit_term_frequency_bytes <=
offset) {
// Enough space for delta in compressed area.
// Prepend delta.
offset -= delta_len;
- memcpy(posting_list_buffer_ + offset, delta_buf, delta_len);
+ memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf,
+ delta_len);
// Prepend new hit with (possibly) its term_frequency. We know that there is
// room for 'hit' because of the if statement above, so calling ValueOrDie
// is safe.
- offset = PrependHitUncompressed(hit, offset).ValueOrDie();
+ offset =
+ PrependHitUncompressed(posting_list_used, hit, offset).ValueOrDie();
// offset is guaranteed to be valid here. So it's safe to ignore the return
// value. The if above will guarantee that offset >= kSpecialHitSize and <
- // size_in_bytes_ because the if ensures that there is enough room between
- // offset and kSpecialHitSize to fit the delta of the previous hit, any
- // term_frequency and the uncompressed hit.
- set_start_byte_offset(offset);
- } else if (posting_list_utils::kSpecialHitsSize + delta_len <= offset) {
+ // posting_list_used->size_in_bytes() because the if ensures that there is
+ // enough room between offset and kSpecialHitSize to fit the delta of the
+ // previous hit, any term_frequency and the uncompressed hit.
+ SetStartByteOffset(posting_list_used, offset);
+ } else if (kSpecialHitsSize + delta_len <= offset) {
// Only have space for delta. The new hit must be put in special
// position 1.
// Prepend delta.
offset -= delta_len;
- memcpy(posting_list_buffer_ + offset, delta_buf, delta_len);
+ memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf,
+ delta_len);
// Prepend pad. Safe to ignore the return value of PadToEnd because offset
- // must be less than size_in_bytes_. Otherwise, this function already would
- // have returned FAILED_PRECONDITION.
- PadToEnd(posting_list_utils::kSpecialHitsSize, offset);
+ // must be less than posting_list_used->size_in_bytes(). Otherwise, this
+ // function already would have returned FAILED_PRECONDITION.
+ PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize,
+ /*end=*/offset);
// Put new hit in special position 1. Safe to ignore return value because 1
- // < kNumSpecialHits.
- set_special_hit(1, hit);
+ // < kNumSpecialData.
+ SetSpecialHit(posting_list_used, /*index=*/1, hit);
// State almost_full. Safe to ignore the return value because sizeof(Hit) is
// a valid argument.
- set_start_byte_offset(sizeof(Hit));
+ SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
} else {
// Very rare case where delta is larger than sizeof(Hit::Value)
// (i.e. varint delta encoding expanded required storage). We
@@ -277,59 +303,65 @@ libtextclassifier3::Status PostingListUsed::PrependHitToNotFull(
// Therefore, offset must be less than kSpecialHitSize + 5. Since posting
// list size must be divisible by sizeof(Hit) (5), it is guaranteed that
// offset < size_in_bytes, so it is safe to ignore the return value here.
- ConsumeTermFrequencyIfPresent(&cur, &offset);
+ ConsumeTermFrequencyIfPresent(posting_list_used, &cur, &offset);
// Safe to ignore the return value of PadToEnd because offset must be less
- // than size_in_bytes_. Otherwise, this function already would have returned
- // FAILED_PRECONDITION.
- PadToEnd(posting_list_utils::kSpecialHitsSize, offset);
- // Safe to ignore the return value here because 0 and 1 < kNumSpecialHits.
- set_special_hit(1, cur);
- set_special_hit(0, hit);
+ // than posting_list_used->size_in_bytes(). Otherwise, this function
+ // already would have returned FAILED_PRECONDITION.
+ PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize,
+ /*end=*/offset);
+ // Safe to ignore the return value here because 0 and 1 < kNumSpecialData.
+ SetSpecialHit(posting_list_used, /*index=*/1, cur);
+ SetSpecialHit(posting_list_used, /*index=*/0, hit);
}
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status PostingListUsed::PrependHit(const Hit &hit) {
+libtextclassifier3::Status PostingListUsedHitSerializer::PrependHit(
+ PostingListUsed* posting_list_used, const Hit& hit) const {
static_assert(sizeof(Hit::Value) <= sizeof(uint64_t),
"Hit::Value cannot be larger than 8 bytes because the delta "
"must be able to fit in 8 bytes.");
if (!hit.is_valid()) {
return absl_ports::InvalidArgumentError("Cannot prepend an invalid hit!");
}
- if (!IsPostingListValid()) {
+ if (!IsPostingListValid(posting_list_used)) {
return absl_ports::FailedPreconditionError(
"This PostingListUsed is in an invalid state and can't add any hits!");
}
- if (full()) {
+ if (IsFull(posting_list_used)) {
// State full: no space left.
return absl_ports::ResourceExhaustedError("No more room for hits");
- } else if (almost_full()) {
- return PrependHitToAlmostFull(hit);
- } else if (empty()) {
- PrependHitToEmpty(hit);
+ } else if (IsAlmostFull(posting_list_used)) {
+ return PrependHitToAlmostFull(posting_list_used, hit);
+ } else if (IsEmpty(posting_list_used)) {
+ PrependHitToEmpty(posting_list_used, hit);
return libtextclassifier3::Status::OK;
} else {
- uint32_t offset = get_start_byte_offset();
- return PrependHitToNotFull(hit, offset);
+ uint32_t offset = GetStartByteOffset(posting_list_used);
+ return PrependHitToNotFull(posting_list_used, hit, offset);
}
}
-libtextclassifier3::StatusOr<std::vector<Hit>> PostingListUsed::GetHits()
- const {
+libtextclassifier3::StatusOr<std::vector<Hit>>
+PostingListUsedHitSerializer::GetHits(
+ const PostingListUsed* posting_list_used) const {
std::vector<Hit> hits_out;
- ICING_RETURN_IF_ERROR(GetHits(&hits_out));
+ ICING_RETURN_IF_ERROR(GetHits(posting_list_used, &hits_out));
return hits_out;
}
-libtextclassifier3::Status PostingListUsed::GetHits(
- std::vector<Hit> *hits_out) const {
- return GetHitsInternal(/*limit=*/std::numeric_limits<uint32_t>::max(),
+libtextclassifier3::Status PostingListUsedHitSerializer::GetHits(
+ const PostingListUsed* posting_list_used,
+ std::vector<Hit>* hits_out) const {
+ return GetHitsInternal(posting_list_used,
+ /*limit=*/std::numeric_limits<uint32_t>::max(),
/*pop=*/false, hits_out);
}
-libtextclassifier3::Status PostingListUsed::PopFrontHits(uint32_t num_hits) {
- if (num_hits == 1 && full()) {
+libtextclassifier3::Status PostingListUsedHitSerializer::PopFrontHits(
+ PostingListUsed* posting_list_used, uint32_t num_hits) const {
+ if (num_hits == 1 && IsFull(posting_list_used)) {
// The PL is in full status which means that we save 2 uncompressed hits in
// the 2 special postions. But full status may be reached by 2 different
// statuses.
@@ -383,31 +415,35 @@ libtextclassifier3::Status PostingListUsed::PopFrontHits(uint32_t num_hits) {
// Popping 2 hits should never fail because we've just ensured that the
// posting list is in the FULL state.
- ICING_RETURN_IF_ERROR(GetHitsInternal(/*limit=*/2, /*pop=*/true, &out));
+ ICING_RETURN_IF_ERROR(
+ GetHitsInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out));
// PrependHit should never fail because out[1] is a valid hit less than
// previous hits in the posting list and because there's no way that the
// posting list could run out of room because it previously stored this hit
// AND another hit.
- PrependHit(out[1]);
+ PrependHit(posting_list_used, out[1]);
} else if (num_hits > 0) {
- return GetHitsInternal(/*limit=*/num_hits, /*pop=*/true, nullptr);
+ return GetHitsInternal(posting_list_used, /*limit=*/num_hits, /*pop=*/true,
+ nullptr);
}
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status PostingListUsed::GetHitsInternal(
- uint32_t limit, bool pop, std::vector<Hit> *out) const {
+libtextclassifier3::Status PostingListUsedHitSerializer::GetHitsInternal(
+ const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
+ std::vector<Hit>* out) const {
// Put current uncompressed val here.
Hit::Value val = Hit::kInvalidValue;
- uint32_t offset = get_start_byte_offset();
+ uint32_t offset = GetStartByteOffset(posting_list_used);
uint32_t count = 0;
// First traverse the first two special positions.
- while (count < limit && offset < posting_list_utils::kSpecialHitsSize) {
+ while (count < limit && offset < kSpecialHitsSize) {
// Calling ValueOrDie is safe here because offset / sizeof(Hit) <
- // kNumSpecialHits because of the check above.
- Hit hit = get_special_hit(offset / sizeof(Hit)).ValueOrDie();
+ // kNumSpecialData because of the check above.
+ Hit hit = GetSpecialHit(posting_list_used, /*index=*/offset / sizeof(Hit))
+ .ValueOrDie();
val = hit.value();
if (out != nullptr) {
out->push_back(hit);
@@ -417,25 +453,26 @@ libtextclassifier3::Status PostingListUsed::GetHitsInternal(
}
// If special position 1 was set then we need to skip padding.
- if (val != Hit::kInvalidValue &&
- offset == posting_list_utils::kSpecialHitsSize) {
- offset = GetPadEnd(offset);
+ if (val != Hit::kInvalidValue && offset == kSpecialHitsSize) {
+ offset = GetPadEnd(posting_list_used, offset);
}
- while (count < limit && offset < size_in_bytes_) {
+ while (count < limit && offset < posting_list_used->size_in_bytes()) {
if (val == Hit::kInvalidValue) {
// First hit is in compressed area. Put that in val.
- memcpy(&val, posting_list_buffer_ + offset, sizeof(Hit::Value));
+ memcpy(&val, posting_list_used->posting_list_buffer() + offset,
+ sizeof(Hit::Value));
offset += sizeof(Hit::Value);
} else {
// Now we have delta encoded subsequent hits. Decode and push.
uint64_t delta;
- offset += VarInt::Decode(posting_list_buffer_ + offset, &delta);
+ offset += VarInt::Decode(
+ posting_list_used->posting_list_buffer() + offset, &delta);
val += delta;
}
Hit hit(val);
libtextclassifier3::Status status =
- ConsumeTermFrequencyIfPresent(&hit, &offset);
+ ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset);
if (!status.ok()) {
// This posting list has been corrupted somehow. The first hit of the
// posting list claims to have a term frequency, but there's no more room
@@ -453,29 +490,32 @@ libtextclassifier3::Status PostingListUsed::GetHitsInternal(
}
if (pop) {
- PostingListUsed *mutable_this = const_cast<PostingListUsed *>(this);
+ PostingListUsed* mutable_posting_list_used =
+ const_cast<PostingListUsed*>(posting_list_used);
// Modify the posting list so that we pop all hits actually
// traversed.
- if (offset >= posting_list_utils::kSpecialHitsSize &&
- offset < size_in_bytes_) {
+ if (offset >= kSpecialHitsSize &&
+ offset < posting_list_used->size_in_bytes()) {
// In the compressed area. Pop and reconstruct. offset/val is
// the last traversed hit, which we must discard. So move one
// more forward.
uint64_t delta;
- offset += VarInt::Decode(posting_list_buffer_ + offset, &delta);
+ offset += VarInt::Decode(
+ posting_list_used->posting_list_buffer() + offset, &delta);
val += delta;
// Now val is the first hit of the new posting list.
- if (posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value) <= offset) {
+ if (kSpecialHitsSize + sizeof(Hit::Value) <= offset) {
// val fits in compressed area. Simply copy.
offset -= sizeof(Hit::Value);
- memcpy(posting_list_buffer_ + offset, &val, sizeof(Hit::Value));
+ memcpy(mutable_posting_list_used->posting_list_buffer() + offset, &val,
+ sizeof(Hit::Value));
} else {
// val won't fit in compressed area. Also see if there is a
// term_frequency.
Hit hit(val);
libtextclassifier3::Status status =
- ConsumeTermFrequencyIfPresent(&hit, &offset);
+ ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset);
if (!status.ok()) {
// This posting list has been corrupted somehow. The first hit of
// the posting list claims to have a term frequency, but there's no
@@ -487,20 +527,24 @@ libtextclassifier3::Status PostingListUsed::GetHitsInternal(
}
return absl_ports::InternalError("Posting list has been corrupted!");
}
- // Okay to ignore the return value here because 1 < kNumSpecialHits.
- mutable_this->set_special_hit(1, hit);
+ // Okay to ignore the return value here because 1 < kNumSpecialData.
+ SetSpecialHit(mutable_posting_list_used, /*index=*/1, hit);
// Prepend pad. Safe to ignore the return value of PadToEnd because
- // offset must be less than size_in_bytes_ thanks to the if above.
- mutable_this->PadToEnd(posting_list_utils::kSpecialHitsSize, offset);
+ // offset must be less than posting_list_used->size_in_bytes() thanks to
+ // the if above.
+ PadToEnd(mutable_posting_list_used,
+ /*start=*/kSpecialHitsSize,
+ /*end=*/offset);
offset = sizeof(Hit);
}
}
// offset is guaranteed to be valid so ignoring the return value of
// set_start_byte_offset is safe. It falls into one of four scenarios:
- // Scenario 1: the above if was false because offset is not < size_in_bytes_
- // In this case, offset must be == size_in_bytes_ because we reached
- // offset by unwinding hits on the posting list.
+ // Scenario 1: the above if was false because offset is not <
+ // posting_list_used->size_in_bytes()
+ // In this case, offset must be == posting_list_used->size_in_bytes()
+ // because we reached offset by unwinding hits on the posting list.
// Scenario 2: offset is < kSpecialHitSize
// In this case, offset is guaranteed to be either 0 or sizeof(Hit)
// because offset is incremented by sizeof(Hit) within the first while
@@ -514,104 +558,91 @@ libtextclassifier3::Status PostingListUsed::GetHitsInternal(
// in the posting list is too large to fit as an uncompressed hit in the
// in the compressed region. Therefore, it must be stored in a special hit
// and offset will be sizeof(Hit).
- mutable_this->set_start_byte_offset(offset);
+ SetStartByteOffset(mutable_posting_list_used, offset);
}
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::StatusOr<Hit> PostingListUsed::get_special_hit(
- uint32_t index) const {
+libtextclassifier3::StatusOr<Hit> PostingListUsedHitSerializer::GetSpecialHit(
+ const PostingListUsed* posting_list_used, uint32_t index) const {
static_assert(sizeof(Hit::Value) >= sizeof(uint32_t), "HitTooSmall");
- if (index >= posting_list_utils::kNumSpecialHits || index < 0) {
+ if (index >= kNumSpecialData || index < 0) {
return absl_ports::InvalidArgumentError(
"Special hits only exist at indices 0 and 1");
}
Hit val;
- memcpy(&val, posting_list_buffer_ + index * sizeof(val), sizeof(val));
+ memcpy(&val, posting_list_used->posting_list_buffer() + index * sizeof(val),
+ sizeof(val));
return val;
}
-bool PostingListUsed::set_special_hit(uint32_t index, const Hit &val) {
- if (index >= posting_list_utils::kNumSpecialHits || index < 0) {
+bool PostingListUsedHitSerializer::SetSpecialHit(
+ PostingListUsed* posting_list_used, uint32_t index, const Hit& val) const {
+ if (index >= kNumSpecialData || index < 0) {
ICING_LOG(ERROR) << "Special hits only exist at indices 0 and 1";
return false;
}
- memcpy(posting_list_buffer_ + index * sizeof(val), &val, sizeof(val));
+ memcpy(posting_list_used->posting_list_buffer() + index * sizeof(val), &val,
+ sizeof(val));
return true;
}
-uint32_t PostingListUsed::BytesUsed() const {
- // The special hits will be included if they represent actual hits. If they
- // represent the hit offset or the invalid hit sentinel, they are not
- // included.
- return size_in_bytes_ - get_start_byte_offset();
-}
-
-uint32_t PostingListUsed::MinPostingListSizeToFit() const {
- if (full() || almost_full()) {
- // If in either the FULL state or ALMOST_FULL state, this posting list *is*
- // the minimum size posting list that can fit these hits. So just return the
- // size of the posting list.
- return size_in_bytes_;
- }
-
- // In NOT_FULL status BytesUsed contains no special hits. The minimum sized
- // posting list that would be guaranteed to fit these hits would be
- // ALMOST_FULL, with kInvalidHit in special_hit(0), the uncompressed Hit in
- // special_hit(1) and the n compressed hits in the compressed region.
- // BytesUsed contains one uncompressed Hit and n compressed hits. Therefore,
- // fitting these hits into a posting list would require BytesUsed plus one
- // extra hit.
- return BytesUsed() + sizeof(Hit);
-}
-
-bool PostingListUsed::IsPostingListValid() const {
- if (almost_full()) {
+bool PostingListUsedHitSerializer::IsPostingListValid(
+ const PostingListUsed* posting_list_used) const {
+ if (IsAlmostFull(posting_list_used)) {
// Special Hit 1 should hold a Hit. Calling ValueOrDie is safe because we
- // know that 1 < kNumSpecialHits.
- if (!get_special_hit(1).ValueOrDie().is_valid()) {
+ // know that 1 < kNumSpecialData.
+ if (!GetSpecialHit(posting_list_used, /*index=*/1)
+ .ValueOrDie()
+ .is_valid()) {
ICING_LOG(ERROR)
<< "Both special hits cannot be invalid at the same time.";
return false;
}
- } else if (!full()) {
+ } else if (!IsFull(posting_list_used)) {
// NOT_FULL. Special Hit 0 should hold a valid offset. Calling ValueOrDie is
- // safe because we know that 0 < kNumSpecialHits.
- if (get_special_hit(0).ValueOrDie().value() > size_in_bytes_ ||
- get_special_hit(0).ValueOrDie().value() <
- posting_list_utils::kSpecialHitsSize) {
- ICING_LOG(ERROR) << "Hit: " << get_special_hit(0).ValueOrDie().value()
- << " size: " << size_in_bytes_
- << " sp size: " << posting_list_utils::kSpecialHitsSize;
+ // safe because we know that 0 < kNumSpecialData.
+ if (GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() >
+ posting_list_used->size_in_bytes() ||
+ GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() <
+ kSpecialHitsSize) {
+ ICING_LOG(ERROR)
+ << "Hit: "
+ << GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value()
+ << " size: " << posting_list_used->size_in_bytes()
+ << " sp size: " << kSpecialHitsSize;
return false;
}
}
return true;
}
-uint32_t PostingListUsed::get_start_byte_offset() const {
- if (full()) {
+uint32_t PostingListUsedHitSerializer::GetStartByteOffset(
+ const PostingListUsed* posting_list_used) const {
+ if (IsFull(posting_list_used)) {
return 0;
- } else if (almost_full()) {
+ } else if (IsAlmostFull(posting_list_used)) {
return sizeof(Hit);
} else {
// NOT_FULL, calling ValueOrDie is safe because we know that 0 <
- // kNumSpecialHits.
- return get_special_hit(0).ValueOrDie().value();
+ // kNumSpecialData.
+ return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value();
}
}
-bool PostingListUsed::set_start_byte_offset(uint32_t offset) {
- if (offset > size_in_bytes_) {
+bool PostingListUsedHitSerializer::SetStartByteOffset(
+ PostingListUsed* posting_list_used, uint32_t offset) const {
+ if (offset > posting_list_used->size_in_bytes()) {
ICING_LOG(ERROR) << "offset cannot be a value greater than size "
- << size_in_bytes_ << ". offset is " << offset << ".";
+ << posting_list_used->size_in_bytes() << ". offset is "
+ << offset << ".";
return false;
}
- if (offset < posting_list_utils::kSpecialHitsSize && offset > sizeof(Hit)) {
+ if (offset < kSpecialHitsSize && offset > sizeof(Hit)) {
ICING_LOG(ERROR) << "offset cannot be a value between (" << sizeof(Hit)
- << ", " << posting_list_utils::kSpecialHitsSize
- << "). offset is " << offset << ".";
+ << ", " << kSpecialHitsSize << "). offset is " << offset
+ << ".";
return false;
}
if (offset < sizeof(Hit) && offset != 0) {
@@ -619,55 +650,61 @@ bool PostingListUsed::set_start_byte_offset(uint32_t offset) {
<< "). offset is " << offset << ".";
return false;
}
- if (offset >= posting_list_utils::kSpecialHitsSize) {
+ if (offset >= kSpecialHitsSize) {
// not_full state. Safe to ignore the return value because 0 and 1 are both
- // < kNumSpecialHits.
- set_special_hit(0, Hit(offset));
- set_special_hit(1, Hit());
+ // < kNumSpecialData.
+ SetSpecialHit(posting_list_used, /*index=*/0, Hit(offset));
+ SetSpecialHit(posting_list_used, /*index=*/1, Hit());
} else if (offset == sizeof(Hit)) {
// almost_full state. Safe to ignore the return value because 1 is both <
- // kNumSpecialHits.
- set_special_hit(0, Hit());
+ // kNumSpecialData.
+ SetSpecialHit(posting_list_used, /*index=*/0, Hit());
}
// Nothing to do for the FULL state - the offset isn't actually stored
// anywhere and both special hits hold valid hits.
return true;
}
-libtextclassifier3::StatusOr<uint32_t> PostingListUsed::PrependHitUncompressed(
- const Hit &hit, uint32_t offset) {
+libtextclassifier3::StatusOr<uint32_t>
+PostingListUsedHitSerializer::PrependHitUncompressed(
+ PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const {
if (hit.has_term_frequency()) {
- if (offset < posting_list_utils::kSpecialHitsSize + sizeof(Hit)) {
+ if (offset < kSpecialHitsSize + sizeof(Hit)) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Not enough room to prepend Hit at offset %d.", offset));
}
offset -= sizeof(Hit);
- memcpy(posting_list_buffer_ + offset, &hit, sizeof(Hit));
+ memcpy(posting_list_used->posting_list_buffer() + offset, &hit,
+ sizeof(Hit));
} else {
- if (offset < posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value)) {
+ if (offset < kSpecialHitsSize + sizeof(Hit::Value)) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Not enough room to prepend Hit::Value at offset %d.", offset));
}
offset -= sizeof(Hit::Value);
Hit::Value val = hit.value();
- memcpy(posting_list_buffer_ + offset, &val, sizeof(Hit::Value));
+ memcpy(posting_list_used->posting_list_buffer() + offset, &val,
+ sizeof(Hit::Value));
}
return offset;
}
-libtextclassifier3::Status PostingListUsed::ConsumeTermFrequencyIfPresent(
- Hit *hit, uint32_t *offset) const {
+libtextclassifier3::Status
+PostingListUsedHitSerializer::ConsumeTermFrequencyIfPresent(
+ const PostingListUsed* posting_list_used, Hit* hit,
+ uint32_t* offset) const {
if (!hit->has_term_frequency()) {
// No term frequency to consume. Everything is fine.
return libtextclassifier3::Status::OK;
}
- if (*offset + sizeof(Hit::TermFrequency) > size_in_bytes_) {
+ if (*offset + sizeof(Hit::TermFrequency) >
+ posting_list_used->size_in_bytes()) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"offset %d must not point past the end of the posting list of size %d.",
- *offset, size_in_bytes_));
+ *offset, posting_list_used->size_in_bytes()));
}
Hit::TermFrequency term_frequency;
- memcpy(&term_frequency, posting_list_buffer_ + *offset,
+ memcpy(&term_frequency, posting_list_used->posting_list_buffer() + *offset,
sizeof(Hit::TermFrequency));
*hit = Hit(hit->value(), term_frequency);
*offset += sizeof(Hit::TermFrequency);
diff --git a/icing/index/main/posting-list-used.h b/icing/index/main/posting-list-used-hit-serializer.h
index 8944034..70e3e6c 100644
--- a/icing/index/main/posting-list-used.h
+++ b/icing/index/main/posting-list-used-hit-serializer.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Google LLC
+// Copyright (C) 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,104 +12,93 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_INDEX_MAIN_POSTING_LIST_USED_H_
-#define ICING_INDEX_MAIN_POSTING_LIST_USED_H_
+#ifndef ICING_INDEX_MAIN_POSTING_LIST_USED_HIT_SERIALIZER_H_
+#define ICING_INDEX_MAIN_POSTING_LIST_USED_HIT_SERIALIZER_H_
-#include <sys/mman.h>
-
-#include <algorithm>
-#include <cstring>
+#include <cstdint>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-used.h"
#include "icing/index/hit/hit.h"
-#include "icing/index/main/posting-list-utils.h"
-#include "icing/util/logging.h"
namespace icing {
namespace lib {
-// A posting list with hits in it. Layout described in comments in
-// posting-list-used.cc.
-class PostingListUsed {
+// A serializer class to serialize hits to PostingListUsed. Layout described in
+// comments in posting-list-used-hit-serializer.cc.
+class PostingListUsedHitSerializer : public PostingListUsedSerializer {
public:
- // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes.
- // 'Preexisting' means that posting_list_buffer was previously modified by
- // another instance of PostingListUsed.
- //
- // Caller owns the hits buffer and must not free it while using a
- // PostingListUsed.
- //
- // RETURNS:
- // - A valid PostingListUsed if successful
- // - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size()
- // || size_in_bytes % sizeof(Hit) != 0.
- // - FAILED_PRECONDITION if posting_list_buffer is null
- static libtextclassifier3::StatusOr<PostingListUsed>
- CreateFromPreexistingPostingListUsedRegion(void *posting_list_buffer,
- uint32_t size_in_bytes);
-
- // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes
- // and initializes the content of the buffer so that the returned
- // PostingListUsed is empty.
- //
- // Caller owns the posting_list_buffer buffer and must not free it while using
- // a PostingListUsed.
- //
- // RETURNS:
- // - A valid PostingListUsed if successful
- // - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size()
- // || size_in_bytes % sizeof(Hit) != 0.
- // - FAILED_PRECONDITION if posting_list_buffer is null
- static libtextclassifier3::StatusOr<PostingListUsed>
- CreateFromUnitializedRegion(void *posting_list_buffer,
- uint32_t size_in_bytes);
-
- // Move contents from another posting list. Clears other.
+ static constexpr uint32_t kSpecialHitsSize = sizeof(Hit) * kNumSpecialData;
+
+ uint32_t GetDataTypeBytes() const override { return sizeof(Hit); }
+
+ uint32_t GetMinPostingListSize() const override {
+ static constexpr uint32_t kMinPostingListSize = kSpecialHitsSize;
+ static_assert(sizeof(PostingListIndex) <= kMinPostingListSize,
+ "PostingListIndex must be small enough to fit in a "
+ "minimum-sized Posting List.");
+
+ return kMinPostingListSize;
+ }
+
+ // Min size of posting list that can fit these used bytes (see MoveFrom).
+ uint32_t GetMinPostingListSizeToFit(
+ const PostingListUsed* posting_list_used) const override;
+
+ // Returns bytes used by actual hits.
+ uint32_t GetBytesUsed(
+ const PostingListUsed* posting_list_used) const override;
+
+ void Clear(PostingListUsed* posting_list_used) const override;
+
+ // Moves contents from posting list 'src' to 'dst'. Clears 'src'.
//
// RETURNS:
- // - OK, if successful
- // - INVALID_ARGUMENT if 'other' is not valid or 'other' is too large to fit
- // in 'this'.
- // - FAILED_PRECONDITION if 'this' posting list is in a corrupted state.
- libtextclassifier3::Status MoveFrom(PostingListUsed *other);
-
- // Min size of posting list that can fit these used bytes. (See
- // MoveFrom.)
- uint32_t MinPostingListSizeToFit() const;
+ // - OK on success
+ // - INVALID_ARGUMENT if 'src' is not valid or 'src' is too large to fit in
+ // 'dst'.
+ // - FAILED_PRECONDITION if 'dst' posting list is in a corrupted state.
+ libtextclassifier3::Status MoveFrom(PostingListUsed* dst,
+ PostingListUsed* src) const override;
// Prepend a hit to the posting list.
+ //
// RETURNS:
// - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the
- // previously added hit.
+ // previously added hit.
// - RESOURCE_EXHAUSTED if there is no more room to add hit to the posting
- // list.
- libtextclassifier3::Status PrependHit(const Hit &hit);
+ // list.
+ libtextclassifier3::Status PrependHit(PostingListUsed* posting_list_used,
+ const Hit& hit) const;
- // Prepend hits to the posting list. Hits should be sorted in
- // descending order (as defined by the less than operator for Hit)
+ // Prepend hits to the posting list. Hits should be sorted in descending order
+ // (as defined by the less than operator for Hit)
//
// Returns the number of hits that could be prepended to the posting list. If
// keep_prepended is true, whatever could be prepended is kept, otherwise the
// posting list is left in its original state.
- template <class T, Hit (*GetHit)(const T &)>
- uint32_t PrependHitArray(const T *array, uint32_t num_hits,
- bool keep_prepended);
+ template <class T, Hit (*GetHit)(const T&)>
+ uint32_t PrependHitArray(PostingListUsed* posting_list_used, const T* array,
+ uint32_t num_hits, bool keep_prepended) const;
// Retrieves the hits stored in the posting list.
//
// RETURNS:
// - On success, a vector of hits sorted by the reverse order of prepending.
// - INTERNAL_ERROR if the posting list has been corrupted somehow.
- libtextclassifier3::StatusOr<std::vector<Hit>> GetHits() const;
+ libtextclassifier3::StatusOr<std::vector<Hit>> GetHits(
+ const PostingListUsed* posting_list_used) const;
// Same as GetHits but appends hits to hits_out.
//
// RETURNS:
// - On success, a vector of hits sorted by the reverse order of prepending.
// - INTERNAL_ERROR if the posting list has been corrupted somehow.
- libtextclassifier3::Status GetHits(std::vector<Hit> *hits_out) const;
+ libtextclassifier3::Status GetHits(const PostingListUsed* posting_list_used,
+ std::vector<Hit>* hits_out) const;
// Undo the last num_hits hits prepended. If num_hits > number of
// hits we clear all hits.
@@ -117,10 +106,8 @@ class PostingListUsed {
// RETURNS:
// - OK on success
// - INTERNAL_ERROR if the posting list has been corrupted somehow.
- libtextclassifier3::Status PopFrontHits(uint32_t num_hits);
-
- // Returns bytes used by actual hits.
- uint32_t BytesUsed() const;
+ libtextclassifier3::Status PopFrontHits(PostingListUsed* posting_list_used,
+ uint32_t num_hits) const;
private:
// Posting list layout formats:
@@ -201,71 +188,83 @@ class PostingListUsed {
// -+ | 0x07FFF320 |0x07FFF40E,87| 0x000 | 196 | 434 | 125 | 788
// |
// +-------------+-------------+---------+----------+---------+------+---------+
- PostingListUsed(void *posting_list_buffer, uint32_t size_in_bytes)
- : posting_list_buffer_(static_cast<uint8_t *>(posting_list_buffer)),
- size_in_bytes_(size_in_bytes) {}
// Helpers to determine what state the posting list is in.
- bool full() const {
- return get_special_hit(0).ValueOrDie().is_valid() &&
- get_special_hit(1).ValueOrDie().is_valid();
+ bool IsFull(const PostingListUsed* posting_list_used) const {
+ return GetSpecialHit(posting_list_used, /*index=*/0)
+ .ValueOrDie()
+ .is_valid() &&
+ GetSpecialHit(posting_list_used, /*index=*/1)
+ .ValueOrDie()
+ .is_valid();
}
- bool almost_full() const {
- return !get_special_hit(0).ValueOrDie().is_valid();
+
+ bool IsAlmostFull(const PostingListUsed* posting_list_used) const {
+ return !GetSpecialHit(posting_list_used, /*index=*/0)
+ .ValueOrDie()
+ .is_valid();
}
- bool empty() const {
- return get_special_hit(0).ValueOrDie().value() == size_in_bytes_ &&
- !get_special_hit(1).ValueOrDie().is_valid();
+
+ bool IsEmpty(const PostingListUsed* posting_list_used) const {
+ return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() ==
+ posting_list_used->size_in_bytes() &&
+ !GetSpecialHit(posting_list_used, /*index=*/1)
+ .ValueOrDie()
+ .is_valid();
}
// Returns false if both special hits are invalid or if the offset value
// stored in the special hit is less than kSpecialHitsSize or greater than
- // size_in_bytes_. Returns true, otherwise.
- bool IsPostingListValid() const;
+ // posting_list_used->size_in_bytes(). Returns true, otherwise.
+ bool IsPostingListValid(const PostingListUsed* posting_list_used) const;
// Prepend hit to a posting list that is in the ALMOST_FULL state.
// RETURNS:
// - OK, if successful
// - INVALID_ARGUMENT if hit is not less than the previously added hit.
- libtextclassifier3::Status PrependHitToAlmostFull(const Hit &hit);
+ libtextclassifier3::Status PrependHitToAlmostFull(
+ PostingListUsed* posting_list_used, const Hit& hit) const;
// Prepend hit to a posting list that is in the EMPTY state. This will always
// succeed because there are no pre-existing hits and no validly constructed
// posting list could fail to fit one hit.
- void PrependHitToEmpty(const Hit &hit);
+ void PrependHitToEmpty(PostingListUsed* posting_list_used,
+ const Hit& hit) const;
// Prepend hit to a posting list that is in the NOT_FULL state.
// RETURNS:
// - OK, if successful
// - INVALID_ARGUMENT if hit is not less than the previously added hit.
- libtextclassifier3::Status PrependHitToNotFull(const Hit &hit,
- uint32_t offset);
-
- // Reset contents to an empty posting list. This *must* be called if the
- // posting_list_buffer_ region is uninitialized.
- void Clear();
+ libtextclassifier3::Status PrependHitToNotFull(
+ PostingListUsed* posting_list_used, const Hit& hit,
+ uint32_t offset) const;
// Returns either 0 (full state), sizeof(Hit) (almost_full state) or
- // a byte offset between kSpecialHitsSize and size_in_bytes_ (inclusive)
- // (not_full state).
- uint32_t get_start_byte_offset() const;
+ // a byte offset between kSpecialHitsSize and
+ // posting_list_used->size_in_bytes() (inclusive) (not_full state).
+ uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const;
// Sets the special hits to properly reflect what offset is (see layout
// comment for further details).
//
- // Returns false if offset > size_in_bytes_ or offset is (kSpecialHitsSize,
- // sizeof(Hit)) or offset is (sizeof(Hit), 0). True, otherwise.
- bool set_start_byte_offset(uint32_t offset);
+ // Returns false if offset > posting_list_used->size_in_bytes() or offset is
+ // (kSpecialHitsSize, sizeof(Hit)) or offset is (sizeof(Hit), 0). True,
+ // otherwise.
+ bool SetStartByteOffset(PostingListUsed* posting_list_used,
+ uint32_t offset) const;
// Manipulate padded areas. We never store the same hit value twice
// so a delta of 0 is a pad byte.
// Returns offset of first non-pad byte.
- uint32_t GetPadEnd(uint32_t offset) const;
+ uint32_t GetPadEnd(const PostingListUsed* posting_list_used,
+ uint32_t offset) const;
// Fill padding between offset start and offset end with 0s.
- // Returns false if end > size_in_bytes_. True, otherwise.
- bool PadToEnd(uint32_t start, uint32_t end);
+ // Returns false if end > posting_list_used->size_in_bytes(). True,
+ // otherwise.
+ bool PadToEnd(PostingListUsed* posting_list_used, uint32_t start,
+ uint32_t end) const;
// Helper for AppendHits/PopFrontHits. Adds limit number of hits to out or all
// hits in the posting list if the posting list contains less than limit
@@ -279,19 +278,22 @@ class PostingListUsed {
// RETURNS:
// - OK on success
// - INTERNAL_ERROR if the posting list has been corrupted somehow.
- libtextclassifier3::Status GetHitsInternal(uint32_t limit, bool pop,
- std::vector<Hit> *out) const;
+ libtextclassifier3::Status GetHitsInternal(
+ const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
+ std::vector<Hit>* out) const;
// Retrieves the value stored in the index-th special hit.
//
// RETURNS:
// - A valid Hit, on success
- // - INVALID_ARGUMENT if index is not less than kNumSpecialHits
- libtextclassifier3::StatusOr<Hit> get_special_hit(uint32_t index) const;
+ // - INVALID_ARGUMENT if index is not less than kNumSpecialData
+ libtextclassifier3::StatusOr<Hit> GetSpecialHit(
+ const PostingListUsed* posting_list_used, uint32_t index) const;
// Sets the value stored in the index-th special hit to val. If index is not
// less than kSpecialHitSize / sizeof(Hit), this has no effect.
- bool set_special_hit(uint32_t index, const Hit &val);
+ bool SetSpecialHit(PostingListUsed* posting_list_used, uint32_t index,
+ const Hit& val) const;
// Prepends hit to the memory region [offset - sizeof(Hit), offset] and
// returns the new beginning of the padded region.
@@ -301,7 +303,8 @@ class PostingListUsed {
// - INVALID_ARGUMENT if hit will not fit (uncompressed) between offset and
// kSpecialHitsSize
libtextclassifier3::StatusOr<uint32_t> PrependHitUncompressed(
- const Hit &hit, uint32_t offset);
+ PostingListUsed* posting_list_used, const Hit& hit,
+ uint32_t offset) const;
// If hit has a term frequency, consumes the term frequency at offset, updates
// hit to include the term frequency and updates offset to reflect that the
@@ -310,29 +313,25 @@ class PostingListUsed {
// RETURNS:
// - OK, if successful
// - INVALID_ARGUMENT if hit has a term frequency and offset +
- // sizeof(Hit::TermFrequency) >=
- // size_in_bytes_
+ // sizeof(Hit::TermFrequency) >= posting_list_used->size_in_bytes()
libtextclassifier3::Status ConsumeTermFrequencyIfPresent(
- Hit *hit, uint32_t *offset) const;
-
- // A byte array of size size_in_bytes_ containing encoded hits for this
- // posting list.
- uint8_t *posting_list_buffer_; // does not own!
- uint32_t size_in_bytes_;
+ const PostingListUsed* posting_list_used, Hit* hit,
+ uint32_t* offset) const;
};
// Inlined functions. Implementation details below. Avert eyes!
-template <class T, Hit (*GetHit)(const T &)>
-uint32_t PostingListUsed::PrependHitArray(const T *array, uint32_t num_hits,
- bool keep_prepended) {
- if (!IsPostingListValid()) {
+template <class T, Hit (*GetHit)(const T&)>
+uint32_t PostingListUsedHitSerializer::PrependHitArray(
+ PostingListUsed* posting_list_used, const T* array, uint32_t num_hits,
+ bool keep_prepended) const {
+ if (!IsPostingListValid(posting_list_used)) {
return 0;
}
// Prepend hits working backwards from array[num_hits - 1].
uint32_t i;
for (i = 0; i < num_hits; ++i) {
- if (!PrependHit(GetHit(array[num_hits - i - 1])).ok()) {
+ if (!PrependHit(posting_list_used, GetHit(array[num_hits - i - 1])).ok()) {
break;
}
}
@@ -341,7 +340,7 @@ uint32_t PostingListUsed::PrependHitArray(const T *array, uint32_t num_hits,
// before. PopFrontHits guarantees that it will remove all 'i' hits so long
// as there are at least 'i' hits in the posting list, which we know there
// are.
- PopFrontHits(i);
+ PopFrontHits(posting_list_used, /*num_hits=*/i);
}
return i;
}
@@ -349,4 +348,4 @@ uint32_t PostingListUsed::PrependHitArray(const T *array, uint32_t num_hits,
} // namespace lib
} // namespace icing
-#endif // ICING_INDEX_MAIN_POSTING_LIST_USED_H_
+#endif // ICING_INDEX_MAIN_POSTING_LIST_USED_HIT_SERIALIZER_H_
diff --git a/icing/index/main/posting-list-used_test.cc b/icing/index/main/posting-list-used-hit-serializer_test.cc
index 044d0c1..b87adc9 100644
--- a/icing/index/main/posting-list-used_test.cc
+++ b/icing/index/main/posting-list-used-hit-serializer_test.cc
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Google LLC
+// Copyright (C) 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,35 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/main/posting-list-used.h"
+#include "icing/index/main/posting-list-used-hit-serializer.h"
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#include <algorithm>
#include <cstdint>
#include <deque>
-#include <iterator>
#include <memory>
-#include <random>
-#include <string>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
-#include "icing/index/main/posting-list-utils.h"
-#include "icing/legacy/index/icing-bit-util.h"
-#include "icing/schema/section.h"
-#include "icing/store/document-id.h"
+#include "icing/file/posting_list/posting-list-used.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/hit-test-utils.h"
-using std::reverse;
-using std::vector;
using testing::ElementsAre;
using testing::ElementsAreArray;
using testing::Eq;
@@ -51,18 +36,20 @@ using testing::Lt;
namespace icing {
namespace lib {
+namespace {
+
struct HitElt {
HitElt() = default;
explicit HitElt(const Hit &hit_in) : hit(hit_in) {}
- static Hit get_hit(const HitElt &hit_elt) {
- return hit_elt.hit;
- }
+ static Hit get_hit(const HitElt &hit_elt) { return hit_elt.hit; }
Hit hit;
};
-TEST(PostingListTest, PostingListUsedPrependHitNotFull) {
+TEST(PostingListUsedHitSerializerTest, PostingListUsedPrependHitNotFull) {
+ PostingListUsedHitSerializer serializer;
+
static const int kNumHits = 2551;
static const size_t kHitsSize = kNumHits * sizeof(Hit);
@@ -70,52 +57,56 @@ TEST(PostingListTest, PostingListUsedPrependHitNotFull) {
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used,
PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
+ &serializer, static_cast<void *>(hits_buf.get()), kHitsSize));
// Make used.
Hit hit0(/*section_id=*/0, 0, /*term_frequency=*/56);
- pl_used.PrependHit(hit0);
+ serializer.PrependHit(&pl_used, hit0);
// Size = sizeof(uncompressed hit0)
int expected_size = sizeof(Hit);
- EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit0)));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit0)));
Hit hit1(/*section_id=*/0, 1, Hit::kDefaultTermFrequency);
- pl_used.PrependHit(hit1);
+ serializer.PrependHit(&pl_used, hit1);
// Size = sizeof(uncompressed hit1)
// + sizeof(hit0-hit1) + sizeof(hit0::term_frequency)
expected_size += 2 + sizeof(Hit::TermFrequency);
- EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit1, hit0)));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit1, hit0)));
Hit hit2(/*section_id=*/0, 2, /*term_frequency=*/56);
- pl_used.PrependHit(hit2);
+ serializer.PrependHit(&pl_used, hit2);
// Size = sizeof(uncompressed hit2)
// + sizeof(hit1-hit2)
// + sizeof(hit0-hit1) + sizeof(hit0::term_frequency)
expected_size += 2;
- EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit2, hit1, hit0)));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit2, hit1, hit0)));
Hit hit3(/*section_id=*/0, 3, Hit::kDefaultTermFrequency);
- pl_used.PrependHit(hit3);
+ serializer.PrependHit(&pl_used, hit3);
// Size = sizeof(uncompressed hit3)
// + sizeof(hit2-hit3) + sizeof(hit2::term_frequency)
// + sizeof(hit1-hit2)
// + sizeof(hit0-hit1) + sizeof(hit0::term_frequency)
expected_size += 2 + sizeof(Hit::TermFrequency);
- EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
- EXPECT_THAT(pl_used.GetHits(),
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
IsOkAndHolds(ElementsAre(hit3, hit2, hit1, hit0)));
}
-TEST(PostingListTest, PostingListUsedPrependHitAlmostFull) {
- constexpr int kHitsSize = 2 * posting_list_utils::min_posting_list_size();
- std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize);
+TEST(PostingListUsedHitSerializerTest, PostingListUsedPrependHitAlmostFull) {
+ PostingListUsedHitSerializer serializer;
+
+ int size = 2 * serializer.GetMinPostingListSize();
+ std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(size);
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used,
PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
+ &serializer, static_cast<void *>(hits_buf.get()), size));
// Fill up the compressed region.
// Transitions:
@@ -125,17 +116,18 @@ TEST(PostingListTest, PostingListUsedPrependHitAlmostFull) {
Hit hit0(/*section_id=*/0, 0, Hit::kDefaultTermFrequency);
Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/2);
Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2);
- ICING_EXPECT_OK(pl_used.PrependHit(hit0));
- ICING_EXPECT_OK(pl_used.PrependHit(hit1));
- ICING_EXPECT_OK(pl_used.PrependHit(hit2));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit0));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit1));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit2));
// Size used will be 2+2+4=8 bytes
int expected_size = sizeof(Hit::Value) + 2 + 2;
- EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit2, hit1, hit0)));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit2, hit1, hit0)));
// Add one more hit to transition NOT_FULL -> ALMOST_FULL
Hit hit3 = CreateHit(hit2, /*desired_byte_length=*/3);
- ICING_EXPECT_OK(pl_used.PrependHit(hit3));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit3));
// Compressed region would be 2+2+3+4=11 bytes, but the compressed region is
// only 10 bytes. So instead, the posting list will transition to ALMOST_FULL.
// The in-use compressed region will actually shrink from 8 bytes to 7 bytes
@@ -143,91 +135,100 @@ TEST(PostingListTest, PostingListUsedPrependHitAlmostFull) {
// compressed delta of hit2. hit3 will be written to one of the special hits.
// Because we're in ALMOST_FULL, the expected size is the size of the pl minus
// the one hit used to mark the posting list as ALMOST_FULL.
- expected_size = kHitsSize - sizeof(Hit);
- EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
- EXPECT_THAT(pl_used.GetHits(),
+ expected_size = size - sizeof(Hit);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
IsOkAndHolds(ElementsAre(hit3, hit2, hit1, hit0)));
// Add one more hit to transition ALMOST_FULL -> ALMOST_FULL
Hit hit4 = CreateHit(hit3, /*desired_byte_length=*/2);
- ICING_EXPECT_OK(pl_used.PrependHit(hit4));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit4));
// There are currently 7 bytes in use in the compressed region. hit3 will have
// a 2-byte delta. That delta will fit in the compressed region (which will
// now have 9 bytes in use), hit4 will be placed in one of the special hits
// and the posting list will remain in ALMOST_FULL.
- EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
- EXPECT_THAT(pl_used.GetHits(),
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
IsOkAndHolds(ElementsAre(hit4, hit3, hit2, hit1, hit0)));
// Add one more hit to transition ALMOST_FULL -> FULL
Hit hit5 = CreateHit(hit4, /*desired_byte_length=*/2);
- ICING_EXPECT_OK(pl_used.PrependHit(hit5));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit5));
// There are currently 9 bytes in use in the compressed region. hit4 will have
// a 2-byte delta which will not fit in the compressed region. So hit4 will
// remain in one of the special hits and hit5 will occupy the other, making
// the posting list FULL.
- EXPECT_THAT(pl_used.BytesUsed(), Le(kHitsSize));
- EXPECT_THAT(pl_used.GetHits(),
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
IsOkAndHolds(ElementsAre(hit5, hit4, hit3, hit2, hit1, hit0)));
// The posting list is FULL. Adding another hit should fail.
Hit hit6 = CreateHit(hit5, /*desired_byte_length=*/1);
- EXPECT_THAT(pl_used.PrependHit(hit6),
+ EXPECT_THAT(serializer.PrependHit(&pl_used, hit6),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
}
-TEST(PostingListTest, PostingListUsedMinSize) {
+TEST(PostingListUsedHitSerializerTest, PostingListUsedMinSize) {
+ PostingListUsedHitSerializer serializer;
+
std::unique_ptr<char[]> hits_buf =
- std::make_unique<char[]>(posting_list_utils::min_posting_list_size());
+ std::make_unique<char[]>(serializer.GetMinPostingListSize());
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()),
- posting_list_utils::min_posting_list_size()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf.get()),
+ serializer.GetMinPostingListSize()));
// PL State: EMPTY
- EXPECT_THAT(pl_used.BytesUsed(), Eq(0));
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0));
+ EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty()));
// Add a hit, PL should shift to ALMOST_FULL state
Hit hit0(/*section_id=*/0, 0, /*term_frequency=*/0,
/*is_in_prefix_section=*/false,
/*is_prefix_hit=*/true);
- ICING_EXPECT_OK(pl_used.PrependHit(hit0));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit0));
// Size = sizeof(uncompressed hit0)
int expected_size = sizeof(Hit);
- EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit0)));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit0)));
// Add the smallest hit possible - no term_frequency and a delta of 1. PL
// should shift to FULL state.
Hit hit1(/*section_id=*/0, 0, /*term_frequency=*/0,
/*is_in_prefix_section=*/true,
/*is_prefix_hit=*/false);
- ICING_EXPECT_OK(pl_used.PrependHit(hit1));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit1));
// Size = sizeof(uncompressed hit1) + sizeof(uncompressed hit0)
expected_size += sizeof(Hit);
- EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit1, hit0)));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit1, hit0)));
// Try to add the smallest hit possible. Should fail
Hit hit2(/*section_id=*/0, 0, /*term_frequency=*/0,
/*is_in_prefix_section=*/false,
/*is_prefix_hit=*/false);
- EXPECT_THAT(pl_used.PrependHit(hit2),
+ EXPECT_THAT(serializer.PrependHit(&pl_used, hit2),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
- EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size));
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit1, hit0)));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit1, hit0)));
}
-TEST(PostingListTest, PostingListPrependHitArrayMinSizePostingList) {
+TEST(PostingListUsedHitSerializerTest,
+ PostingListPrependHitArrayMinSizePostingList) {
+ PostingListUsedHitSerializer serializer;
+
constexpr int kFinalSize = 1025;
std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kFinalSize);
// Min Size = 10
- int size = posting_list_utils::min_posting_list_size();
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), size));
+ int size = serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf.get()), size));
std::vector<HitElt> hits_in;
hits_in.emplace_back(Hit(1, 0, Hit::kDefaultTermFrequency));
@@ -243,32 +244,37 @@ TEST(PostingListTest, PostingListPrependHitArrayMinSizePostingList) {
// Add five hits. The PL is in the empty state and an empty min size PL can
// only fit two hits. So PrependHitArray should fail.
- uint32_t num_can_prepend = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- &hits_in[0], hits_in.size(), false);
+ uint32_t num_can_prepend =
+ serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hits_in[0], hits_in.size(), false);
EXPECT_THAT(num_can_prepend, Eq(2));
int can_fit_hits = num_can_prepend;
// The PL has room for 2 hits. We should be able to add them without any
// problem, transitioning the PL from EMPTY -> ALMOST_FULL -> FULL
const HitElt *hits_in_ptr = hits_in.data() + (hits_in.size() - 2);
- num_can_prepend = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- hits_in_ptr, can_fit_hits, false);
+ num_can_prepend = serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, hits_in_ptr, can_fit_hits, false);
EXPECT_THAT(num_can_prepend, Eq(can_fit_hits));
- EXPECT_THAT(size, Eq(pl_used.BytesUsed()));
+ EXPECT_THAT(size, Eq(serializer.GetBytesUsed(&pl_used)));
std::deque<Hit> hits_pushed;
std::transform(hits_in.rbegin(),
hits_in.rend() - hits_in.size() + can_fit_hits,
std::front_inserter(hits_pushed), HitElt::get_hit);
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed)));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
}
-TEST(PostingListTest, PostingListPrependHitArrayPostingList) {
+TEST(PostingListUsedHitSerializerTest, PostingListPrependHitArrayPostingList) {
+ PostingListUsedHitSerializer serializer;
+
// Size = 30
- int size = 3 * posting_list_utils::min_posting_list_size();
+ int size = 3 * serializer.GetMinPostingListSize();
std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(size);
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), size));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf.get()), size));
std::vector<HitElt> hits_in;
hits_in.emplace_back(Hit(1, 0, Hit::kDefaultTermFrequency));
@@ -297,14 +303,15 @@ TEST(PostingListTest, PostingListPrependHitArrayPostingList) {
// Add five hits. The PL is in the empty state and should be able to fit all
// five hits without issue, transitioning the PL from EMPTY -> NOT_FULL.
- uint32_t num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- &hits_in[0], hits_in.size(), false);
+ uint32_t num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hits_in[0], hits_in.size(), false);
EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
- EXPECT_THAT(byte_size, Eq(pl_used.BytesUsed()));
+ EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used)));
std::deque<Hit> hits_pushed;
std::transform(hits_in.rbegin(), hits_in.rend(),
std::front_inserter(hits_pushed), HitElt::get_hit);
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed)));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
Hit first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/1);
hits_in.clear();
@@ -341,14 +348,15 @@ TEST(PostingListTest, PostingListPrependHitArrayPostingList) {
// Add these 6 hits. The PL is currently in the NOT_FULL state and should
// remain in the NOT_FULL state.
- num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- &hits_in[0], hits_in.size(), false);
+ num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hits_in[0], hits_in.size(), false);
EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
- EXPECT_THAT(byte_size, Eq(pl_used.BytesUsed()));
+ EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used)));
// All hits from hits_in were added.
std::transform(hits_in.rbegin(), hits_in.rend(),
std::front_inserter(hits_pushed), HitElt::get_hit);
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed)));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/3);
hits_in.clear();
@@ -374,14 +382,15 @@ TEST(PostingListTest, PostingListPrependHitArrayPostingList) {
// Add this 1 hit. The PL is currently in the NOT_FULL state and should
// transition to the ALMOST_FULL state - even though there is still some
// unused space.
- num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- &hits_in[0], hits_in.size(), false);
+ num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hits_in[0], hits_in.size(), false);
EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
- EXPECT_THAT(byte_size, Eq(pl_used.BytesUsed()));
+ EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used)));
// All hits from hits_in were added.
std::transform(hits_in.rbegin(), hits_in.rend(),
std::front_inserter(hits_pushed), HitElt::get_hit);
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed)));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/1);
hits_in.clear();
@@ -413,17 +422,20 @@ TEST(PostingListTest, PostingListPrependHitArrayPostingList) {
// second hit should tranisition to the FULL state because the delta between
// Hit #13 and Hit #14 (2 bytes) is larger than the remaining unused area
// (1 byte).
- num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- &hits_in[0], hits_in.size(), false);
+ num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hits_in[0], hits_in.size(), false);
EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
- EXPECT_THAT(size, Eq(pl_used.BytesUsed()));
+ EXPECT_THAT(size, Eq(serializer.GetBytesUsed(&pl_used)));
// All hits from hits_in were added.
std::transform(hits_in.rbegin(), hits_in.rend(),
std::front_inserter(hits_pushed), HitElt::get_hit);
- EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed)));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
}
-TEST(PostingListTest, PostingListPrependHitArrayTooManyHits) {
+TEST(PostingListUsedHitSerializerTest, PostingListPrependHitArrayTooManyHits) {
+ PostingListUsedHitSerializer serializer;
+
static constexpr int kNumHits = 128;
static constexpr int kDeltaSize = 1;
static constexpr int kTermFrequencySize = 1;
@@ -433,150 +445,171 @@ TEST(PostingListTest, PostingListPrependHitArrayTooManyHits) {
std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize);
// Create an array with one too many hits
- vector<Hit> hits_in_too_many =
+ std::vector<Hit> hits_in_too_many =
CreateHits(kNumHits + 1, /*desired_byte_length=*/1);
- vector<HitElt> hit_elts_in_too_many;
+ std::vector<HitElt> hit_elts_in_too_many;
for (const Hit &hit : hits_in_too_many) {
hit_elts_in_too_many.emplace_back(hit);
}
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()),
- posting_list_utils::min_posting_list_size()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf.get()),
+ serializer.GetMinPostingListSize()));
// PrependHitArray should fail because hit_elts_in_too_many is far too large
// for the minimum size pl.
- uint32_t num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), false);
+ uint32_t num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), false);
ASSERT_THAT(num_could_fit, Lt(hit_elts_in_too_many.size()));
- ASSERT_THAT(pl_used.BytesUsed(), Eq(0));
- ASSERT_THAT(pl_used.GetHits(), IsOkAndHolds(IsEmpty()));
+ ASSERT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0));
+ ASSERT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty()));
ICING_ASSERT_OK_AND_ASSIGN(
- pl_used, PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
+ pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf.get()), kHitsSize));
// PrependHitArray should fail because hit_elts_in_too_many is one hit too
// large for this pl.
- num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), false);
+ num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), false);
ASSERT_THAT(num_could_fit, Lt(hit_elts_in_too_many.size()));
- ASSERT_THAT(pl_used.BytesUsed(), Eq(0));
- ASSERT_THAT(pl_used.GetHits(), IsOkAndHolds(IsEmpty()));
+ ASSERT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0));
+ ASSERT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty()));
}
-TEST(PostingListTest, PostingListStatusJumpFromNotFullToFullAndBack) {
+TEST(PostingListUsedHitSerializerTest,
+ PostingListStatusJumpFromNotFullToFullAndBack) {
+ PostingListUsedHitSerializer serializer;
+
const uint32_t pl_size = 3 * sizeof(Hit);
char hits_buf[pl_size];
- ICING_ASSERT_OK_AND_ASSIGN(
- PostingListUsed pl,
- PostingListUsed::CreateFromUnitializedRegion(hits_buf, pl_size));
- ICING_ASSERT_OK(pl.PrependHit(Hit(Hit::kInvalidValue - 1, 0)));
- uint32_t bytes_used = pl.BytesUsed();
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, hits_buf, pl_size));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl, Hit(Hit::kInvalidValue - 1, 0)));
+ uint32_t bytes_used = serializer.GetBytesUsed(&pl);
// Status not full.
- ASSERT_THAT(bytes_used, Le(pl_size - posting_list_utils::kSpecialHitsSize));
- ICING_ASSERT_OK(pl.PrependHit(Hit(Hit::kInvalidValue >> 2, 0)));
+ ASSERT_THAT(bytes_used,
+ Le(pl_size - PostingListUsedHitSerializer::kSpecialHitsSize));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl, Hit(Hit::kInvalidValue >> 2, 0)));
// Status should jump to full directly.
- ASSERT_THAT(pl.BytesUsed(), Eq(pl_size));
- pl.PopFrontHits(1);
+ ASSERT_THAT(serializer.GetBytesUsed(&pl), Eq(pl_size));
+ serializer.PopFrontHits(&pl, 1);
// Status should return to not full as before.
- ASSERT_THAT(pl.BytesUsed(), Eq(bytes_used));
+ ASSERT_THAT(serializer.GetBytesUsed(&pl), Eq(bytes_used));
}
-TEST(PostingListTest, DeltaOverflow) {
+TEST(PostingListUsedHitSerializerTest, DeltaOverflow) {
+ PostingListUsedHitSerializer serializer;
+
char hits_buf[1000];
- ICING_ASSERT_OK_AND_ASSIGN(
- PostingListUsed pl,
- PostingListUsed::CreateFromUnitializedRegion(hits_buf, 4 * sizeof(Hit)));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, hits_buf, 4 * sizeof(Hit)));
static const Hit::Value kOverflow[4] = {
- Hit::kInvalidValue >> 2,
- (Hit::kInvalidValue >> 2) * 2,
- (Hit::kInvalidValue >> 2) * 3,
- Hit::kInvalidValue - 1,
+ Hit::kInvalidValue >> 2,
+ (Hit::kInvalidValue >> 2) * 2,
+ (Hit::kInvalidValue >> 2) * 3,
+ Hit::kInvalidValue - 1,
};
// Fit at least 4 ordinary values.
for (Hit::Value v = 0; v < 4; v++) {
- ICING_EXPECT_OK(pl.PrependHit(Hit(4 - v)));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(4 - v)));
}
// Cannot fit 4 overflow values.
ICING_ASSERT_OK_AND_ASSIGN(pl, PostingListUsed::CreateFromUnitializedRegion(
- hits_buf, 4 * sizeof(Hit)));
- ICING_EXPECT_OK(pl.PrependHit(Hit(kOverflow[3])));
- ICING_EXPECT_OK(pl.PrependHit(Hit(kOverflow[2])));
+ &serializer, hits_buf, 4 * sizeof(Hit)));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[3])));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[2])));
// Can fit only one more.
- ICING_EXPECT_OK(pl.PrependHit(Hit(kOverflow[1])));
- EXPECT_THAT(pl.PrependHit(Hit(kOverflow[0])),
+ ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[1])));
+ EXPECT_THAT(serializer.PrependHit(&pl, Hit(kOverflow[0])),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
}
-TEST(PostingListTest, MoveFrom) {
- int size = 3 * posting_list_utils::min_posting_list_size();
+TEST(PostingListUsedHitSerializerTest, MoveFrom) {
+ PostingListUsedHitSerializer serializer;
+
+ int size = 3 * serializer.GetMinPostingListSize();
std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size);
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf1.get()), size));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf1.get()), size));
std::vector<Hit> hits1 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
for (const Hit &hit : hits1) {
- ICING_ASSERT_OK(pl_used1.PrependHit(hit));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit));
}
std::unique_ptr<char[]> hits_buf2 = std::make_unique<char[]>(size);
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used2,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf2.get()), size));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf2.get()), size));
std::vector<Hit> hits2 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2);
for (const Hit &hit : hits2) {
- ICING_ASSERT_OK(pl_used2.PrependHit(hit));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used2, hit));
}
- ICING_ASSERT_OK(pl_used2.MoveFrom(&pl_used1));
- EXPECT_THAT(pl_used2.GetHits(),
+ ICING_ASSERT_OK(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1));
+ EXPECT_THAT(serializer.GetHits(&pl_used2),
IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
- EXPECT_THAT(pl_used1.GetHits(), IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(serializer.GetHits(&pl_used1), IsOkAndHolds(IsEmpty()));
}
-TEST(PostingListTest, MoveFromNullArgumentReturnsInvalidArgument) {
- int size = 3 * posting_list_utils::min_posting_list_size();
+TEST(PostingListUsedHitSerializerTest,
+ MoveFromNullArgumentReturnsInvalidArgument) {
+ PostingListUsedHitSerializer serializer;
+
+ int size = 3 * serializer.GetMinPostingListSize();
std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size);
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf1.get()), size));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf1.get()), size));
std::vector<Hit> hits = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
for (const Hit &hit : hits) {
- ICING_ASSERT_OK(pl_used1.PrependHit(hit));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit));
}
- EXPECT_THAT(pl_used1.MoveFrom(/*other=*/nullptr),
+ EXPECT_THAT(serializer.MoveFrom(&pl_used1, /*other=*/nullptr),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
- EXPECT_THAT(pl_used1.GetHits(),
+ EXPECT_THAT(serializer.GetHits(&pl_used1),
IsOkAndHolds(ElementsAreArray(hits.rbegin(), hits.rend())));
}
-TEST(PostingListTest, MoveFromInvalidPostingListReturnsInvalidArgument) {
- int size = 3 * posting_list_utils::min_posting_list_size();
+TEST(PostingListUsedHitSerializerTest,
+ MoveFromInvalidPostingListReturnsInvalidArgument) {
+ PostingListUsedHitSerializer serializer;
+
+ int size = 3 * serializer.GetMinPostingListSize();
std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size);
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf1.get()), size));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf1.get()), size));
std::vector<Hit> hits1 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
for (const Hit &hit : hits1) {
- ICING_ASSERT_OK(pl_used1.PrependHit(hit));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit));
}
std::unique_ptr<char[]> hits_buf2 = std::make_unique<char[]>(size);
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used2,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf2.get()), size));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf2.get()), size));
std::vector<Hit> hits2 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2);
for (const Hit &hit : hits2) {
- ICING_ASSERT_OK(pl_used2.PrependHit(hit));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used2, hit));
}
// Write invalid hits to the beginning of pl_used1 to make it invalid.
@@ -585,32 +618,37 @@ TEST(PostingListTest, MoveFromInvalidPostingListReturnsInvalidArgument) {
*first_hit = invalid_hit;
++first_hit;
*first_hit = invalid_hit;
- EXPECT_THAT(pl_used2.MoveFrom(&pl_used1),
+ EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(pl_used2.GetHits(),
+ EXPECT_THAT(serializer.GetHits(&pl_used2),
IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
}
-TEST(PostingListTest, MoveToInvalidPostingListReturnsInvalidArgument) {
- int size = 3 * posting_list_utils::min_posting_list_size();
+TEST(PostingListUsedHitSerializerTest,
+ MoveToInvalidPostingListReturnsInvalidArgument) {
+ PostingListUsedHitSerializer serializer;
+
+ int size = 3 * serializer.GetMinPostingListSize();
std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size);
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf1.get()), size));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf1.get()), size));
std::vector<Hit> hits1 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
for (const Hit &hit : hits1) {
- ICING_ASSERT_OK(pl_used1.PrependHit(hit));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit));
}
std::unique_ptr<char[]> hits_buf2 = std::make_unique<char[]>(size);
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used2,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf2.get()), size));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf2.get()), size));
std::vector<Hit> hits2 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2);
for (const Hit &hit : hits2) {
- ICING_ASSERT_OK(pl_used2.PrependHit(hit));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used2, hit));
}
// Write invalid hits to the beginning of pl_used2 to make it invalid.
@@ -619,50 +657,57 @@ TEST(PostingListTest, MoveToInvalidPostingListReturnsInvalidArgument) {
*first_hit = invalid_hit;
++first_hit;
*first_hit = invalid_hit;
- EXPECT_THAT(pl_used2.MoveFrom(&pl_used1),
+ EXPECT_THAT(serializer.MoveFrom(&pl_used2, &pl_used1),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
- EXPECT_THAT(pl_used1.GetHits(),
+ EXPECT_THAT(serializer.GetHits(&pl_used1),
IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
}
-TEST(PostingListTest, MoveToPostingListTooSmall) {
- int size = 3 * posting_list_utils::min_posting_list_size();
+TEST(PostingListUsedHitSerializerTest, MoveToPostingListTooSmall) {
+ PostingListUsedHitSerializer serializer;
+
+ int size = 3 * serializer.GetMinPostingListSize();
std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size);
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf1.get()), size));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf1.get()), size));
std::vector<Hit> hits1 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
for (const Hit &hit : hits1) {
- ICING_ASSERT_OK(pl_used1.PrependHit(hit));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit));
}
std::unique_ptr<char[]> hits_buf2 =
- std::make_unique<char[]>(posting_list_utils::min_posting_list_size());
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used2,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf2.get()),
- posting_list_utils::min_posting_list_size()));
+ std::make_unique<char[]>(serializer.GetMinPostingListSize());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf2.get()),
+ serializer.GetMinPostingListSize()));
std::vector<Hit> hits2 =
CreateHits(/*num_hits=*/1, /*desired_byte_length=*/2);
for (const Hit &hit : hits2) {
- ICING_ASSERT_OK(pl_used2.PrependHit(hit));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used2, hit));
}
- EXPECT_THAT(pl_used2.MoveFrom(&pl_used1),
+ EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(pl_used1.GetHits(),
+ EXPECT_THAT(serializer.GetHits(&pl_used1),
IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
- EXPECT_THAT(pl_used2.GetHits(),
+ EXPECT_THAT(serializer.GetHits(&pl_used2),
IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
}
-TEST(PostingListTest, PopHitsWithScores) {
- int size = 2 * posting_list_utils::min_posting_list_size();
+TEST(PostingListUsedHitSerializerTest, PopHitsWithScores) {
+ PostingListUsedHitSerializer serializer;
+
+ int size = 2 * serializer.GetMinPostingListSize();
std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size);
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf1.get()), size));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, static_cast<void *>(hits_buf1.get()), size));
// This posting list is 20-bytes. Create four hits that will have deltas of
// two bytes each and all of whom will have a non-default score. This posting
@@ -683,12 +728,13 @@ TEST(PostingListTest, PopHitsWithScores) {
Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/2);
Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2);
Hit hit3 = CreateHit(hit2, /*desired_byte_length=*/2);
- ICING_ASSERT_OK(pl_used.PrependHit(hit0));
- ICING_ASSERT_OK(pl_used.PrependHit(hit1));
- ICING_ASSERT_OK(pl_used.PrependHit(hit2));
- ICING_ASSERT_OK(pl_used.PrependHit(hit3));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit0));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit1));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit2));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit3));
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> hits_out, pl_used.GetHits());
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> hits_out,
+ serializer.GetHits(&pl_used));
EXPECT_THAT(hits_out, ElementsAre(hit3, hit2, hit1, hit0));
// Now, pop the last hit. The posting list should contain the first three
@@ -703,10 +749,12 @@ TEST(PostingListTest, PopHitsWithScores) {
// 9-5 Hit #2
// 4-0 kInvalidHitVal
// ----------------------
- ICING_ASSERT_OK(pl_used.PopFrontHits(1));
- ICING_ASSERT_OK_AND_ASSIGN(hits_out, pl_used.GetHits());
+ ICING_ASSERT_OK(serializer.PopFrontHits(&pl_used, 1));
+ ICING_ASSERT_OK_AND_ASSIGN(hits_out, serializer.GetHits(&pl_used));
EXPECT_THAT(hits_out, ElementsAre(hit2, hit1, hit0));
}
+} // namespace
+
} // namespace lib
} // namespace icing
diff --git a/icing/monkey_test/icing-monkey-test-runner.cc b/icing/monkey_test/icing-monkey-test-runner.cc
new file mode 100644
index 0000000..2dd5a03
--- /dev/null
+++ b/icing/monkey_test/icing-monkey-test-runner.cc
@@ -0,0 +1,442 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/monkey_test/icing-monkey-test-runner.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/monkey_test/in-memory-icing-search-engine.h"
+#include "icing/monkey_test/monkey-test-generators.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::Eq;
+using ::testing::Le;
+using ::testing::SizeIs;
+using ::testing::UnorderedElementsAreArray;
+
+inline constexpr int kNumTypes = 30;
+const std::vector<int> kPossibleNumProperties = {0,
+ 1,
+ 2,
+ 4,
+ 8,
+ 16,
+ kTotalNumSections / 2,
+ kTotalNumSections,
+ kTotalNumSections + 1,
+ kTotalNumSections * 2};
+inline constexpr int kNumNamespaces = 100;
+inline constexpr int kNumURIs = 1000;
+
+// Merge the index for every 131072 hits, i.e. 1 MiB (1024 * 1024 bytes).
+const int kIndexMergeSize = 1024 * 1024;
+
+// A list of pairs of monkey test APIs with frequencies.
+// If f_sum is the sum of all the frequencies, then an operation with
+// frequency f is expected to run f times in every f_sum iterations.
+const std::vector<
+ std::pair<std::function<void(IcingMonkeyTestRunner*)>, uint32_t>>
+ kMonkeyAPISchedules = {{&IcingMonkeyTestRunner::DoPut, 500},
+ {&IcingMonkeyTestRunner::DoSearch, 200},
+ {&IcingMonkeyTestRunner::DoGet, 70},
+ {&IcingMonkeyTestRunner::DoGetAllNamespaces, 50},
+ {&IcingMonkeyTestRunner::DoDelete, 50},
+ {&IcingMonkeyTestRunner::DoDeleteByNamespace, 50},
+ {&IcingMonkeyTestRunner::DoDeleteBySchemaType, 50},
+ {&IcingMonkeyTestRunner::DoDeleteByQuery, 20},
+ {&IcingMonkeyTestRunner::DoOptimize, 5},
+ {&IcingMonkeyTestRunner::ReloadFromDisk, 5}};
+
+SchemaProto GenerateRandomSchema(MonkeyTestRandomEngine* random) {
+ MonkeySchemaGenerator schema_generator(random);
+ return schema_generator.GenerateSchema(kNumTypes, kPossibleNumProperties);
+}
+
+SearchSpecProto GenerateRandomSearchSpecProto(
+ MonkeyTestRandomEngine* random,
+ MonkeyDocumentGenerator* document_generator) {
+ // Get a random token from the language set as a single term query.
+ std::string query(document_generator->GetToken());
+ std::uniform_int_distribution<> dist(0, 1);
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ if (dist(*random) == 1) {
+ term_match_type = TermMatchType::PREFIX;
+ // Randomly drop a suffix of query to test prefix query.
+ std::uniform_int_distribution<> size_dist(1, query.size());
+ query.resize(size_dist(*random));
+ }
+ // 50% chance of getting a section restriction.
+ if (dist(*random) == 1) {
+ const SchemaTypeConfigProto& type_config = document_generator->GetType();
+ if (type_config.properties_size() > 0) {
+ std::uniform_int_distribution<> prop_dist(
+ 0, type_config.properties_size() - 1);
+ query = absl_ports::StrCat(
+ type_config.properties(prop_dist(*random)).property_name(), ":",
+ query);
+ }
+ }
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(term_match_type);
+ search_spec.set_query(query);
+ return search_spec;
+}
+
+ScoringSpecProto GenerateRandomScoringSpec(MonkeyTestRandomEngine* random) {
+ ScoringSpecProto scoring_spec;
+
+ constexpr std::array<ScoringSpecProto::RankingStrategy::Code, 3>
+ ranking_strategies = {
+ ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP,
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE};
+
+ std::uniform_int_distribution<> dist(0, ranking_strategies.size() - 1);
+ scoring_spec.set_rank_by(ranking_strategies[dist(*random)]);
+ return scoring_spec;
+}
+
+ResultSpecProto::SnippetSpecProto GenerateRandomSnippetSpecProto(
+ MonkeyTestRandomEngine* random, const ResultSpecProto& result_spec) {
+ ResultSpecProto::SnippetSpecProto snippet_spec;
+
+ std::uniform_int_distribution<> num_to_snippet_dist(
+ 0, result_spec.num_per_page() * 2);
+ snippet_spec.set_num_to_snippet(num_to_snippet_dist(*random));
+
+ std::uniform_int_distribution<> num_matches_per_property_dist(0, 10);
+ snippet_spec.set_num_matches_per_property(
+ num_matches_per_property_dist(*random));
+
+ std::uniform_int_distribution<> dist(0, 4);
+ int random_num = dist(*random);
+ // 1/5 chance of getting one of 0 (disabled), 8, 32, 128, 512
+ int max_window_utf32_length =
+ random_num == 0 ? 0 : (1 << (2 * random_num + 1));
+ snippet_spec.set_max_window_utf32_length(max_window_utf32_length);
+ return snippet_spec;
+}
+
+ResultSpecProto GenerateRandomResultSpecProto(MonkeyTestRandomEngine* random) {
+ std::uniform_int_distribution<> dist(0, 4);
+ ResultSpecProto result_spec;
+ // 1/5 chance of getting one of 1, 4, 16, 64, 256
+ int num_per_page = 1 << (2 * dist(*random));
+ result_spec.set_num_per_page(num_per_page);
+ *result_spec.mutable_snippet_spec() =
+ GenerateRandomSnippetSpecProto(random, result_spec);
+ return result_spec;
+}
+
+void SortDocuments(std::vector<DocumentProto>& documents) {
+ std::sort(documents.begin(), documents.end(),
+ [](const DocumentProto& doc1, const DocumentProto& doc2) {
+ if (doc1.namespace_() != doc2.namespace_()) {
+ return doc1.namespace_() < doc2.namespace_();
+ }
+ return doc1.uri() < doc2.uri();
+ });
+}
+
+} // namespace
+
+IcingMonkeyTestRunner::IcingMonkeyTestRunner(uint32_t seed)
+ : random_(seed), in_memory_icing_() {
+ ICING_LOG(INFO) << "Monkey test runner started with seed: " << seed;
+
+ SchemaProto schema = GenerateRandomSchema(&random_);
+ ICING_LOG(DBG) << "Schema Generated: " << schema.DebugString();
+
+ in_memory_icing_ =
+ std::make_unique<InMemoryIcingSearchEngine>(&random_, std::move(schema));
+
+ document_generator_ = std::make_unique<MonkeyDocumentGenerator>(
+ &random_, in_memory_icing_->GetSchema(), kNumNamespaces, kNumURIs);
+
+ std::string dir = GetTestTempDir() + "/icing/monkey";
+ filesystem_.DeleteDirectoryRecursively(dir.c_str());
+ icing_dir_ = std::make_unique<DestructibleDirectory>(&filesystem_, dir);
+}
+
+void IcingMonkeyTestRunner::Run(uint32_t num) {
+ ASSERT_TRUE(icing_ != nullptr)
+ << "Icing search engine has not yet been created. Please call "
+ "CreateIcingSearchEngineWithSchema() first";
+
+ uint32_t frequency_sum = 0;
+ for (const auto& schedule : kMonkeyAPISchedules) {
+ frequency_sum += schedule.second;
+ }
+ std::uniform_int_distribution<> dist(0, frequency_sum - 1);
+ for (; num; --num) {
+ int p = dist(random_);
+ for (const auto& schedule : kMonkeyAPISchedules) {
+ if (p < schedule.second) {
+ ASSERT_NO_FATAL_FAILURE(schedule.first(this));
+ break;
+ }
+ p -= schedule.second;
+ }
+ ICING_LOG(INFO) << "Documents in the in-memory icing: "
+ << in_memory_icing_->GetNumAliveDocuments();
+ }
+}
+
+void IcingMonkeyTestRunner::CreateIcingSearchEngineWithSchema() {
+ ASSERT_NO_FATAL_FAILURE(CreateIcingSearchEngine());
+ ASSERT_THAT(icing_->SetSchema(*in_memory_icing_->GetSchema()).status(),
+ ProtoIsOk());
+}
+
+void IcingMonkeyTestRunner::DoGet() {
+ InMemoryIcingSearchEngine::PickDocumentResult document =
+ in_memory_icing_->RandomPickDocument(/*p_alive=*/0.70, /*p_all=*/0.28,
+ /*p_other=*/0.02);
+ ICING_LOG(INFO) << "Monkey getting namespace: " << document.name_space
+ << ", uri: " << document.uri;
+ GetResultProto get_result =
+ icing_->Get(document.name_space, document.uri,
+ GetResultSpecProto::default_instance());
+ if (document.document.has_value()) {
+ ASSERT_THAT(get_result.status(), ProtoIsOk())
+ << "Cannot find the document that is supposed to exist.";
+ ASSERT_THAT(get_result.document(), EqualsProto(document.document.value()))
+ << "The document found does not match with the value in the in-memory "
+ "icing.";
+ } else {
+ // Should expect that no document has been found.
+ if (get_result.status().code() != StatusProto::NOT_FOUND) {
+ if (get_result.status().code() == StatusProto::OK) {
+ FAIL() << "Found a document that is not supposed to be found.";
+ }
+ FAIL() << "Icing search engine failure (code "
+ << get_result.status().code()
+ << "): " << get_result.status().message();
+ }
+ }
+}
+
+void IcingMonkeyTestRunner::DoGetAllNamespaces() {
+ ICING_LOG(INFO) << "Monkey getting all namespaces";
+ GetAllNamespacesResultProto get_result = icing_->GetAllNamespaces();
+ ASSERT_THAT(get_result.status(), ProtoIsOk());
+ ASSERT_THAT(get_result.namespaces(),
+ UnorderedElementsAreArray(in_memory_icing_->GetAllNamespaces()));
+}
+
+void IcingMonkeyTestRunner::DoPut() {
+ MonkeyTokenizedDocument doc = document_generator_->GenerateDocument();
+ ICING_LOG(INFO) << "Monkey document generated, namespace: "
+ << doc.document.namespace_()
+ << ", uri: " << doc.document.uri();
+ ICING_LOG(DBG) << doc.document.DebugString();
+ in_memory_icing_->Put(doc);
+ ASSERT_THAT(icing_->Put(doc.document).status(), ProtoIsOk());
+}
+
+void IcingMonkeyTestRunner::DoDelete() {
+ InMemoryIcingSearchEngine::PickDocumentResult document =
+ in_memory_icing_->RandomPickDocument(/*p_alive=*/0.70, /*p_all=*/0.2,
+ /*p_other=*/0.1);
+ ICING_LOG(INFO) << "Monkey deleting namespace: " << document.name_space
+ << ", uri: " << document.uri;
+ in_memory_icing_->Delete(document.name_space, document.uri);
+ DeleteResultProto delete_result =
+ icing_->Delete(document.name_space, document.uri);
+ if (document.document.has_value()) {
+ ASSERT_THAT(delete_result.status(), ProtoIsOk())
+ << "Cannot delete an existing document.";
+ } else {
+ // Should expect that no document has been deleted.
+ if (delete_result.status().code() != StatusProto::NOT_FOUND) {
+ if (delete_result.status().code() == StatusProto::OK) {
+ FAIL() << "Deleted a non-existing document without an error.";
+ }
+ FAIL() << "Icing search engine failure (code "
+ << delete_result.status().code()
+ << "): " << delete_result.status().message();
+ }
+ }
+}
+
+void IcingMonkeyTestRunner::DoDeleteByNamespace() {
+ std::string name_space = document_generator_->GetNamespace();
+ ICING_LOG(INFO) << "Monkey deleting namespace: " << name_space;
+ DeleteByNamespaceResultProto delete_result =
+ icing_->DeleteByNamespace(name_space);
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t num_docs_deleted,
+ in_memory_icing_->DeleteByNamespace(name_space));
+ if (num_docs_deleted != 0) {
+ ASSERT_THAT(delete_result.status(), ProtoIsOk())
+ << "Cannot delete an existing namespace.";
+ ASSERT_THAT(delete_result.delete_stats().num_documents_deleted(),
+ Eq(num_docs_deleted));
+ } else {
+ // Should expect that no document has been deleted.
+ if (delete_result.status().code() != StatusProto::NOT_FOUND) {
+ if (delete_result.status().code() == StatusProto::OK) {
+ FAIL() << "Deleted a non-existing namespace without an error.";
+ }
+ FAIL() << "Icing search engine failure (code "
+ << delete_result.status().code()
+ << "): " << delete_result.status().message();
+ }
+ }
+}
+
+void IcingMonkeyTestRunner::DoDeleteBySchemaType() {
+ std::string schema_type = document_generator_->GetType().schema_type();
+ ICING_LOG(INFO) << "Monkey deleting type: " << schema_type;
+ DeleteBySchemaTypeResultProto delete_result =
+ icing_->DeleteBySchemaType(schema_type);
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t num_docs_deleted,
+ in_memory_icing_->DeleteBySchemaType(schema_type));
+ if (num_docs_deleted != 0) {
+ ASSERT_THAT(delete_result.status(), ProtoIsOk())
+ << "Cannot delete an existing schema type.";
+ ASSERT_THAT(delete_result.delete_stats().num_documents_deleted(),
+ Eq(num_docs_deleted));
+ } else {
+ // Should expect that no document has been deleted.
+ if (delete_result.status().code() != StatusProto::NOT_FOUND) {
+ if (delete_result.status().code() == StatusProto::OK) {
+ FAIL() << "Deleted a non-existing schema type without an error.";
+ }
+ FAIL() << "Icing search engine failure (code "
+ << delete_result.status().code()
+ << "): " << delete_result.status().message();
+ }
+ }
+}
+
+void IcingMonkeyTestRunner::DoDeleteByQuery() {
+ SearchSpecProto search_spec =
+ GenerateRandomSearchSpecProto(&random_, document_generator_.get());
+ ICING_LOG(INFO) << "Monkey deleting by query: " << search_spec.query();
+ DeleteByQueryResultProto delete_result = icing_->DeleteByQuery(search_spec);
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t num_docs_deleted,
+ in_memory_icing_->DeleteByQuery(search_spec));
+ if (num_docs_deleted != 0) {
+ ASSERT_THAT(delete_result.status(), ProtoIsOk())
+ << "Cannot delete documents that matches with the query.";
+ ASSERT_THAT(delete_result.delete_by_query_stats().num_documents_deleted(),
+ Eq(num_docs_deleted));
+ } else {
+ // Should expect that no document has been deleted.
+ if (delete_result.status().code() != StatusProto::NOT_FOUND) {
+ if (delete_result.status().code() == StatusProto::OK) {
+ FAIL() << "Deleted documents that should not match with the query "
+ "without an error.";
+ }
+ FAIL() << "Icing search engine failure (code "
+ << delete_result.status().code()
+ << "): " << delete_result.status().message();
+ }
+ }
+ ICING_LOG(INFO)
+ << delete_result.delete_by_query_stats().num_documents_deleted()
+ << " documents deleted by query.";
+}
+
+void IcingMonkeyTestRunner::DoSearch() {
+ SearchSpecProto search_spec =
+ GenerateRandomSearchSpecProto(&random_, document_generator_.get());
+ ScoringSpecProto scoring_spec = GenerateRandomScoringSpec(&random_);
+ ResultSpecProto result_spec = GenerateRandomResultSpecProto(&random_);
+ const ResultSpecProto::SnippetSpecProto& snippet_spec =
+ result_spec.snippet_spec();
+
+ ICING_LOG(INFO) << "Monkey searching by query: " << search_spec.query()
+ << ", term_match_type: " << search_spec.term_match_type();
+ ICING_VLOG(1) << "search_spec:\n" << search_spec.DebugString();
+ ICING_VLOG(1) << "scoring_spec:\n" << scoring_spec.DebugString();
+ ICING_VLOG(1) << "result_spec:\n" << result_spec.DebugString();
+
+ std::vector<DocumentProto> exp_documents =
+ in_memory_icing_->Search(search_spec);
+
+ SearchResultProto search_result =
+ icing_->Search(search_spec, scoring_spec, result_spec);
+ ASSERT_THAT(search_result.status(), ProtoIsOk());
+
+ std::vector<DocumentProto> actual_documents;
+ int num_snippeted = 0;
+ while (true) {
+ for (const SearchResultProto::ResultProto& doc : search_result.results()) {
+ actual_documents.push_back(doc.document());
+ if (!doc.snippet().entries().empty()) {
+ ++num_snippeted;
+ for (const SnippetProto::EntryProto& entry : doc.snippet().entries()) {
+ ASSERT_THAT(entry.snippet_matches(),
+ SizeIs(Le(snippet_spec.num_matches_per_property())));
+ }
+ }
+ }
+ if (search_result.next_page_token() == kInvalidNextPageToken) {
+ break;
+ }
+ search_result = icing_->GetNextPage(search_result.next_page_token());
+ ASSERT_THAT(search_result.status(), ProtoIsOk());
+ }
+ if (snippet_spec.num_matches_per_property() > 0) {
+ ASSERT_THAT(num_snippeted,
+ Eq(std::min<uint32_t>(exp_documents.size(),
+ snippet_spec.num_to_snippet())));
+ }
+ SortDocuments(exp_documents);
+ SortDocuments(actual_documents);
+ ASSERT_THAT(actual_documents, SizeIs(exp_documents.size()));
+ for (int i = 0; i < exp_documents.size(); ++i) {
+ ASSERT_THAT(actual_documents[i], EqualsProto(exp_documents[i]));
+ }
+ ICING_LOG(INFO) << exp_documents.size() << " documents found by query.";
+}
+
+void IcingMonkeyTestRunner::ReloadFromDisk() {
+ ICING_LOG(INFO) << "Monkey reloading from disk";
+  // Destroy the icing search engine by resetting the unique pointer.
+ icing_.reset();
+ ASSERT_NO_FATAL_FAILURE(CreateIcingSearchEngine());
+}
+
+void IcingMonkeyTestRunner::DoOptimize() {
+ ICING_LOG(INFO) << "Monkey doing optimization";
+ ASSERT_THAT(icing_->Optimize().status(), ProtoIsOk());
+}
+
+void IcingMonkeyTestRunner::CreateIcingSearchEngine() {
+ IcingSearchEngineOptions icing_options;
+ icing_options.set_index_merge_size(kIndexMergeSize);
+ icing_options.set_base_dir(icing_dir_->dir());
+ icing_ = std::make_unique<IcingSearchEngine>(icing_options);
+ ASSERT_THAT(icing_->Initialize().status(), ProtoIsOk());
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/monkey_test/icing-monkey-test-runner.h b/icing/monkey_test/icing-monkey-test-runner.h
new file mode 100644
index 0000000..5f5649c
--- /dev/null
+++ b/icing/monkey_test/icing-monkey-test-runner.h
@@ -0,0 +1,71 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_MONKEY_TEST_ICING_MONKEY_TEST_RUNNER_H_
+#define ICING_MONKEY_TEST_ICING_MONKEY_TEST_RUNNER_H_
+
+#include <cstdint>
+#include <random>
+
+#include "icing/file/destructible-directory.h"
+#include "icing/icing-search-engine.h"
+#include "icing/monkey_test/in-memory-icing-search-engine.h"
+#include "icing/monkey_test/monkey-test-generators.h"
+
+namespace icing {
+namespace lib {
+
+class IcingMonkeyTestRunner {
+ public:
+ IcingMonkeyTestRunner(uint32_t seed = std::random_device()());
+ IcingMonkeyTestRunner(const IcingMonkeyTestRunner&) = delete;
+ IcingMonkeyTestRunner& operator=(const IcingMonkeyTestRunner&) = delete;
+
+  // Must be called before running the monkey test, and must not be called
+  // again afterwards.
+ void CreateIcingSearchEngineWithSchema();
+
+ // Run the monkey test with num operations.
+ void Run(uint32_t num);
+
+ // APIs supported in icing search engine.
+ void DoGet();
+ void DoGetAllNamespaces();
+ void DoPut();
+ void DoDelete();
+ void DoDeleteByNamespace();
+ void DoDeleteBySchemaType();
+ void DoDeleteByQuery();
+ void DoSearch();
+
+ // Operations with no observable side-effects.
+ void ReloadFromDisk();
+ void DoOptimize();
+
+ private:
+ MonkeyTestRandomEngine random_;
+ Filesystem filesystem_;
+ std::unique_ptr<DestructibleDirectory> icing_dir_;
+ std::unique_ptr<InMemoryIcingSearchEngine> in_memory_icing_;
+ std::unique_ptr<IcingSearchEngine> icing_;
+
+ std::unique_ptr<MonkeyDocumentGenerator> document_generator_;
+
+ void CreateIcingSearchEngine();
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_MONKEY_TEST_ICING_MONKEY_TEST_RUNNER_H_
diff --git a/icing/monkey_test/icing-search-engine_monkey_test.cc b/icing/monkey_test/icing-search-engine_monkey_test.cc
new file mode 100644
index 0000000..ad887b8
--- /dev/null
+++ b/icing/monkey_test/icing-search-engine_monkey_test.cc
@@ -0,0 +1,30 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+#include "icing/monkey_test/icing-monkey-test-runner.h"
+#include "icing/portable/platform.h"
+
+namespace icing {
+namespace lib {
+
+TEST(IcingSearchEngineMonkeyTest, MonkeyTest) {
+ uint32_t num_iterations = IsAndroidArm() ? 1000 : 5000;
+ IcingMonkeyTestRunner runner;
+ ASSERT_NO_FATAL_FAILURE(runner.CreateIcingSearchEngineWithSchema());
+ ASSERT_NO_FATAL_FAILURE(runner.Run(num_iterations));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/monkey_test/in-memory-icing-search-engine.cc b/icing/monkey_test/in-memory-icing-search-engine.cc
index df94c46..405a7b0 100644
--- a/icing/monkey_test/in-memory-icing-search-engine.cc
+++ b/icing/monkey_test/in-memory-icing-search-engine.cc
@@ -15,10 +15,12 @@
#include "icing/monkey_test/in-memory-icing-search-engine.h"
#include <cstdint>
+#include <string_view>
#include <unordered_set>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/util/status-macros.h"
@@ -26,6 +28,48 @@
namespace icing {
namespace lib {
+namespace {
+
+// Check if s1 is a prefix of s2.
+bool IsPrefix(std::string_view s1, std::string_view s2) {
+ if (s1.length() > s2.length()) {
+ return false;
+ }
+ return s1 == s2.substr(0, s1.length());
+}
+
+bool DoesDocumentMatchQuery(const MonkeyTokenizedDocument &document,
+ const std::string &query,
+ TermMatchType::Code term_match_type) {
+ std::vector<std::string_view> strs = absl_ports::StrSplit(query, ":");
+ std::string_view query_term;
+ std::string_view section_restrict;
+ if (strs.size() > 1) {
+ section_restrict = strs[0];
+ query_term = strs[1];
+ } else {
+ query_term = query;
+ }
+ for (const MonkeyTokenizedSection &section : document.tokenized_sections) {
+ if (!section_restrict.empty() && section.path != section_restrict) {
+ continue;
+ }
+ for (const std::string &token : section.token_sequence) {
+ if (section.term_match_type == TermMatchType::EXACT_ONLY ||
+ term_match_type == TermMatchType::EXACT_ONLY) {
+ if (token == query_term) {
+ return true;
+ }
+ } else if (IsPrefix(query_term, token)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+} // namespace
+
InMemoryIcingSearchEngine::PickDocumentResult
InMemoryIcingSearchEngine::RandomPickDocument(float p_alive, float p_all,
float p_other) const {
@@ -108,44 +152,67 @@ libtextclassifier3::Status InMemoryIcingSearchEngine::Delete(
return doc_id_or.status();
}
-libtextclassifier3::Status InMemoryIcingSearchEngine::DeleteByNamespace(
- const std::string &name_space) {
+libtextclassifier3::StatusOr<uint32_t>
+InMemoryIcingSearchEngine::DeleteByNamespace(const std::string &name_space) {
std::vector<DocumentId> doc_ids_to_delete;
for (DocumentId doc_id : existing_doc_ids_) {
if (documents_[doc_id].document.namespace_() == name_space) {
doc_ids_to_delete.push_back(doc_id);
}
}
- if (doc_ids_to_delete.empty()) {
- return absl_ports::NotFoundError(absl_ports::StrCat(
- "Namespace: ", name_space,
- " is not found by InMemoryIcingSearchEngine::DeleteByNamespace."));
- }
for (DocumentId doc_id : doc_ids_to_delete) {
const DocumentProto &document = documents_[doc_id].document;
- ICING_RETURN_IF_ERROR(Delete(document.namespace_(), document.uri()));
+ if (!Delete(document.namespace_(), document.uri()).ok()) {
+ return absl_ports::InternalError(
+ "Should never happen. There are inconsistencies in the in-memory "
+ "Icing.");
+ }
}
- return libtextclassifier3::Status::OK;
+ return doc_ids_to_delete.size();
}
-libtextclassifier3::Status InMemoryIcingSearchEngine::DeleteBySchemaType(
- const std::string &schema_type) {
+libtextclassifier3::StatusOr<uint32_t>
+InMemoryIcingSearchEngine::DeleteBySchemaType(const std::string &schema_type) {
std::vector<DocumentId> doc_ids_to_delete;
for (DocumentId doc_id : existing_doc_ids_) {
if (documents_[doc_id].document.schema() == schema_type) {
doc_ids_to_delete.push_back(doc_id);
}
}
- if (doc_ids_to_delete.empty()) {
- return absl_ports::NotFoundError(absl_ports::StrCat(
- "Type: ", schema_type,
- " is not found by InMemoryIcingSearchEngine::DeleteBySchemaType."));
+ for (DocumentId doc_id : doc_ids_to_delete) {
+ const DocumentProto &document = documents_[doc_id].document;
+ if (!Delete(document.namespace_(), document.uri()).ok()) {
+ return absl_ports::InternalError(
+ "Should never happen. There are inconsistencies in the in-memory "
+ "Icing.");
+ }
}
+ return doc_ids_to_delete.size();
+}
+
+libtextclassifier3::StatusOr<uint32_t> InMemoryIcingSearchEngine::DeleteByQuery(
+ const SearchSpecProto &search_spec) {
+ std::vector<DocumentId> doc_ids_to_delete = InternalSearch(search_spec);
for (DocumentId doc_id : doc_ids_to_delete) {
const DocumentProto &document = documents_[doc_id].document;
- ICING_RETURN_IF_ERROR(Delete(document.namespace_(), document.uri()));
+ if (!Delete(document.namespace_(), document.uri()).ok()) {
+ return absl_ports::InternalError(
+ "Should never happen. There are inconsistencies in the in-memory "
+ "Icing.");
+ }
+ }
+ return doc_ids_to_delete.size();
+}
+
+std::vector<DocumentProto> InMemoryIcingSearchEngine::Search(
+ const SearchSpecProto &search_spec) const {
+ std::vector<DocumentId> matched_doc_ids = InternalSearch(search_spec);
+ std::vector<DocumentProto> result;
+ result.reserve(matched_doc_ids.size());
+ for (DocumentId doc_id : matched_doc_ids) {
+ result.push_back(documents_[doc_id].document);
}
- return libtextclassifier3::Status::OK;
+ return result;
}
libtextclassifier3::StatusOr<DocumentId> InMemoryIcingSearchEngine::InternalGet(
@@ -162,5 +229,17 @@ libtextclassifier3::StatusOr<DocumentId> InMemoryIcingSearchEngine::InternalGet(
" is not found by InMemoryIcingSearchEngine::InternalGet."));
}
+std::vector<DocumentId> InMemoryIcingSearchEngine::InternalSearch(
+ const SearchSpecProto &search_spec) const {
+ std::vector<DocumentId> matched_doc_ids;
+ for (DocumentId doc_id : existing_doc_ids_) {
+ if (DoesDocumentMatchQuery(documents_[doc_id], search_spec.query(),
+ search_spec.term_match_type())) {
+ matched_doc_ids.push_back(doc_id);
+ }
+ }
+ return matched_doc_ids;
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/monkey_test/in-memory-icing-search-engine.h b/icing/monkey_test/in-memory-icing-search-engine.h
index 0c6c03b..a5d8872 100644
--- a/icing/monkey_test/in-memory-icing-search-engine.h
+++ b/icing/monkey_test/in-memory-icing-search-engine.h
@@ -27,6 +27,7 @@
#include "icing/monkey_test/monkey-tokenized-document.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
#include "icing/store/document-id.h"
namespace icing {
@@ -47,6 +48,8 @@ class InMemoryIcingSearchEngine {
: random_(random),
schema_(std::make_unique<SchemaProto>(std::move(schema))) {}
+ uint32_t GetNumAliveDocuments() const { return existing_doc_ids_.size(); }
+
const SchemaProto *GetSchema() const { return schema_.get(); }
// Randomly pick a document from the in-memory Icing for monkey testing.
@@ -81,16 +84,35 @@ class InMemoryIcingSearchEngine {
// Deletes all Documents belonging to the specified namespace.
//
// Returns:
- // OK on success
- // NOT_FOUND if namespace doesn't exist
- libtextclassifier3::Status DeleteByNamespace(const std::string &name_space);
+ // The number of deleted documents on success
+ // INTERNAL_ERROR if there are inconsistencies in the in-memory Icing
+ libtextclassifier3::StatusOr<uint32_t> DeleteByNamespace(
+ const std::string &name_space);
// Deletes all Documents belonging to the specified type
//
// Returns:
- // OK on success
- // NOT_FOUND if schema type doesn't exist
- libtextclassifier3::Status DeleteBySchemaType(const std::string &schema_type);
+ // The number of deleted documents on success
+ // INTERNAL_ERROR if there are inconsistencies in the in-memory Icing
+ libtextclassifier3::StatusOr<uint32_t> DeleteBySchemaType(
+ const std::string &schema_type);
+
+ // Deletes all Documents that match the query specified in search_spec.
+ // Currently, only the "query" and "term_match_type" fields are recognized by
+ // the in-memory Icing, and only single term queries with possible section
+ // restrictions are supported.
+ //
+ // Returns:
+ // The number of deleted documents on success
+ // INTERNAL_ERROR if there are inconsistencies in the in-memory Icing
+ libtextclassifier3::StatusOr<uint32_t> DeleteByQuery(
+ const SearchSpecProto &search_spec);
+
+ // Retrieves documents according to search_spec.
+ // Currently, only the "query" and "term_match_type" fields are recognized by
+ // the in-memory Icing, and only single term queries with possible section
+ // restrictions are supported.
+ std::vector<DocumentProto> Search(const SearchSpecProto &search_spec) const;
private:
// Does not own.
@@ -113,6 +135,11 @@ class InMemoryIcingSearchEngine {
// NOT_FOUND if the key doesn't exist or doc has been deleted
libtextclassifier3::StatusOr<DocumentId> InternalGet(
const std::string &name_space, const std::string &uri) const;
+
+ // A helper method for DeleteByQuery and Search to get matched internal doc
+ // ids.
+ std::vector<DocumentId> InternalSearch(
+ const SearchSpecProto &search_spec) const;
};
} // namespace lib
diff --git a/icing/monkey_test/monkey-test-generators.cc b/icing/monkey_test/monkey-test-generators.cc
index b0fdf10..88fc0b6 100644
--- a/icing/monkey_test/monkey-test-generators.cc
+++ b/icing/monkey_test/monkey-test-generators.cc
@@ -153,8 +153,9 @@ MonkeyTokenizedDocument MonkeyDocumentGenerator::GenerateDocument() {
if (prop.data_type() == PropertyConfigProto::DataType::STRING &&
prop.string_indexing_config().term_match_type() !=
TermMatchType::UNKNOWN) {
- MonkeyTokenizedSection section = {prop.property_name(),
- std::move(prop_content)};
+ MonkeyTokenizedSection section = {
+ prop.property_name(), prop.string_indexing_config().term_match_type(),
+ std::move(prop_content)};
document.tokenized_sections.push_back(std::move(section));
}
}
diff --git a/icing/monkey_test/monkey-test-generators.h b/icing/monkey_test/monkey-test-generators.h
index cc4505f..68c5e92 100644
--- a/icing/monkey_test/monkey-test-generators.h
+++ b/icing/monkey_test/monkey-test-generators.h
@@ -77,7 +77,7 @@ class MonkeyDocumentGenerator {
num_namespaces_(num_namespaces),
num_uris_(num_uris) {}
- SchemaTypeConfigProto GetType() const {
+ const SchemaTypeConfigProto& GetType() const {
std::uniform_int_distribution<> dist(0, schema_->types_size() - 1);
return schema_->types(dist(*random_));
}
diff --git a/icing/monkey_test/monkey-tokenized-document.h b/icing/monkey_test/monkey-tokenized-document.h
index 1d77fc8..a0b38c2 100644
--- a/icing/monkey_test/monkey-tokenized-document.h
+++ b/icing/monkey_test/monkey-tokenized-document.h
@@ -18,12 +18,14 @@
#include <string>
#include "icing/proto/document.pb.h"
+#include "icing/proto/term.pb.h"
namespace icing {
namespace lib {
struct MonkeyTokenizedSection {
std::string path;
+ TermMatchType::Code term_match_type;
std::vector<std::string> token_sequence;
};
diff --git a/icing/query/query-processor.cc b/icing/query/query-processor.cc
index c0a5df1..90587aa 100644
--- a/icing/query/query-processor.cc
+++ b/icing/query/query-processor.cc
@@ -288,16 +288,15 @@ libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseRawQuery(
// section restricts. Those are not currently supported. If they became
// supported, this handling for query terms would need to be altered.
if (!frames.top().saw_exclude) {
- ICING_ASSIGN_OR_RETURN(
- std::unique_ptr<DocHitInfoIterator> term_iterator,
- index_.GetIterator(
- normalized_text, kSectionIdMaskAll,
- search_spec.term_match_type(),
- /*need_hit_term_frequency=*/ranking_strategy ==
- ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE));
-
if (ranking_strategy ==
ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<DocHitInfoIterator> term_iterator,
+ index_.GetIterator(
+ normalized_text, kSectionIdMaskAll,
+ search_spec.term_match_type(),
+ /*need_hit_term_frequency=*/ranking_strategy ==
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE));
results.query_term_iterators[normalized_text] =
std::make_unique<DocHitInfoIteratorFilter>(
std::move(term_iterator), &document_store_, &schema_store_,
diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc
index 459e10e..da35df8 100644
--- a/icing/query/query-processor_test.cc
+++ b/icing/query/query-processor_test.cc
@@ -62,17 +62,6 @@ using ::testing::SizeIs;
using ::testing::Test;
using ::testing::UnorderedElementsAre;
-constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
- PropertyConfigProto::DataType::STRING;
-
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-
class QueryProcessorTest
: public ::testing::TestWithParam<SearchSpecProto::SearchType::Code> {
protected:
@@ -2417,7 +2406,7 @@ TEST_P(QueryProcessorTest, PropertyFilterForOneDocument) {
.AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
// First and only indexed property, so it gets a section_id of 0
@@ -2486,20 +2475,20 @@ TEST_P(QueryProcessorTest, PropertyFilterAcrossSchemaTypes) {
.AddType(SchemaTypeConfigBuilder()
.SetType("email")
// Section "a" would get sectionId 0
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("a")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("foo")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty(
PropertyConfigBuilder()
.SetName("foo")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
@@ -2583,12 +2572,12 @@ TEST_P(QueryProcessorTest, PropertyFilterWithinSchemaType) {
.AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
PropertyConfigBuilder()
.SetName("foo")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty(
PropertyConfigBuilder()
.SetName("foo")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
int email_foo_section_id = 0;
@@ -2691,11 +2680,11 @@ TEST_P(QueryProcessorTest, NestedPropertyFilter) {
.SetType("Bar")
// Add an unindexed property so we generate section
// metadata on it
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("baz")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("baz")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
ICING_ASSERT_OK_AND_ASSIGN(
@@ -2764,12 +2753,12 @@ TEST_P(QueryProcessorTest, PropertyFilterRespectsDifferentSectionIds) {
.AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
PropertyConfigBuilder()
.SetName("foo")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty(
PropertyConfigBuilder()
.SetName("bar")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
int email_foo_section_id = 0;
@@ -2983,12 +2972,12 @@ TEST_P(QueryProcessorTest, PropertyFilterTermAndUnrestrictedTerm) {
.AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
PropertyConfigBuilder()
.SetName("foo")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty(
PropertyConfigBuilder()
.SetName("foo")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
int email_foo_section_id = 0;
diff --git a/icing/result/result-retriever-v2_projection_test.cc b/icing/result/result-retriever-v2_projection_test.cc
index cb0de0b..ec67caa 100644
--- a/icing/result/result-retriever-v2_projection_test.cc
+++ b/icing/result/result-retriever-v2_projection_test.cc
@@ -51,15 +51,6 @@ namespace {
using ::icing::lib::portable_equals_proto::EqualsProto;
using ::testing::SizeIs;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
-
class ResultRetrieverV2ProjectionTest : public testing::Test {
protected:
ResultRetrieverV2ProjectionTest() : test_dir_(GetTestTempDir() + "/icing") {
@@ -90,12 +81,12 @@ class ResultRetrieverV2ProjectionTest : public testing::Test {
.SetType("Email")
.AddProperty(PropertyConfigBuilder()
.SetName("name")
- .SetDataTypeString(MATCH_PREFIX,
+ .SetDataTypeString(TERM_MATCH_PREFIX,
TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_EXACT,
+ .SetDataTypeString(TERM_MATCH_EXACT,
TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(
@@ -107,16 +98,16 @@ class ResultRetrieverV2ProjectionTest : public testing::Test {
.AddType(
SchemaTypeConfigBuilder()
.SetType("Person")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("name")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("emailAddress")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
diff --git a/icing/result/result-retriever-v2_snippet_test.cc b/icing/result/result-retriever-v2_snippet_test.cc
index 0643e9b..9384d6b 100644
--- a/icing/result/result-retriever-v2_snippet_test.cc
+++ b/icing/result/result-retriever-v2_snippet_test.cc
@@ -56,15 +56,6 @@ using ::testing::Eq;
using ::testing::IsEmpty;
using ::testing::SizeIs;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
-
class ResultRetrieverV2SnippetTest : public testing::Test {
protected:
ResultRetrieverV2SnippetTest() : test_dir_(GetTestTempDir() + "/icing") {
@@ -95,12 +86,12 @@ class ResultRetrieverV2SnippetTest : public testing::Test {
.SetType("Email")
.AddProperty(PropertyConfigBuilder()
.SetName("name")
- .SetDataTypeString(MATCH_PREFIX,
+ .SetDataTypeString(TERM_MATCH_PREFIX,
TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_EXACT,
+ .SetDataTypeString(TERM_MATCH_EXACT,
TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(
@@ -112,16 +103,16 @@ class ResultRetrieverV2SnippetTest : public testing::Test {
.AddType(
SchemaTypeConfigBuilder()
.SetType("Person")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("name")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("emailAddress")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
diff --git a/icing/result/result-retriever-v2_test.cc b/icing/result/result-retriever-v2_test.cc
index 5d66be2..0fb2ba0 100644
--- a/icing/result/result-retriever-v2_test.cc
+++ b/icing/result/result-retriever-v2_test.cc
@@ -64,15 +64,6 @@ using ::testing::Return;
using ::testing::SizeIs;
using NamespaceIdMap = std::unordered_map<NamespaceId, int>;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
-
// Mock the behavior of GroupResultLimiter::ShouldBeRemoved.
class MockGroupResultLimiter : public GroupResultLimiterV2 {
public:
@@ -116,12 +107,12 @@ class ResultRetrieverV2Test : public ::testing::Test {
.SetType("Email")
.AddProperty(PropertyConfigBuilder()
.SetName("name")
- .SetDataTypeString(MATCH_PREFIX,
+ .SetDataTypeString(TERM_MATCH_PREFIX,
TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_EXACT,
+ .SetDataTypeString(TERM_MATCH_EXACT,
TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(
@@ -133,16 +124,16 @@ class ResultRetrieverV2Test : public ::testing::Test {
.AddType(
SchemaTypeConfigBuilder()
.SetType("Person")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("name")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("emailAddress")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc
index 1b2b359..e0b4875 100644
--- a/icing/result/result-retriever_test.cc
+++ b/icing/result/result-retriever_test.cc
@@ -55,15 +55,6 @@ using ::testing::IsEmpty;
using ::testing::Return;
using ::testing::SizeIs;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
-
class ResultRetrieverTest : public testing::Test {
protected:
ResultRetrieverTest() : test_dir_(GetTestTempDir() + "/icing") {
@@ -94,12 +85,12 @@ class ResultRetrieverTest : public testing::Test {
.SetType("Email")
.AddProperty(PropertyConfigBuilder()
.SetName("name")
- .SetDataTypeString(MATCH_PREFIX,
+ .SetDataTypeString(TERM_MATCH_PREFIX,
TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_EXACT,
+ .SetDataTypeString(TERM_MATCH_EXACT,
TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(
@@ -111,16 +102,16 @@ class ResultRetrieverTest : public testing::Test {
.AddType(
SchemaTypeConfigBuilder()
.SetType("Person")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("name")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("emailAddress")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
}
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index 604ad3d..8044b8d 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -98,20 +98,6 @@ std::string NormalizeToken(const Normalizer& normalizer, const Token& token) {
[[fallthrough]];
case Token::Type::RFC822_TOKEN:
[[fallthrough]];
- case Token::Type::REGULAR:
- return normalizer.NormalizeTerm(token.text);
- case Token::Type::VERBATIM:
- return std::string(token.text);
- case Token::Type::QUERY_EXCLUSION:
- [[fallthrough]];
- case Token::Type::QUERY_LEFT_PARENTHESES:
- [[fallthrough]];
- case Token::Type::QUERY_RIGHT_PARENTHESES:
- [[fallthrough]];
- case Token::Type::QUERY_OR:
- [[fallthrough]];
- case Token::Type::QUERY_PROPERTY:
- [[fallthrough]];
case Token::Type::URL_SCHEME:
[[fallthrough]];
case Token::Type::URL_USERNAME:
@@ -134,6 +120,20 @@ std::string NormalizeToken(const Normalizer& normalizer, const Token& token) {
[[fallthrough]];
case Token::Type::URL_SUFFIX_INNERMOST:
[[fallthrough]];
+ case Token::Type::REGULAR:
+ return normalizer.NormalizeTerm(token.text);
+ case Token::Type::VERBATIM:
+ return std::string(token.text);
+ case Token::Type::QUERY_EXCLUSION:
+ [[fallthrough]];
+ case Token::Type::QUERY_LEFT_PARENTHESES:
+ [[fallthrough]];
+ case Token::Type::QUERY_RIGHT_PARENTHESES:
+ [[fallthrough]];
+ case Token::Type::QUERY_OR:
+ [[fallthrough]];
+ case Token::Type::QUERY_PROPERTY:
+ [[fallthrough]];
case Token::Type::INVALID:
ICING_LOG(WARNING) << "Unable to normalize token of type: "
<< static_cast<int>(token.type);
@@ -166,6 +166,11 @@ CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token,
[[fallthrough]];
case Token::Type::QUERY_PROPERTY:
[[fallthrough]];
+ case Token::Type::INVALID:
+ ICING_LOG(WARNING)
+ << "Unexpected Token type " << static_cast<int>(token.type)
+ << " found when finding match end of query term and token.";
+ [[fallthrough]];
case Token::Type::RFC822_NAME:
[[fallthrough]];
case Token::Type::RFC822_COMMENT:
@@ -204,11 +209,6 @@ CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token,
[[fallthrough]];
case Token::Type::URL_SUFFIX_INNERMOST:
[[fallthrough]];
- case Token::Type::INVALID:
- ICING_LOG(WARNING)
- << "Unexpected Token type " << static_cast<int>(token.type)
- << " found when finding match end of query term and token.";
- [[fallthrough]];
case Token::Type::REGULAR:
return normalizer.FindNormalizedMatchEndPosition(token.text,
match_query_term);
@@ -336,7 +336,9 @@ libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart(
CharacterIterator IncludeTrailingPunctuation(
std::string_view value, CharacterIterator window_end_exclusive,
int window_end_max_exclusive_utf32) {
- while (window_end_exclusive.utf32_index() < window_end_max_exclusive_utf32) {
+ size_t max_search_index = value.length() - 1;
+ while (window_end_exclusive.utf8_index() <= max_search_index &&
+ window_end_exclusive.utf32_index() < window_end_max_exclusive_utf32) {
int char_len = 0;
if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive.utf8_index(),
&char_len)) {
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index 24f8a0a..0940b51 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -58,20 +58,12 @@ using ::testing::Eq;
using ::testing::IsEmpty;
using ::testing::SizeIs;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
- PropertyConfigProto::Cardinality::REPEATED;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM =
- StringIndexingConfig::TokenizerType::VERBATIM;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_RFC822 =
- StringIndexingConfig::TokenizerType::RFC822;
-
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
+// TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export
+// to Android. Also move it to schema-builder.h
+#ifdef ENABLE_URL_TOKENIZER
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_URL =
+ StringIndexingConfig::TokenizerType::URL;
+#endif // ENABLE_URL_TOKENIZER
std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) {
std::vector<std::string_view> paths;
@@ -110,16 +102,16 @@ class SnippetRetrieverTest : public testing::Test {
.AddType(
SchemaTypeConfigBuilder()
.SetType("email")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
ICING_ASSERT_OK(schema_store_->SetSchema(schema));
@@ -184,7 +176,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) {
// "three". len=4, orig_window= "thre"
snippet_spec_.set_max_window_utf32_length(4);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -210,7 +202,7 @@ TEST_F(SnippetRetrieverTest,
// "three". len=5, orig_window= "three"
snippet_spec_.set_max_window_utf32_length(5);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -236,7 +228,7 @@ TEST_F(SnippetRetrieverTest,
// "four". len=4, orig_window= "four"
snippet_spec_.set_max_window_utf32_length(4);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -268,7 +260,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) {
// 3. trimmed, shifted window [4,18) "two three four"
snippet_spec_.set_max_window_utf32_length(14);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -301,7 +293,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) {
// 3. trimmed, shifted window [4,20) "two three four.."
snippet_spec_.set_max_window_utf32_length(16);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -327,7 +319,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) {
// len=20, orig_window="one two three four.."
snippet_spec_.set_max_window_utf32_length(20);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -355,7 +347,7 @@ TEST_F(SnippetRetrieverTest,
// len=26, orig_window="pside down in Australia¿"
snippet_spec_.set_max_window_utf32_length(24);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -383,7 +375,7 @@ TEST_F(SnippetRetrieverTest,
// len=26, orig_window="upside down in Australia¿ "
snippet_spec_.set_max_window_utf32_length(26);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -416,7 +408,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) {
// 3. trimmed, shifted window [0,22) "one two three four...."
snippet_spec_.set_max_window_utf32_length(22);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -442,7 +434,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) {
// len=26, orig_window="one two three four.... "
snippet_spec_.set_max_window_utf32_length(26);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -475,7 +467,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) {
// 3. trimmed, shifted window [0,27) "one two three four.... five"
snippet_spec_.set_max_window_utf32_length(32);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -501,7 +493,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) {
// len=34, orig_window="one two three four.... five"
snippet_spec_.set_max_window_utf32_length(34);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -527,7 +519,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) {
// len=36, orig_window="one two three four.... five"
snippet_spec_.set_max_window_utf32_length(36);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -561,7 +553,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) {
// 3. trimmed, shifted window [0,27) "one two three four.... five"
snippet_spec_.set_max_window_utf32_length(28);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -595,7 +587,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) {
// 3. trimmed, shifted window [4,31) "two three four.... five six"
snippet_spec_.set_max_window_utf32_length(28);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -629,7 +621,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) {
// 3. trimmed, shifted window [0, 22) "one two three four...."
snippet_spec_.set_max_window_utf32_length(28);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -663,7 +655,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) {
// 3. trimmed, shifted window [0, 22) "one two three four...."
snippet_spec_.set_max_window_utf32_length(28);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -684,7 +676,7 @@ TEST_F(SnippetRetrieverTest, PrefixSnippeting) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"f"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// Check the snippets. 'f' should match prefix-enabled property 'subject', but
// not exact-only property 'body'
@@ -710,7 +702,7 @@ TEST_F(SnippetRetrieverTest, ExactSnippeting) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"f"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
// Check the snippets
EXPECT_THAT(snippet.entries(), IsEmpty());
@@ -730,7 +722,7 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
// Check the snippets
EXPECT_THAT(snippet.entries(), SizeIs(1));
@@ -764,7 +756,7 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// Check the snippets
EXPECT_THAT(snippet.entries(), SizeIs(2));
@@ -822,7 +814,7 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) {
SectionIdMask section_mask = 0b00000001;
SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// Check the snippets
EXPECT_THAT(snippet.entries(), SizeIs(1));
@@ -874,7 +866,7 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
SectionRestrictQueryTermsMap query_terms{{"", {"subject"}},
{"body", {"foo"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// Check the snippets
EXPECT_THAT(snippet.entries(), SizeIs(2));
@@ -933,7 +925,7 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// Check the snippets
EXPECT_THAT(snippet.entries(), SizeIs(2));
@@ -970,7 +962,7 @@ TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"md"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject"));
@@ -993,7 +985,7 @@ TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"zurich"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
@@ -1012,21 +1004,21 @@ TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType("SingleLevelType")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("X")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Y")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Z")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("X")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Y")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Z")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED)))
.Build();
ICING_ASSERT_OK(schema_store_->SetSchema(
schema, /*ignore_errors_and_delete_documents=*/true));
@@ -1057,7 +1049,7 @@ TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) {
SectionIdMask section_mask = 0b00000111;
SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(6));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("X[1]"));
@@ -1082,21 +1074,21 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType("SingleLevelType")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("X")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Y")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Z")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("X")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Y")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Z")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED)))
.AddType(SchemaTypeConfigBuilder()
.SetType("MultiLevelType")
.AddProperty(PropertyConfigBuilder()
@@ -1160,7 +1152,7 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) {
SectionIdMask section_mask = 0b111111111;
SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(18));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("A.X[1]"));
@@ -1188,21 +1180,21 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType("SingleLevelType")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("X")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Y")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Z")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("X")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Y")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Z")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED)))
.AddType(SchemaTypeConfigBuilder()
.SetType("MultiLevelType")
.AddProperty(PropertyConfigBuilder()
@@ -1269,7 +1261,7 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) {
SectionIdMask section_mask = 0b111111111;
SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(36));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("A[0].X[1]"));
@@ -1302,21 +1294,21 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType("SingleLevelType")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("X")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Y")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Z")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("X")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Y")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Z")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(SchemaTypeConfigBuilder()
.SetType("MultiLevelType")
.AddProperty(PropertyConfigBuilder()
@@ -1376,7 +1368,7 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) {
SectionIdMask section_mask = 0b111111111;
SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(12));
EXPECT_THAT(snippet.entries(0).property_name(), Eq("A[0].X"));
@@ -1419,7 +1411,7 @@ TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) {
SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// Ensure that one and only one property was matched and it was "body"
ASSERT_THAT(snippet.entries(), SizeIs(1));
@@ -1480,7 +1472,7 @@ TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
snippet_spec_.set_max_window_utf32_length(6);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// Ensure that one and only one property was matched and it was "body"
ASSERT_THAT(snippet.entries(), SizeIs(1));
@@ -1524,7 +1516,7 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) {
SectionRestrictQueryTermsMap query_terms{{"", {"𐀂"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// Ensure that one and only one property was matched and it was "body"
ASSERT_THAT(snippet.entries(), SizeIs(1));
@@ -1579,7 +1571,7 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
snippet_spec_.set_max_window_utf32_length(6);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// Ensure that one and only one property was matched and it was "body"
ASSERT_THAT(snippet.entries(), SizeIs(1));
@@ -1607,7 +1599,7 @@ TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) {
.SetType("verbatimType")
.AddProperty(PropertyConfigBuilder()
.SetName("verbatim")
- .SetDataTypeString(MATCH_EXACT,
+ .SetDataTypeString(TERM_MATCH_EXACT,
TOKENIZER_VERBATIM)
.SetCardinality(CARDINALITY_REPEATED)))
.Build();
@@ -1629,7 +1621,7 @@ TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) {
snippet_spec_.set_max_window_utf32_length(13);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
// There should only be one snippet entry and match, the verbatim token in its
// entirety.
@@ -1660,7 +1652,7 @@ TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) {
.SetType("verbatimType")
.AddProperty(PropertyConfigBuilder()
.SetName("verbatim")
- .SetDataTypeString(MATCH_PREFIX,
+ .SetDataTypeString(TERM_MATCH_PREFIX,
TOKENIZER_VERBATIM)
.SetCardinality(CARDINALITY_REPEATED)))
.Build();
@@ -1689,7 +1681,7 @@ TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) {
snippet_spec_.set_max_window_utf32_length(9);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// There should only be one snippet entry and match, the verbatim token in its
// entirety.
@@ -1718,7 +1710,7 @@ TEST_F(SnippetRetrieverTest, SnippettingRfc822Ascii) {
.SetType("rfc822Type")
.AddProperty(PropertyConfigBuilder()
.SetName("rfc822")
- .SetDataTypeString(MATCH_PREFIX,
+ .SetDataTypeString(TERM_MATCH_PREFIX,
TOKENIZER_RFC822)
.SetCardinality(CARDINALITY_REPEATED)))
.Build();
@@ -1747,7 +1739,7 @@ TEST_F(SnippetRetrieverTest, SnippettingRfc822Ascii) {
snippet_spec_.set_max_window_utf32_length(35);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
ASSERT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), "rfc822");
@@ -1768,7 +1760,7 @@ TEST_F(SnippetRetrieverTest, SnippettingRfc822Ascii) {
snippet_spec_.set_max_window_utf32_length(36);
snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
ASSERT_THAT(snippet.entries(), SizeIs(1));
EXPECT_THAT(snippet.entries(0).property_name(), "rfc822");
@@ -1793,7 +1785,7 @@ TEST_F(SnippetRetrieverTest, SnippettingRfc822CJK) {
.SetType("rfc822Type")
.AddProperty(PropertyConfigBuilder()
.SetName("rfc822")
- .SetDataTypeString(MATCH_PREFIX,
+ .SetDataTypeString(TERM_MATCH_PREFIX,
TOKENIZER_RFC822)
.SetCardinality(CARDINALITY_REPEATED)))
.Build();
@@ -1819,7 +1811,7 @@ TEST_F(SnippetRetrieverTest, SnippettingRfc822CJK) {
snippet_spec_.set_max_window_utf32_length(8);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// There should only be one snippet entry and match, the local component token
ASSERT_THAT(snippet.entries(), SizeIs(1));
@@ -1839,6 +1831,174 @@ TEST_F(SnippetRetrieverTest, SnippettingRfc822CJK) {
ElementsAre("走", "走"));
}
+#ifdef ENABLE_URL_TOKENIZER
+TEST_F(SnippetRetrieverTest, SnippettingUrlAscii) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("urlType")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("url")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_URL)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/true));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ snippet_retriever_,
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "url/1")
+ .SetSchema("urlType")
+ .AddStringProperty("url", "https://mail.google.com/calendar/google/")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000001;
+
+ // Query with single url split-token match
+ SectionRestrictQueryTermsMap query_terms{{"", {"com"}}};
+ // 40 is the length of the url.
+ // Window that is the size of the url should return entire url.
+ snippet_spec_.set_max_window_utf32_length(40);
+
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "url");
+
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("https://mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("com"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("com"));
+
+ // Query with single url suffix-token match
+ query_terms = SectionRestrictQueryTermsMap{{"", {"mail.goo"}}};
+ snippet_spec_.set_max_window_utf32_length(40);
+
+ snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "url");
+
+ content = GetString(&document, snippet.entries(0).property_name());
+
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("https://mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("mail.goo"));
+
+ // Query with multiple url split-token matches
+ query_terms = SectionRestrictQueryTermsMap{{"", {"goog"}}};
+ snippet_spec_.set_max_window_utf32_length(40);
+
+ snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "url");
+
+ content = GetString(&document, snippet.entries(0).property_name());
+
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("https://mail.google.com/calendar/google/",
+ "https://mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("google", "google"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("goog", "goog"));
+
+ // Query with both url split-token and suffix-token matches
+ query_terms = SectionRestrictQueryTermsMap{{"", {"mail"}}};
+ snippet_spec_.set_max_window_utf32_length(40);
+
+ snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "url");
+
+ content = GetString(&document, snippet.entries(0).property_name());
+
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("https://mail.google.com/calendar/google/",
+ "https://mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("mail", "mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("mail", "mail"));
+
+ // Prefix query with both url split-token and suffix-token matches
+ query_terms = SectionRestrictQueryTermsMap{{"", {"http"}}};
+ snippet_spec_.set_max_window_utf32_length(40);
+
+ snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "url");
+
+ content = GetString(&document, snippet.entries(0).property_name());
+
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("https://mail.google.com/calendar/google/",
+ "https://mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("https", "https://mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("http", "http"));
+
+ // Window that's smaller than the input size should not return any matches.
+ query_terms = SectionRestrictQueryTermsMap{{"", {"google"}}};
+ snippet_spec_.set_max_window_utf32_length(10);
+
+ snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(0));
+
+ // Test case with more than two matches
+ document =
+ DocumentBuilder()
+ .SetKey("icing", "url/1")
+ .SetSchema("urlType")
+ .AddStringProperty("url", "https://www.google.com/calendar/google/")
+ .Build();
+
+ // Prefix query with both url split-token and suffix-token matches
+ query_terms = SectionRestrictQueryTermsMap{{"", {"google"}}};
+ snippet_spec_.set_max_window_utf32_length(39);
+
+ snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "url");
+
+ content = GetString(&document, snippet.entries(0).property_name());
+
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("https://www.google.com/calendar/google/",
+ "https://www.google.com/calendar/google/",
+ "https://www.google.com/calendar/google/"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("google", "google", "google.com/calendar/google/"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("google", "google", "google"));
+}
+#endif // ENABLE_URL_TOKENIZER
+
} // namespace
} // namespace lib
diff --git a/icing/schema-builder.h b/icing/schema-builder.h
index 3bc4527..ea0a774 100644
--- a/icing/schema-builder.h
+++ b/icing/schema-builder.h
@@ -27,6 +27,48 @@
namespace icing {
namespace lib {
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_UNKNOWN =
+ PropertyConfigProto::Cardinality::UNKNOWN;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
+ PropertyConfigProto::Cardinality::REPEATED;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
+ PropertyConfigProto::Cardinality::REQUIRED;
+
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE =
+ StringIndexingConfig::TokenizerType::NONE;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM =
+ StringIndexingConfig::TokenizerType::VERBATIM;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_RFC822 =
+ StringIndexingConfig::TokenizerType::RFC822;
+
+constexpr TermMatchType::Code TERM_MATCH_UNKNOWN = TermMatchType::UNKNOWN;
+constexpr TermMatchType::Code TERM_MATCH_EXACT = TermMatchType::EXACT_ONLY;
+constexpr TermMatchType::Code TERM_MATCH_PREFIX = TermMatchType::PREFIX;
+
+constexpr IntegerIndexingConfig::NumericMatchType::Code NUMERIC_MATCH_UNKNOWN =
+ IntegerIndexingConfig::NumericMatchType::UNKNOWN;
+constexpr IntegerIndexingConfig::NumericMatchType::Code NUMERIC_MATCH_RANGE =
+ IntegerIndexingConfig::NumericMatchType::RANGE;
+
+constexpr PropertyConfigProto::DataType::Code TYPE_UNKNOWN =
+ PropertyConfigProto::DataType::UNKNOWN;
+constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
+ PropertyConfigProto::DataType::STRING;
+constexpr PropertyConfigProto::DataType::Code TYPE_INT64 =
+ PropertyConfigProto::DataType::INT64;
+constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE =
+ PropertyConfigProto::DataType::DOUBLE;
+constexpr PropertyConfigProto::DataType::Code TYPE_BOOLEAN =
+ PropertyConfigProto::DataType::BOOLEAN;
+constexpr PropertyConfigProto::DataType::Code TYPE_BYTES =
+ PropertyConfigProto::DataType::BYTES;
+constexpr PropertyConfigProto::DataType::Code TYPE_DOCUMENT =
+ PropertyConfigProto::DataType::DOCUMENT;
+
class PropertyConfigBuilder {
public:
PropertyConfigBuilder() = default;
@@ -53,6 +95,14 @@ class PropertyConfigBuilder {
return *this;
}
+ PropertyConfigBuilder& SetDataTypeInt64(
+ IntegerIndexingConfig::NumericMatchType::Code numeric_match_type) {
+ property_.set_data_type(PropertyConfigProto::DataType::INT64);
+ property_.mutable_integer_indexing_config()->set_numeric_match_type(
+ numeric_match_type);
+ return *this;
+ }
+
PropertyConfigBuilder& SetDataTypeDocument(std::string_view schema_type,
bool index_nested_properties) {
property_.set_data_type(PropertyConfigProto::DataType::DOCUMENT);
diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc
index b1a2fe5..5f4baa8 100644
--- a/icing/schema/schema-store.cc
+++ b/icing/schema/schema-store.cc
@@ -516,14 +516,16 @@ libtextclassifier3::StatusOr<std::vector<std::string_view>>
SchemaStore::GetStringSectionContent(const DocumentProto& document,
std::string_view section_path) const {
ICING_RETURN_IF_ERROR(CheckSchemaSet());
- return section_manager_->GetStringSectionContent(document, section_path);
+ return section_manager_->GetSectionContent<std::string_view>(document,
+ section_path);
}
libtextclassifier3::StatusOr<std::vector<std::string_view>>
SchemaStore::GetStringSectionContent(const DocumentProto& document,
SectionId section_id) const {
ICING_RETURN_IF_ERROR(CheckSchemaSet());
- return section_manager_->GetStringSectionContent(document, section_id);
+ return section_manager_->GetSectionContent<std::string_view>(document,
+ section_id);
}
libtextclassifier3::StatusOr<const SectionMetadata*>
@@ -533,7 +535,7 @@ SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id,
return section_manager_->GetSectionMetadata(schema_type_id, section_id);
}
-libtextclassifier3::StatusOr<std::vector<Section>> SchemaStore::ExtractSections(
+libtextclassifier3::StatusOr<SectionGroup> SchemaStore::ExtractSections(
const DocumentProto& document) const {
ICING_RETURN_IF_ERROR(CheckSchemaSet());
return section_manager_->ExtractSections(document);
diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h
index 82f4ffa..d5a7c6f 100644
--- a/icing/schema/schema-store.h
+++ b/icing/schema/schema-store.h
@@ -221,15 +221,21 @@ class SchemaStore {
libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata(
SchemaTypeId schema_type_id, SectionId section_id) const;
- // Extracts all sections from the given document, sections are sorted by
- // section id in increasing order. Section ids start from 0. Sections with
- // empty content won't be returned.
+ // Extracts all sections of different types from the given document and groups
+ // them by type.
+ // - Each Section vector is sorted by section Id in ascending order. The
+ // sorted section Ids may not be continuous, since not all section Ids are
+ // present in the document.
+ // - Sections with empty content won't be returned.
+ // - For example, we may extract:
+ // string_sections: [2, 7, 10]
+ // integer_sections: [3, 5, 8]
//
// Returns:
- // A list of sections on success
+ // A SectionGroup instance on success
// FAILED_PRECONDITION if schema hasn't been set yet
// NOT_FOUND if type config name of document not found
- libtextclassifier3::StatusOr<std::vector<Section>> ExtractSections(
+ libtextclassifier3::StatusOr<SectionGroup> ExtractSections(
const DocumentProto& document) const;
// Syncs all the data changes to disk.
diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc
index aa05151..da04931 100644
--- a/icing/schema/schema-store_test.cc
+++ b/icing/schema/schema-store_test.cc
@@ -57,21 +57,7 @@ using ::testing::Pointee;
using ::testing::Return;
using ::testing::SizeIs;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
- PropertyConfigProto::Cardinality::REPEATED;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
-
-constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
- PropertyConfigProto::DataType::STRING;
-constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE =
- PropertyConfigProto::DataType::DOUBLE;
+constexpr int64_t kDefaultTimestamp = 12345678;
class SchemaStoreTest : public ::testing::Test {
protected:
@@ -80,15 +66,23 @@ class SchemaStoreTest : public ::testing::Test {
schema_store_dir_ = test_dir_ + "/schema_store";
filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
- schema_ =
- SchemaBuilder()
- .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
- // Add an indexed property so we generate section metadata on it
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
- .Build();
+ schema_ = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(
+ // Add an indexed property so we generate
+ // section metadata on it
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
}
void TearDown() override {
@@ -123,7 +117,7 @@ TEST_F(SchemaStoreTest, SchemaStoreMoveConstructible) {
.AddType(SchemaTypeConfigBuilder().SetType("TypeA").AddProperty(
PropertyConfigBuilder()
.SetName("prop1")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
@@ -141,7 +135,8 @@ TEST_F(SchemaStoreTest, SchemaStoreMoveConstructible) {
IsOkAndHolds(Pointee(EqualsProto(schema))));
EXPECT_THAT(move_constructed_schema_store.ComputeChecksum(),
IsOkAndHolds(Eq(expected_checksum)));
- SectionMetadata expected_metadata(/*id_in=*/0, MATCH_EXACT, TOKENIZER_PLAIN,
+ SectionMetadata expected_metadata(/*id_in=*/0, TYPE_STRING, TOKENIZER_PLAIN,
+ TERM_MATCH_EXACT, NUMERIC_MATCH_UNKNOWN,
"prop1");
EXPECT_THAT(move_constructed_schema_store.GetSectionMetadata("TypeA"),
IsOkAndHolds(Pointee(ElementsAre(expected_metadata))));
@@ -154,7 +149,7 @@ TEST_F(SchemaStoreTest, SchemaStoreMoveAssignment) {
.AddType(SchemaTypeConfigBuilder().SetType("TypeA").AddProperty(
PropertyConfigBuilder()
.SetName("prop1")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
@@ -172,7 +167,7 @@ TEST_F(SchemaStoreTest, SchemaStoreMoveAssignment) {
.AddType(SchemaTypeConfigBuilder().SetType("TypeB").AddProperty(
PropertyConfigBuilder()
.SetName("prop2")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
@@ -187,7 +182,8 @@ TEST_F(SchemaStoreTest, SchemaStoreMoveAssignment) {
IsOkAndHolds(Pointee(EqualsProto(schema1))));
EXPECT_THAT(move_assigned_schema_store->ComputeChecksum(),
IsOkAndHolds(Eq(expected_checksum)));
- SectionMetadata expected_metadata(/*id_in=*/0, MATCH_EXACT, TOKENIZER_PLAIN,
+ SectionMetadata expected_metadata(/*id_in=*/0, TYPE_STRING, TOKENIZER_PLAIN,
+ TERM_MATCH_EXACT, NUMERIC_MATCH_UNKNOWN,
"prop1");
EXPECT_THAT(move_assigned_schema_store->GetSectionMetadata("TypeA"),
IsOkAndHolds(Pointee(ElementsAre(expected_metadata))));
@@ -363,9 +359,12 @@ TEST_F(SchemaStoreTest, CreateWithPreviousSchemaOk) {
TEST_F(SchemaStoreTest, MultipleCreateOk) {
DocumentProto document;
document.set_schema("email");
- auto properties = document.add_properties();
- properties->set_name("subject");
- properties->add_string_values("subject_content");
+ auto subject_property = document.add_properties();
+ subject_property->set_name("subject");
+ subject_property->add_string_values("subject_content");
+ auto timestamp_property = document.add_properties();
+ timestamp_property->set_name("timestamp");
+ timestamp_property->add_int64_values(kDefaultTimestamp);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
@@ -380,9 +379,12 @@ TEST_F(SchemaStoreTest, MultipleCreateOk) {
// Verify that our in-memory structures are ok
EXPECT_THAT(schema_store->GetSchemaTypeConfig("email"),
IsOkAndHolds(Pointee(EqualsProto(schema_.types(0)))));
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections,
+ ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group,
schema_store->ExtractSections(document));
- EXPECT_THAT(sections[0].content, ElementsAre("subject_content"));
+ EXPECT_THAT(section_group.string_sections[0].content,
+ ElementsAre("subject_content"));
+ EXPECT_THAT(section_group.integer_sections[0].content,
+ ElementsAre(kDefaultTimestamp));
// Verify that our persisted data is ok
EXPECT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
@@ -396,8 +398,12 @@ TEST_F(SchemaStoreTest, MultipleCreateOk) {
EXPECT_THAT(schema_store->GetSchemaTypeConfig("email"),
IsOkAndHolds(Pointee(EqualsProto(schema_.types(0)))));
- ICING_ASSERT_OK_AND_ASSIGN(sections, schema_store->ExtractSections(document));
- EXPECT_THAT(sections[0].content, ElementsAre("subject_content"));
+ ICING_ASSERT_OK_AND_ASSIGN(section_group,
+ schema_store->ExtractSections(document));
+ EXPECT_THAT(section_group.string_sections[0].content,
+ ElementsAre("subject_content"));
+ EXPECT_THAT(section_group.integer_sections[0].content,
+ ElementsAre(kDefaultTimestamp));
// Verify that our persisted data is ok
EXPECT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
@@ -635,7 +641,7 @@ TEST_F(SchemaStoreTest, IndexedPropertyChangeRequiresReindexingOk) {
.AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
@@ -661,7 +667,7 @@ TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) {
.SetType("email")
.AddProperty(PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.Build();
SchemaProto no_nested_index_schema =
@@ -789,10 +795,11 @@ TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleNestedTypesOk) {
SchemaTypeConfigBuilder contact_point_repeated_label =
SchemaTypeConfigBuilder()
.SetType("ContactPoint")
- .AddProperty(PropertyConfigBuilder()
- .SetName("label")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED));
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED));
SchemaProto old_schema =
SchemaBuilder().AddType(contact_point_repeated_label).Build();
ICING_EXPECT_OK(schema_store->SetSchema(old_schema));
@@ -804,10 +811,11 @@ TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleNestedTypesOk) {
SchemaTypeConfigBuilder contact_point_optional_label =
SchemaTypeConfigBuilder()
.SetType("ContactPoint")
- .AddProperty(PropertyConfigBuilder()
- .SetName("label")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL));
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL));
SchemaTypeConfigBuilder person =
SchemaTypeConfigBuilder().SetType("Person").AddProperty(
PropertyConfigBuilder()
@@ -857,10 +865,11 @@ TEST_F(SchemaStoreTest, SetSchemaWithIndexIncompatibleNestedTypesOk) {
SchemaTypeConfigBuilder contact_point_prefix_label =
SchemaTypeConfigBuilder()
.SetType("ContactPoint")
- .AddProperty(PropertyConfigBuilder()
- .SetName("label")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED));
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED));
SchemaProto old_schema =
SchemaBuilder().AddType(contact_point_prefix_label).Build();
ICING_EXPECT_OK(schema_store->SetSchema(old_schema));
@@ -872,7 +881,7 @@ TEST_F(SchemaStoreTest, SetSchemaWithIndexIncompatibleNestedTypesOk) {
.SetType("ContactPoint")
.AddProperty(PropertyConfigBuilder()
.SetName("label")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REPEATED));
SchemaTypeConfigBuilder person =
SchemaTypeConfigBuilder().SetType("Person").AddProperty(
@@ -911,10 +920,11 @@ TEST_F(SchemaStoreTest, SetSchemaWithCompatibleNestedTypesOk) {
SchemaTypeConfigBuilder contact_point_optional_label =
SchemaTypeConfigBuilder()
.SetType("ContactPoint")
- .AddProperty(PropertyConfigBuilder()
- .SetName("label")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL));
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL));
SchemaProto old_schema =
SchemaBuilder().AddType(contact_point_optional_label).Build();
ICING_EXPECT_OK(schema_store->SetSchema(old_schema));
@@ -924,10 +934,11 @@ TEST_F(SchemaStoreTest, SetSchemaWithCompatibleNestedTypesOk) {
SchemaTypeConfigBuilder contact_point_repeated_label =
SchemaTypeConfigBuilder()
.SetType("ContactPoint")
- .AddProperty(PropertyConfigBuilder()
- .SetName("label")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REPEATED));
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED));
SchemaTypeConfigBuilder person =
SchemaTypeConfigBuilder().SetType("Person").AddProperty(
PropertyConfigBuilder()
@@ -1106,7 +1117,7 @@ TEST_F(SchemaStoreTest, SchemaStoreStorageInfoProto) {
PropertyConfigProto prop =
PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)
.Build();
SchemaTypeConfigBuilder full_sections_type_builder =
@@ -1201,8 +1212,12 @@ TEST_F(SchemaStoreTest, SetSchemaRegenerateDerivedFilesFailure) {
SchemaTypeConfigBuilder()
.SetType("Type")
.AddProperty(PropertyConfigBuilder()
- .SetName("prop1")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetName("intProp1")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("stringProp1")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.Build();
{
@@ -1230,17 +1245,30 @@ TEST_F(SchemaStoreTest, SetSchemaRegenerateDerivedFilesFailure) {
.Build();
EXPECT_THAT(schema_store->SetSchema(std::move(schema)),
StatusIs(libtextclassifier3::StatusCode::INTERNAL));
- DocumentProto document = DocumentBuilder()
- .SetSchema("Type")
- .AddStringProperty("prop1", "foo bar baz")
- .Build();
- SectionMetadata expected_metadata(/*id_in=*/0, MATCH_EXACT, TOKENIZER_PLAIN,
- "prop1");
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections,
+ DocumentProto document =
+ DocumentBuilder()
+ .SetSchema("Type")
+ .AddInt64Property("intProp1", 1, 2, 3)
+ .AddStringProperty("stringProp1", "foo bar baz")
+ .Build();
+ SectionMetadata expected_int_prop1_metadata(
+ /*id_in=*/0, TYPE_INT64, TOKENIZER_NONE, TERM_MATCH_UNKNOWN,
+ NUMERIC_MATCH_RANGE, "intProp1");
+ SectionMetadata expected_string_prop1_metadata(
+ /*id_in=*/1, TYPE_STRING, TOKENIZER_PLAIN, TERM_MATCH_EXACT,
+ NUMERIC_MATCH_UNKNOWN, "stringProp1");
+ ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group,
schema_store->ExtractSections(document));
- ASSERT_THAT(sections, SizeIs(1));
- EXPECT_THAT(sections.at(0).metadata, Eq(expected_metadata));
- EXPECT_THAT(sections.at(0).content, ElementsAre("foo bar baz"));
+ ASSERT_THAT(section_group.string_sections, SizeIs(1));
+ EXPECT_THAT(section_group.string_sections.at(0).metadata,
+ Eq(expected_string_prop1_metadata));
+ EXPECT_THAT(section_group.string_sections.at(0).content,
+ ElementsAre("foo bar baz"));
+ ASSERT_THAT(section_group.integer_sections, SizeIs(1));
+ EXPECT_THAT(section_group.integer_sections.at(0).metadata,
+ Eq(expected_int_prop1_metadata));
+ EXPECT_THAT(section_group.integer_sections.at(0).content,
+ ElementsAre(1, 2, 3));
}
}
diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc
index f28a2f8..ffe1036 100644
--- a/icing/schema/schema-util_test.cc
+++ b/icing/schema/schema-util_test.cc
@@ -38,33 +38,6 @@ constexpr char kEmailType[] = "EmailMessage";
constexpr char kMessageType[] = "Text";
constexpr char kPersonType[] = "Person";
-constexpr PropertyConfigProto::DataType::Code TYPE_DOCUMENT =
- PropertyConfigProto::DataType::DOCUMENT;
-constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
- PropertyConfigProto::DataType::STRING;
-constexpr PropertyConfigProto::DataType::Code TYPE_INT =
- PropertyConfigProto::DataType::INT64;
-constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE =
- PropertyConfigProto::DataType::DOUBLE;
-
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_UNKNOWN =
- PropertyConfigProto::Cardinality::UNKNOWN;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
- PropertyConfigProto::Cardinality::REQUIRED;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
- PropertyConfigProto::Cardinality::REPEATED;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE =
- StringIndexingConfig::TokenizerType::NONE;
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-
-constexpr TermMatchType::Code MATCH_UNKNOWN = TermMatchType::UNKNOWN;
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
-
TEST(SchemaUtilTest, DependencyGraphAlphabeticalOrder) {
// Create a schema with the following dependencies:
// C
@@ -125,10 +98,11 @@ TEST(SchemaUtilTest, DependencyGraphAlphabeticalOrder) {
SchemaTypeConfigProto type_f =
SchemaTypeConfigBuilder()
.SetType("F")
- .AddProperty(PropertyConfigBuilder()
- .SetName("text")
- .SetCardinality(CARDINALITY_OPTIONAL)
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
.Build();
// Provide these in alphabetical (also parent-child) order: A, B, C, D, E, F
@@ -211,10 +185,11 @@ TEST(SchemaUtilTest, DependencyGraphReverseAlphabeticalOrder) {
SchemaTypeConfigProto type_f =
SchemaTypeConfigBuilder()
.SetType("F")
- .AddProperty(PropertyConfigBuilder()
- .SetName("text")
- .SetCardinality(CARDINALITY_OPTIONAL)
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
.Build();
// Provide these in reverse alphabetical (also child-parent) order:
@@ -298,10 +273,11 @@ TEST(SchemaUtilTest, DependencyGraphMixedOrder) {
SchemaTypeConfigProto type_f =
SchemaTypeConfigBuilder()
.SetType("F")
- .AddProperty(PropertyConfigBuilder()
- .SetName("text")
- .SetCardinality(CARDINALITY_OPTIONAL)
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
.Build();
// Provide these in a random order: C, E, F, A, B, D
@@ -760,7 +736,7 @@ TEST(SchemaUtilTest, NewSchemaMissingPropertyIsIncompatible) {
.SetCardinality(CARDINALITY_REQUIRED))
.AddProperty(PropertyConfigBuilder()
.SetName("OldOptional")
- .SetDataType(TYPE_INT)
+ .SetDataType(TYPE_INT64)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
@@ -792,7 +768,7 @@ TEST(SchemaUtilTest, CompatibilityOfDifferentCardinalityOk) {
.SetType(kEmailType)
.AddProperty(PropertyConfigBuilder()
.SetName("Property")
- .SetDataType(TYPE_INT)
+ .SetDataType(TYPE_INT64)
.SetCardinality(CARDINALITY_REPEATED)))
.Build();
@@ -803,7 +779,7 @@ TEST(SchemaUtilTest, CompatibilityOfDifferentCardinalityOk) {
.SetType(kEmailType)
.AddProperty(PropertyConfigBuilder()
.SetName("Property")
- .SetDataType(TYPE_INT)
+ .SetDataType(TYPE_INT64)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
@@ -834,7 +810,7 @@ TEST(SchemaUtilTest, DifferentDataTypeIsIncompatible) {
.SetType(kEmailType)
.AddProperty(PropertyConfigBuilder()
.SetName("Property")
- .SetDataType(TYPE_INT)
+ .SetDataType(TYPE_INT64)
.SetCardinality(CARDINALITY_REPEATED)))
.Build();
@@ -865,13 +841,13 @@ TEST(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) {
.SetType(kPersonType)
.AddProperty(PropertyConfigBuilder()
.SetName("prop")
- .SetDataType(TYPE_INT)
+ .SetDataType(TYPE_INT64)
.SetCardinality(CARDINALITY_REPEATED)))
.AddType(SchemaTypeConfigBuilder()
.SetType(kMessageType)
.AddProperty(PropertyConfigBuilder()
.SetName("prop")
- .SetDataType(TYPE_INT)
+ .SetDataType(TYPE_INT64)
.SetCardinality(CARDINALITY_REPEATED)))
.AddType(SchemaTypeConfigBuilder()
.SetType(kEmailType)
@@ -890,13 +866,13 @@ TEST(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) {
.SetType(kPersonType)
.AddProperty(PropertyConfigBuilder()
.SetName("prop")
- .SetDataType(TYPE_INT)
+ .SetDataType(TYPE_INT64)
.SetCardinality(CARDINALITY_REPEATED)))
.AddType(SchemaTypeConfigBuilder()
.SetType(kMessageType)
.AddProperty(PropertyConfigBuilder()
.SetName("prop")
- .SetDataType(TYPE_INT)
+ .SetDataType(TYPE_INT64)
.SetCardinality(CARDINALITY_REPEATED)))
.AddType(SchemaTypeConfigBuilder()
.SetType(kEmailType)
@@ -926,11 +902,11 @@ TEST(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType(kPersonType)
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Property")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
// Configure new schema
@@ -938,11 +914,11 @@ TEST(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType(kPersonType)
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Property")
- .SetDataTypeString(MATCH_UNKNOWN, TOKENIZER_NONE)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN,
+ TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
SchemaUtil::SchemaDelta schema_delta;
@@ -968,11 +944,11 @@ TEST(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType(kPersonType)
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Property")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
// Configure new schema
@@ -980,16 +956,16 @@ TEST(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType(kPersonType)
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Property")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("NewIndexedProperty")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewIndexedProperty")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
SchemaUtil::SchemaDelta schema_delta;
@@ -1007,29 +983,29 @@ TEST(SchemaUtilTest, AddingTypeIsCompatible) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType(kPersonType)
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Property")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
SchemaProto new_schema =
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType(kPersonType)
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Property")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(SchemaTypeConfigBuilder()
.SetType(kEmailType)
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Property")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
SchemaUtil::SchemaDelta schema_delta;
@@ -1048,29 +1024,29 @@ TEST(SchemaUtilTest, DeletingTypeIsNoted) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType(kPersonType)
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Property")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(SchemaTypeConfigBuilder()
.SetType(kEmailType)
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Property")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
SchemaProto new_schema =
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType(kEmailType)
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Property")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
SchemaUtil::SchemaDelta schema_delta;
@@ -1090,11 +1066,11 @@ TEST(SchemaUtilTest, DeletingPropertyAndChangingProperty) {
.SetName("Property1")
.SetDataType(TYPE_STRING)
.SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Property2")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property2")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
.Build();
// Remove Property2 and make Property1 indexed now. Removing Property2 should
@@ -1103,11 +1079,11 @@ TEST(SchemaUtilTest, DeletingPropertyAndChangingProperty) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType(kEmailType)
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("Property1")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property1")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
SchemaUtil::SchemaDelta schema_delta;
@@ -1127,7 +1103,7 @@ TEST(SchemaUtilTest, IndexNestedDocumentsIndexIncompatible) {
.SetType(kEmailType)
.AddProperty(PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.Build();
SchemaProto no_nested_index_schema =
@@ -1180,7 +1156,7 @@ TEST(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTermMatchType) {
.AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
PropertyConfigBuilder()
.SetName("Foo")
- .SetDataTypeString(MATCH_UNKNOWN, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REQUIRED)))
.Build();
@@ -1193,7 +1169,7 @@ TEST(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTermMatchType) {
.AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
PropertyConfigBuilder()
.SetName("Foo")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REQUIRED)))
.Build();
EXPECT_THAT(SchemaUtil::Validate(schema), IsOk());
@@ -1205,7 +1181,7 @@ TEST(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTokenizer) {
.AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
PropertyConfigBuilder()
.SetName("Foo")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_NONE)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_NONE)
.SetCardinality(CARDINALITY_REQUIRED)))
.Build();
@@ -1218,7 +1194,7 @@ TEST(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTokenizer) {
.AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
PropertyConfigBuilder()
.SetName("Foo")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_REQUIRED)))
.Build();
EXPECT_THAT(SchemaUtil::Validate(schema), IsOk());
@@ -1278,11 +1254,11 @@ TEST(SchemaUtilTest, InvalidSelfReferenceEvenWithOtherProperties) {
"OwnSchema",
/*index_nested_properties=*/true)
.SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("SomeString")
- .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("SomeString")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
EXPECT_THAT(SchemaUtil::Validate(schema),
diff --git a/icing/schema/section-manager.cc b/icing/schema/section-manager.cc
index a0893e6..2ca534e 100644
--- a/icing/schema/section-manager.cc
+++ b/icing/schema/section-manager.cc
@@ -57,6 +57,32 @@ std::string ConcatenatePath(const std::string& path,
return absl_ports::StrCat(path, kPropertySeparator, next_property_name);
}
+// Helper function to append a new section metadata
+libtextclassifier3::Status AppendNewSectionMetadata(
+ std::vector<SectionMetadata>* metadata_list,
+ std::string&& concatenated_path,
+ PropertyConfigProto::DataType::Code data_type,
+ StringIndexingConfig::TokenizerType::Code string_tokenizer_type,
+ TermMatchType::Code term_match_type,
+ IntegerIndexingConfig::NumericMatchType::Code numeric_match_type) {
+ // Validates next section id, makes sure that section id is the same as the
+ // list index so that we could find any section metadata by id in O(1) later.
+ SectionId new_section_id = static_cast<SectionId>(metadata_list->size());
+ if (!IsSectionIdValid(new_section_id)) {
+ // Max number of sections reached
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Too many properties to be indexed, max number of properties "
+ "allowed: %d",
+ kMaxSectionId - kMinSectionId + 1));
+ }
+
+ // Creates section metadata
+ metadata_list->push_back(SectionMetadata(
+ new_section_id, data_type, string_tokenizer_type, term_match_type,
+ numeric_match_type, std::move(concatenated_path)));
+ return libtextclassifier3::Status::OK;
+}
+
libtextclassifier3::Status AssignSections(
const SchemaTypeConfigProto& current_type_config,
const std::string& current_section_path,
@@ -70,58 +96,63 @@ libtextclassifier3::Status AssignSections(
return p1->property_name() < p2->property_name();
});
for (const auto& property_config : sorted_properties) {
- if (property_config.data_type() ==
- PropertyConfigProto::DataType::DOCUMENT) {
- auto nested_type_config_iter =
- type_config_map.find(property_config.schema_type());
- if (nested_type_config_iter == type_config_map.end()) {
- // This should never happen because our schema should already be
- // validated by this point.
- return absl_ports::NotFoundError(absl_ports::StrCat(
- "Type config not found: ", property_config.schema_type()));
- }
+ // Creates section metadata according to data type
+ switch (property_config.data_type()) {
+ case PropertyConfigProto::DataType::DOCUMENT: {
+ auto nested_type_config_iter =
+ type_config_map.find(property_config.schema_type());
+ if (nested_type_config_iter == type_config_map.end()) {
+ // This should never happen because our schema should already be
+ // validated by this point.
+ return absl_ports::NotFoundError(absl_ports::StrCat(
+ "Type config not found: ", property_config.schema_type()));
+ }
- if (property_config.document_indexing_config()
- .index_nested_properties()) {
- // Assign any indexed sections recursively
- const SchemaTypeConfigProto& nested_type_config =
- nested_type_config_iter->second;
- ICING_RETURN_IF_ERROR(
- AssignSections(nested_type_config,
- ConcatenatePath(current_section_path,
- property_config.property_name()),
- type_config_map, metadata_list));
+ if (property_config.document_indexing_config()
+ .index_nested_properties()) {
+ // Assign any indexed sections recursively
+ const SchemaTypeConfigProto& nested_type_config =
+ nested_type_config_iter->second;
+ ICING_RETURN_IF_ERROR(
+ AssignSections(nested_type_config,
+ ConcatenatePath(current_section_path,
+ property_config.property_name()),
+ type_config_map, metadata_list));
+ }
+ break;
}
- }
-
- // Only index strings currently.
- if (property_config.has_data_type() !=
- PropertyConfigProto::DataType::STRING ||
- property_config.string_indexing_config().term_match_type() ==
+ case PropertyConfigProto::DataType::STRING: {
+ if (property_config.string_indexing_config().term_match_type() !=
TermMatchType::UNKNOWN) {
- // No need to create section for current property
- continue;
- }
-
- // Creates section metadata according to data type
- // Validates next section id, makes sure that section id is the same as
- // the list index so that we could find any section metadata by id in O(1)
- // later.
- auto new_section_id = static_cast<SectionId>(metadata_list->size());
- if (!IsSectionIdValid(new_section_id)) {
- // Max number of sections reached
- return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
- "Too many properties to be indexed, max number of properties "
- "allowed: %d",
- kMaxSectionId - kMinSectionId + 1));
+ ICING_RETURN_IF_ERROR(AppendNewSectionMetadata(
+ metadata_list,
+ ConcatenatePath(current_section_path,
+ property_config.property_name()),
+ PropertyConfigProto::DataType::STRING,
+ property_config.string_indexing_config().tokenizer_type(),
+ property_config.string_indexing_config().term_match_type(),
+ IntegerIndexingConfig::NumericMatchType::UNKNOWN));
+ }
+ break;
+ }
+ case PropertyConfigProto::DataType::INT64: {
+ if (property_config.integer_indexing_config().numeric_match_type() !=
+ IntegerIndexingConfig::NumericMatchType::UNKNOWN) {
+ ICING_RETURN_IF_ERROR(AppendNewSectionMetadata(
+ metadata_list,
+ ConcatenatePath(current_section_path,
+ property_config.property_name()),
+ PropertyConfigProto::DataType::INT64,
+ StringIndexingConfig::TokenizerType::NONE, TermMatchType::UNKNOWN,
+ property_config.integer_indexing_config().numeric_match_type()));
+ }
+ break;
+ }
+ default: {
+ // Skip other data types.
+ break;
+ }
}
-
- // Creates section metadata from property config
- metadata_list->emplace_back(
- new_section_id,
- property_config.string_indexing_config().term_match_type(),
- property_config.string_indexing_config().tokenizer_type(),
- ConcatenatePath(current_section_path, property_config.property_name()));
}
return libtextclassifier3::Status::OK;
}
@@ -153,16 +184,40 @@ BuildSectionMetadataCache(const SchemaUtil::TypeConfigMap& type_config_map,
return section_metadata_cache;
}
-// Helper function to get string content from a property. Repeated values are
-// joined into one string. We only care about the STRING data type.
-std::vector<std::string_view> GetStringPropertyContent(
+// Helper function to get content from a property according to the template type
+// T. We only care about STRING and INT64, which are the only 2 indexable data
+// types.
+template <typename T>
+libtextclassifier3::StatusOr<std::vector<T>> GetPropertyContent(
const PropertyProto& property) {
- std::vector<std::string_view> values;
- if (!property.string_values().empty()) {
- std::copy(property.string_values().begin(), property.string_values().end(),
- std::back_inserter(values));
+ return absl_ports::UnimplementedError(
+ "Unimplemented template type for GetPropertyContent");
+}
+
+template <>
+libtextclassifier3::StatusOr<std::vector<std::string_view>>
+GetPropertyContent<std::string_view>(const PropertyProto& property) {
+ return std::vector<std::string_view>(property.string_values().begin(),
+ property.string_values().end());
+}
+
+template <>
+libtextclassifier3::StatusOr<std::vector<int64_t>> GetPropertyContent<int64_t>(
+ const PropertyProto& property) {
+ return std::vector<int64_t>(property.int64_values().begin(),
+ property.int64_values().end());
+}
+
+template <typename T>
+void AppendSection(
+ SectionMetadata section_metadata,
+ libtextclassifier3::StatusOr<std::vector<T>>&& section_content_or,
+ std::vector<Section<T>>& sections_out) {
+ if (section_content_or.ok()) {
+ // Adds to result vector if section is found in document
+ sections_out.emplace_back(std::move(section_metadata),
+ std::move(section_content_or).ValueOrDie());
}
- return values;
}
} // namespace
@@ -185,9 +240,9 @@ SectionManager::Create(const SchemaUtil::TypeConfigMap& type_config_map,
schema_type_mapper, std::move(section_metadata_cache)));
}
-libtextclassifier3::StatusOr<std::vector<std::string_view>>
-SectionManager::GetStringSectionContent(const DocumentProto& document,
- std::string_view section_path) const {
+template <typename T>
+libtextclassifier3::StatusOr<std::vector<T>> SectionManager::GetSectionContent(
+ const DocumentProto& document, std::string_view section_path) const {
// Finds the first property name in section_path
size_t separator_position = section_path.find(kPropertySeparator);
std::string_view current_property_name =
@@ -212,11 +267,11 @@ SectionManager::GetStringSectionContent(const DocumentProto& document,
if (separator_position == std::string::npos) {
// Current property name is the last one in section path
- std::vector<std::string_view> content =
- GetStringPropertyContent(*property_iterator);
+ ICING_ASSIGN_OR_RETURN(std::vector<T> content,
+ GetPropertyContent<T>(*property_iterator));
if (content.empty()) {
// The content of property is explicitly set to empty, we'll treat it as
- // NOT_FOUND because the index doesn't care about empty strings.
+ // NOT_FOUND because the index doesn't care about empty contents.
return absl_ports::NotFoundError(absl_ports::StrCat(
"Section path '", section_path, "' content was empty"));
}
@@ -226,13 +281,11 @@ SectionManager::GetStringSectionContent(const DocumentProto& document,
// Gets section content recursively
std::string_view sub_section_path =
section_path.substr(separator_position + 1);
- std::vector<std::string_view> nested_document_content;
+ std::vector<T> nested_document_content;
for (const auto& nested_document : property_iterator->document_values()) {
- auto content_or =
- GetStringSectionContent(nested_document, sub_section_path);
+ auto content_or = GetSectionContent<T>(nested_document, sub_section_path);
if (content_or.ok()) {
- std::vector<std::string_view> content =
- std::move(content_or).ValueOrDie();
+ std::vector<T> content = std::move(content_or).ValueOrDie();
std::move(content.begin(), content.end(),
std::back_inserter(nested_document_content));
}
@@ -245,9 +298,17 @@ SectionManager::GetStringSectionContent(const DocumentProto& document,
return nested_document_content;
}
-libtextclassifier3::StatusOr<std::vector<std::string_view>>
-SectionManager::GetStringSectionContent(const DocumentProto& document,
- SectionId section_id) const {
+// Explicit template instantiation
+template libtextclassifier3::StatusOr<std::vector<std::string_view>>
+SectionManager::GetSectionContent<std::string_view>(
+ const DocumentProto& document, std::string_view section_path) const;
+template libtextclassifier3::StatusOr<std::vector<int64_t>>
+SectionManager::GetSectionContent<int64_t>(const DocumentProto& document,
+ std::string_view section_path) const;
+
+template <typename T>
+libtextclassifier3::StatusOr<std::vector<T>> SectionManager::GetSectionContent(
+ const DocumentProto& document, SectionId section_id) const {
if (!IsSectionIdValid(section_id)) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Section id %d is greater than the max value %d", section_id,
@@ -262,9 +323,17 @@ SectionManager::GetStringSectionContent(const DocumentProto& document,
}
// The index of metadata list is the same as the section id, so we can use
// section id as the index.
- return GetStringSectionContent(document, metadata_list->at(section_id).path);
+ return GetSectionContent<T>(document, metadata_list->at(section_id).path);
}
+// Explicit template instantiation
+template libtextclassifier3::StatusOr<std::vector<std::string_view>>
+SectionManager::GetSectionContent<std::string_view>(
+ const DocumentProto& document, SectionId section_id) const;
+template libtextclassifier3::StatusOr<std::vector<int64_t>>
+SectionManager::GetSectionContent<int64_t>(const DocumentProto& document,
+ SectionId section_id) const;
+
libtextclassifier3::StatusOr<const SectionMetadata*>
SectionManager::GetSectionMetadata(SchemaTypeId schema_type_id,
SectionId section_id) const {
@@ -286,21 +355,34 @@ SectionManager::GetSectionMetadata(SchemaTypeId schema_type_id,
return &section_metadatas[section_id];
}
-libtextclassifier3::StatusOr<std::vector<Section>>
-SectionManager::ExtractSections(const DocumentProto& document) const {
+libtextclassifier3::StatusOr<SectionGroup> SectionManager::ExtractSections(
+ const DocumentProto& document) const {
ICING_ASSIGN_OR_RETURN(const std::vector<SectionMetadata>* metadata_list,
GetMetadataList(document.schema()));
- std::vector<Section> sections;
- for (const auto& section_metadata : *metadata_list) {
- auto section_content_or =
- GetStringSectionContent(document, section_metadata.path);
- // Adds to result vector if section is found in document
- if (section_content_or.ok()) {
- sections.emplace_back(SectionMetadata(section_metadata),
- std::move(section_content_or).ValueOrDie());
+ SectionGroup section_group;
+ for (const SectionMetadata& section_metadata : *metadata_list) {
+ switch (section_metadata.data_type) {
+ case PropertyConfigProto::DataType::STRING: {
+ AppendSection(section_metadata,
+ GetSectionContent<std::string_view>(
+ document, section_metadata.path),
+ section_group.string_sections);
+ break;
+ }
+ case PropertyConfigProto::DataType::INT64: {
+ AppendSection(
+ section_metadata,
+ GetSectionContent<int64_t>(document, section_metadata.path),
+ section_group.integer_sections);
+ break;
+ }
+ default: {
+ // Skip other data types.
+ break;
+ }
}
}
- return sections;
+ return section_group;
}
libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
diff --git a/icing/schema/section-manager.h b/icing/schema/section-manager.h
index 51eb133..78a5acb 100644
--- a/icing/schema/section-manager.h
+++ b/icing/schema/section-manager.h
@@ -55,27 +55,36 @@ class SectionManager {
const SchemaUtil::TypeConfigMap& type_config_map,
const KeyMapper<SchemaTypeId>* schema_type_mapper);
- // Finds content of a section by section path (e.g. property1.property2)
+ // Finds contents of a section by section path (e.g. property1.property2)
+ // according to the template type T.
+ //
+ // Types of supported T:
+ // - std::string, std::string_view: return property.string_values()
+ // - int64_t : return property.int64_values()
//
// Returns:
- // A string of content on success
+ // A vector of contents with the specified type on success
// NOT_FOUND if:
// 1. Property is optional and not found in the document
// 2. section_path is invalid
- // 3. Content is empty
- libtextclassifier3::StatusOr<std::vector<std::string_view>>
- GetStringSectionContent(const DocumentProto& document,
- std::string_view section_path) const;
+ // 3. Content is empty (could be caused by incorrect type T)
+ template <typename T>
+ libtextclassifier3::StatusOr<std::vector<T>> GetSectionContent(
+ const DocumentProto& document, std::string_view section_path) const;
- // Finds content of a section by id
+ // Finds contents of a section by id according to the template type T.
+ //
+ // Types of supported T:
+ // - std::string, std::string_view: return property.string_values()
+ // - int64_t : return property.int64_values()
//
// Returns:
- // A string of content on success
+ // A vector of contents on success
// INVALID_ARGUMENT if section id is invalid
// NOT_FOUND if type config name of document not found
- libtextclassifier3::StatusOr<std::vector<std::string_view>>
- GetStringSectionContent(const DocumentProto& document,
- SectionId section_id) const;
+ template <typename T>
+ libtextclassifier3::StatusOr<std::vector<T>> GetSectionContent(
+ const DocumentProto& document, SectionId section_id) const;
// Returns the SectionMetadata associated with the SectionId that's in the
// SchemaTypeId.
@@ -86,14 +95,16 @@ class SectionManager {
libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata(
SchemaTypeId schema_type_id, SectionId section_id) const;
- // Extracts all sections from the given document, sections are sorted by
- // section id in increasing order. Section ids start from 0. Sections with
- // empty content won't be returned.
+ // Extracts all sections of different types from the given document and group
+ // them by type.
+ // - Sections are sorted by section id in ascending order.
+ // - Section ids start from 0.
+ // - Sections with empty content won't be returned.
//
// Returns:
- // A list of sections on success
+ // A SectionGroup instance on success
// NOT_FOUND if type config name of document not found
- libtextclassifier3::StatusOr<std::vector<Section>> ExtractSections(
+ libtextclassifier3::StatusOr<SectionGroup> ExtractSections(
const DocumentProto& document) const;
// Returns:
diff --git a/icing/schema/section-manager_test.cc b/icing/schema/section-manager_test.cc
index 39c02d1..4e8fbbd 100644
--- a/icing/schema/section-manager_test.cc
+++ b/icing/schema/section-manager_test.cc
@@ -23,6 +23,7 @@
#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
#include "icing/schema/schema-util.h"
#include "icing/store/dynamic-trie-key-mapper.h"
#include "icing/store/key-mapper.h"
@@ -32,6 +33,8 @@
namespace icing {
namespace lib {
+namespace {
+
using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::HasSubstr;
@@ -44,11 +47,16 @@ constexpr char kPropertySubject[] = "subject";
constexpr char kPropertyText[] = "text";
constexpr char kPropertyAttachment[] = "attachment";
constexpr char kPropertyRecipients[] = "recipients";
+constexpr char kPropertyRecipientIds[] = "recipientIds";
+constexpr char kPropertyTimestamp[] = "timestamp";
+constexpr char kPropertyNonIndexableInteger[] = "non_indexable_integer";
// type and property names of Conversation
constexpr char kTypeConversation[] = "Conversation";
constexpr char kPropertyName[] = "name";
constexpr char kPropertyEmails[] = "emails";
+constexpr int64_t kDefaultTimestamp = 1663274901;
+
class SectionManagerTest : public ::testing::Test {
protected:
SectionManagerTest() : test_dir_(GetTestTempDir() + "/icing") {
@@ -67,6 +75,9 @@ class SectionManagerTest : public ::testing::Test {
.AddBytesProperty(kPropertyAttachment, "attachment bytes")
.AddStringProperty(kPropertyRecipients, "recipient1", "recipient2",
"recipient3")
+ .AddInt64Property(kPropertyRecipientIds, 1, 2, 3)
+ .AddInt64Property(kPropertyTimestamp, kDefaultTimestamp)
+ .AddInt64Property(kPropertyNonIndexableInteger, 100)
.Build();
conversation_document_ =
@@ -91,39 +102,41 @@ class SectionManagerTest : public ::testing::Test {
}
static SchemaTypeConfigProto CreateEmailTypeConfig() {
- SchemaTypeConfigProto type;
- type.set_schema_type(kTypeEmail);
-
- auto subject = type.add_properties();
- subject->set_property_name(kPropertySubject);
- subject->set_data_type(PropertyConfigProto::DataType::STRING);
- subject->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- subject->mutable_string_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- subject->mutable_string_indexing_config()->set_tokenizer_type(
- StringIndexingConfig::TokenizerType::PLAIN);
-
- auto text = type.add_properties();
- text->set_property_name(kPropertyText);
- text->set_data_type(PropertyConfigProto::DataType::STRING);
- text->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- text->mutable_string_indexing_config()->set_term_match_type(
- TermMatchType::UNKNOWN);
-
- auto attachment = type.add_properties();
- attachment->set_property_name(kPropertyAttachment);
- attachment->set_data_type(PropertyConfigProto::DataType::BYTES);
- attachment->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
-
- auto recipients = type.add_properties();
- recipients->set_property_name(kPropertyRecipients);
- recipients->set_data_type(PropertyConfigProto::DataType::STRING);
- recipients->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
- recipients->mutable_string_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- recipients->mutable_string_indexing_config()->set_tokenizer_type(
- StringIndexingConfig::TokenizerType::PLAIN);
-
+ SchemaTypeConfigProto type =
+ SchemaTypeConfigBuilder()
+ .SetType(kTypeEmail)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertySubject)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertyText)
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyAttachment)
+ .SetDataType(TYPE_BYTES)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertyRecipients)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyRecipientIds)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyTimestamp)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyNonIndexableInteger)
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
return type;
}
@@ -133,15 +146,15 @@ class SectionManagerTest : public ::testing::Test {
auto name = type.add_properties();
name->set_property_name(kPropertyName);
- name->set_data_type(PropertyConfigProto::DataType::STRING);
- name->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ name->set_data_type(TYPE_STRING);
+ name->set_cardinality(CARDINALITY_OPTIONAL);
name->mutable_string_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
+ TERM_MATCH_EXACT);
auto emails = type.add_properties();
emails->set_property_name(kPropertyEmails);
- emails->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- emails->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+ emails->set_data_type(TYPE_DOCUMENT);
+ emails->set_cardinality(CARDINALITY_REPEATED);
emails->set_schema_type(kTypeEmail);
emails->mutable_document_indexing_config()->set_index_nested_properties(
true);
@@ -172,10 +185,10 @@ TEST_F(SectionManagerTest, CreationWithTooManyPropertiesShouldFail) {
for (int i = 0; i < max_num_sections_allowed + 1; i++) {
auto property = type_config.add_properties();
property->set_property_name("property" + std::to_string(i));
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ property->set_data_type(TYPE_STRING);
+ property->set_cardinality(CARDINALITY_REQUIRED);
property->mutable_string_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
+ TERM_MATCH_EXACT);
}
SchemaUtil::TypeConfigMap type_config_map;
@@ -187,109 +200,186 @@ TEST_F(SectionManagerTest, CreationWithTooManyPropertiesShouldFail) {
HasSubstr("Too many properties")));
}
-TEST_F(SectionManagerTest, GetStringSectionContent) {
+TEST_F(SectionManagerTest, GetSectionContent) {
ICING_ASSERT_OK_AND_ASSIGN(
auto section_manager,
SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
// Test simple section paths
- EXPECT_THAT(
- section_manager->GetStringSectionContent(email_document_,
- /*section_path*/ "subject"),
- IsOkAndHolds(ElementsAre("the subject")));
- EXPECT_THAT(section_manager->GetStringSectionContent(email_document_,
- /*section_path*/ "text"),
+ EXPECT_THAT(section_manager->GetSectionContent<std::string_view>(
+ email_document_,
+ /*section_path=*/"subject"),
+ IsOkAndHolds(ElementsAre("the subject")));
+ EXPECT_THAT(section_manager->GetSectionContent<std::string_view>(
+ email_document_,
+ /*section_path=*/"text"),
IsOkAndHolds(ElementsAre("the text")));
+ EXPECT_THAT(
+ section_manager->GetSectionContent<int64_t>(email_document_,
+ /*section_path=*/"timestamp"),
+ IsOkAndHolds(ElementsAre(kDefaultTimestamp)));
+}
- // Test repeated values, they are joined into one string
+TEST_F(SectionManagerTest, GetSectionContentRepeatedValues) {
ICING_ASSERT_OK_AND_ASSIGN(
- auto content,
- section_manager->GetStringSectionContent(email_document_,
- /*section_path*/ "recipients"));
- EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3"));
+ auto section_manager,
+ SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+ // Test repeated values
+ EXPECT_THAT(
+ section_manager->GetSectionContent<std::string_view>(
+ email_document_,
+ /*section_path=*/"recipients"),
+ IsOkAndHolds(ElementsAre("recipient1", "recipient2", "recipient3")));
+ EXPECT_THAT(section_manager->GetSectionContent<int64_t>(
+ email_document_,
+ /*section_path=*/"recipientIds"),
+ IsOkAndHolds(ElementsAre(1, 2, 3)));
+}
+
+TEST_F(SectionManagerTest, GetSectionContentConcatenatedSectionPaths) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto section_manager,
+ SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
// Test concatenated section paths: "property1.property2"
- ICING_ASSERT_OK_AND_ASSIGN(content, section_manager->GetStringSectionContent(
- conversation_document_,
- /*section_path*/ "emails.subject"));
- EXPECT_THAT(content, ElementsAre("the subject", "the subject"));
-
- ICING_ASSERT_OK_AND_ASSIGN(content, section_manager->GetStringSectionContent(
- conversation_document_,
- /*section_path*/ "emails.text"));
- EXPECT_THAT(content, ElementsAre("the text", "the text"));
-
- ICING_ASSERT_OK_AND_ASSIGN(content,
- section_manager->GetStringSectionContent(
- conversation_document_,
- /*section_path*/ "emails.recipients"));
- EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3",
- "recipient1", "recipient2", "recipient3"));
+ EXPECT_THAT(section_manager->GetSectionContent<std::string_view>(
+ conversation_document_,
+ /*section_path=*/"emails.subject"),
+ IsOkAndHolds(ElementsAre("the subject", "the subject")));
+ EXPECT_THAT(section_manager->GetSectionContent<std::string_view>(
+ conversation_document_,
+ /*section_path=*/"emails.text"),
+ IsOkAndHolds(ElementsAre("the text", "the text")));
+ EXPECT_THAT(section_manager->GetSectionContent<int64_t>(
+ conversation_document_,
+ /*section_path=*/"emails.timestamp"),
+ IsOkAndHolds(ElementsAre(kDefaultTimestamp, kDefaultTimestamp)));
+ EXPECT_THAT(
+ section_manager->GetSectionContent<std::string_view>(
+ conversation_document_,
+ /*section_path=*/"emails.recipients"),
+ IsOkAndHolds(ElementsAre("recipient1", "recipient2", "recipient3",
+ "recipient1", "recipient2", "recipient3")));
+ EXPECT_THAT(section_manager->GetSectionContent<int64_t>(
+ conversation_document_,
+ /*section_path=*/"emails.recipientIds"),
+ IsOkAndHolds(ElementsAre(1, 2, 3, 1, 2, 3)));
+}
+
+TEST_F(SectionManagerTest, GetSectionContentNonExistingPaths) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto section_manager,
+ SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
// Test non-existing paths
- EXPECT_THAT(section_manager->GetStringSectionContent(email_document_,
- /*section_path*/ "name"),
+ EXPECT_THAT(section_manager->GetSectionContent<std::string_view>(
+ email_document_,
+ /*section_path=*/"name"),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- EXPECT_THAT(
- section_manager->GetStringSectionContent(email_document_,
- /*section_path*/ "invalid"),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- EXPECT_THAT(section_manager->GetStringSectionContent(
+ EXPECT_THAT(section_manager->GetSectionContent<std::string_view>(
+ email_document_,
+ /*section_path=*/"invalid"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(section_manager->GetSectionContent<std::string_view>(
conversation_document_,
- /*section_path*/ "emails.invalid"),
+ /*section_path=*/"emails.invalid"),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(SectionManagerTest, GetSectionContentNonIndexableTypes) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto section_manager,
+ SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
// Test other data types
// BYTES type can't be indexed, so content won't be returned
- EXPECT_THAT(
- section_manager->GetStringSectionContent(email_document_,
- /*section_path*/ "attachment"),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(section_manager->GetSectionContent<std::string_view>(
+ email_document_,
+ /*section_path=*/"attachment"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
- // The following tests are similar to the ones above but use section ids
- // instead of section paths
+TEST_F(SectionManagerTest, GetSectionContentMismatchedType) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto section_manager,
+ SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
- // EmailMessage (section id -> section path):
- SectionId recipients_section_id = 0;
- SectionId subject_section_id = 1;
- SectionId invalid_email_section_id = 2;
+ // Use the wrong template type to get the indexable content. GetSectionContent
+ // should get empty content from the corresponding proto (repeated) field and
+ // return NOT_FOUND error.
+ EXPECT_THAT(section_manager->GetSectionContent<std::string_view>(
+ email_document_,
+ /*section_path=*/"recipientIds"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(section_manager->GetSectionContent<int64_t>(
+ email_document_,
+ /*section_path=*/"recipients"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+// The following tests are similar to the ones above but use section ids
+// instead of section paths
+TEST_F(SectionManagerTest, GetSectionContentBySectionId) {
ICING_ASSERT_OK_AND_ASSIGN(
- content, section_manager->GetStringSectionContent(email_document_,
- recipients_section_id));
- EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3"));
+ auto section_manager,
+ SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
- EXPECT_THAT(section_manager->GetStringSectionContent(email_document_,
- subject_section_id),
+ // EmailMessage (section id -> section path):
+ SectionId recipient_ids_section_id = 0;
+ SectionId recipients_section_id = 1;
+ SectionId subject_section_id = 2;
+ SectionId timestamp_section_id = 3;
+ SectionId invalid_email_section_id = 4;
+ EXPECT_THAT(section_manager->GetSectionContent<int64_t>(
+ email_document_, recipient_ids_section_id),
+ IsOkAndHolds(ElementsAre(1, 2, 3)));
+ EXPECT_THAT(
+ section_manager->GetSectionContent<std::string_view>(
+ email_document_, recipients_section_id),
+ IsOkAndHolds(ElementsAre("recipient1", "recipient2", "recipient3")));
+ EXPECT_THAT(section_manager->GetSectionContent<std::string_view>(
+ email_document_, subject_section_id),
IsOkAndHolds(ElementsAre("the subject")));
+ EXPECT_THAT(section_manager->GetSectionContent<int64_t>(email_document_,
+ timestamp_section_id),
+ IsOkAndHolds(ElementsAre(kDefaultTimestamp)));
- EXPECT_THAT(section_manager->GetStringSectionContent(
+ EXPECT_THAT(section_manager->GetSectionContent<std::string_view>(
email_document_, invalid_email_section_id),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
// Conversation (section id -> section path):
- // 0 -> emails.recipients
- // 1 -> emails.subject
- // 2 -> name
- SectionId emails_recipients_section_id = 0;
- SectionId emails_subject_section_id = 1;
- SectionId name_section_id = 2;
- SectionId invalid_conversation_section_id = 3;
- ICING_ASSERT_OK_AND_ASSIGN(
- content, section_manager->GetStringSectionContent(
- conversation_document_, emails_recipients_section_id));
- EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3",
- "recipient1", "recipient2", "recipient3"));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- content, section_manager->GetStringSectionContent(
- conversation_document_, emails_subject_section_id));
- EXPECT_THAT(content, ElementsAre("the subject", "the subject"));
-
- EXPECT_THAT(section_manager->GetStringSectionContent(conversation_document_,
- name_section_id),
+ // 0 -> emails.recipientIds
+ // 1 -> emails.recipients
+ // 2 -> emails.subject
+ // 3 -> emails.timestamp
+ // 4 -> name
+ SectionId emails_recipient_ids_section_id = 0;
+ SectionId emails_recipients_section_id = 1;
+ SectionId emails_subject_section_id = 2;
+ SectionId emails_timestamp_section_id = 3;
+ SectionId name_section_id = 4;
+ SectionId invalid_conversation_section_id = 5;
+ EXPECT_THAT(section_manager->GetSectionContent<int64_t>(
+ conversation_document_, emails_recipient_ids_section_id),
+ IsOkAndHolds(ElementsAre(1, 2, 3, 1, 2, 3)));
+ EXPECT_THAT(
+ section_manager->GetSectionContent<std::string_view>(
+ conversation_document_, emails_recipients_section_id),
+ IsOkAndHolds(ElementsAre("recipient1", "recipient2", "recipient3",
+ "recipient1", "recipient2", "recipient3")));
+ EXPECT_THAT(section_manager->GetSectionContent<std::string_view>(
+ conversation_document_, emails_subject_section_id),
+ IsOkAndHolds(ElementsAre("the subject", "the subject")));
+ EXPECT_THAT(section_manager->GetSectionContent<int64_t>(
+ conversation_document_, emails_timestamp_section_id),
+ IsOkAndHolds(ElementsAre(kDefaultTimestamp, kDefaultTimestamp)));
+
+ EXPECT_THAT(section_manager->GetSectionContent<std::string_view>(
+ conversation_document_, name_section_id),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- EXPECT_THAT(section_manager->GetStringSectionContent(
+ EXPECT_THAT(section_manager->GetSectionContent<std::string_view>(
conversation_document_, invalid_conversation_section_id),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
@@ -300,35 +390,91 @@ TEST_F(SectionManagerTest, ExtractSections) {
SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
// Extracts all sections from 'EmailMessage' document
- ICING_ASSERT_OK_AND_ASSIGN(auto sections,
+ ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group,
section_manager->ExtractSections(email_document_));
- EXPECT_THAT(sections.size(), Eq(2));
- EXPECT_THAT(sections[0].metadata.id, Eq(0));
- EXPECT_THAT(sections[0].metadata.path, Eq("recipients"));
- EXPECT_THAT(sections[0].content,
+ // String sections
+ EXPECT_THAT(section_group.string_sections, SizeIs(2));
+
+ EXPECT_THAT(section_group.string_sections[0].metadata,
+ Eq(SectionMetadata(
+ /*id_in=*/1, TYPE_STRING, TOKENIZER_PLAIN, TERM_MATCH_EXACT,
+ NUMERIC_MATCH_UNKNOWN,
+ /*path_in=*/"recipients")));
+ EXPECT_THAT(section_group.string_sections[0].content,
ElementsAre("recipient1", "recipient2", "recipient3"));
- EXPECT_THAT(sections[1].metadata.id, Eq(1));
- EXPECT_THAT(sections[1].metadata.path, Eq("subject"));
- EXPECT_THAT(sections[1].content, ElementsAre("the subject"));
+ EXPECT_THAT(section_group.string_sections[1].metadata,
+ Eq(SectionMetadata(
+ /*id_in=*/2, TYPE_STRING, TOKENIZER_PLAIN, TERM_MATCH_EXACT,
+ NUMERIC_MATCH_UNKNOWN,
+ /*path_in=*/"subject")));
+ EXPECT_THAT(section_group.string_sections[1].content,
+ ElementsAre("the subject"));
+
+ // Integer sections
+ EXPECT_THAT(section_group.integer_sections, SizeIs(2));
+
+ EXPECT_THAT(section_group.integer_sections[0].metadata,
+ Eq(SectionMetadata(/*id_in=*/0, TYPE_INT64, TOKENIZER_NONE,
+ TERM_MATCH_UNKNOWN, NUMERIC_MATCH_RANGE,
+ /*path_in=*/"recipientIds")));
+ EXPECT_THAT(section_group.integer_sections[0].content, ElementsAre(1, 2, 3));
+
+ EXPECT_THAT(section_group.integer_sections[1].metadata,
+ Eq(SectionMetadata(/*id_in=*/3, TYPE_INT64, TOKENIZER_NONE,
+ TERM_MATCH_UNKNOWN, NUMERIC_MATCH_RANGE,
+ /*path_in=*/"timestamp")));
+ EXPECT_THAT(section_group.integer_sections[1].content,
+ ElementsAre(kDefaultTimestamp));
+}
+
+TEST_F(SectionManagerTest, ExtractSectionsNested) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto section_manager,
+ SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
// Extracts all sections from 'Conversation' document
ICING_ASSERT_OK_AND_ASSIGN(
- sections, section_manager->ExtractSections(conversation_document_));
- EXPECT_THAT(sections.size(), Eq(2));
-
- // Section id 3 (name) not found in document, so the first section id found
- // is 1 below.
- EXPECT_THAT(sections[0].metadata.id, Eq(0));
- EXPECT_THAT(sections[0].metadata.path, Eq("emails.recipients"));
- EXPECT_THAT(sections[0].content,
+ SectionGroup section_group,
+ section_manager->ExtractSections(conversation_document_));
+
+ // String sections
+ EXPECT_THAT(section_group.string_sections, SizeIs(2));
+
+ EXPECT_THAT(section_group.string_sections[0].metadata,
+ Eq(SectionMetadata(
+ /*id_in=*/1, TYPE_STRING, TOKENIZER_PLAIN, TERM_MATCH_EXACT,
+ NUMERIC_MATCH_UNKNOWN,
+ /*path_in=*/"emails.recipients")));
+ EXPECT_THAT(section_group.string_sections[0].content,
ElementsAre("recipient1", "recipient2", "recipient3",
"recipient1", "recipient2", "recipient3"));
- EXPECT_THAT(sections[1].metadata.id, Eq(1));
- EXPECT_THAT(sections[1].metadata.path, Eq("emails.subject"));
- EXPECT_THAT(sections[1].content, ElementsAre("the subject", "the subject"));
+ EXPECT_THAT(section_group.string_sections[1].metadata,
+ Eq(SectionMetadata(
+ /*id_in=*/2, TYPE_STRING, TOKENIZER_PLAIN, TERM_MATCH_EXACT,
+ NUMERIC_MATCH_UNKNOWN,
+ /*path_in=*/"emails.subject")));
+ EXPECT_THAT(section_group.string_sections[1].content,
+ ElementsAre("the subject", "the subject"));
+
+ // Integer sections
+ EXPECT_THAT(section_group.integer_sections, SizeIs(2));
+
+ EXPECT_THAT(section_group.integer_sections[0].metadata,
+ Eq(SectionMetadata(/*id_in=*/0, TYPE_INT64, TOKENIZER_NONE,
+ TERM_MATCH_UNKNOWN, NUMERIC_MATCH_RANGE,
+ /*path_in=*/"emails.recipientIds")));
+ EXPECT_THAT(section_group.integer_sections[0].content,
+ ElementsAre(1, 2, 3, 1, 2, 3));
+
+ EXPECT_THAT(section_group.integer_sections[1].metadata,
+ Eq(SectionMetadata(/*id_in=*/3, TYPE_INT64, TOKENIZER_NONE,
+ TERM_MATCH_UNKNOWN, NUMERIC_MATCH_RANGE,
+ /*path_in=*/"emails.timestamp")));
+ EXPECT_THAT(section_group.integer_sections[1].content,
+ ElementsAre(kDefaultTimestamp, kDefaultTimestamp));
}
TEST_F(SectionManagerTest,
@@ -344,54 +490,53 @@ TEST_F(SectionManagerTest,
// Create an int property with a string_indexing_config
auto int_property = type_with_non_string_properties.add_properties();
int_property->set_property_name("int");
- int_property->set_data_type(PropertyConfigProto::DataType::INT64);
- int_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ int_property->set_data_type(TYPE_INT64);
+ int_property->set_cardinality(CARDINALITY_REQUIRED);
int_property->mutable_string_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
+ TERM_MATCH_EXACT);
int_property->mutable_string_indexing_config()->set_tokenizer_type(
- StringIndexingConfig::TokenizerType::PLAIN);
+ TOKENIZER_PLAIN);
// Create a double property with a string_indexing_config
auto double_property = type_with_non_string_properties.add_properties();
double_property->set_property_name("double");
- double_property->set_data_type(PropertyConfigProto::DataType::DOUBLE);
- double_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ double_property->set_data_type(TYPE_DOUBLE);
+ double_property->set_cardinality(CARDINALITY_REQUIRED);
double_property->mutable_string_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
+ TERM_MATCH_EXACT);
double_property->mutable_string_indexing_config()->set_tokenizer_type(
- StringIndexingConfig::TokenizerType::PLAIN);
+ TOKENIZER_PLAIN);
// Create a boolean property with a string_indexing_config
auto boolean_property = type_with_non_string_properties.add_properties();
boolean_property->set_property_name("boolean");
- boolean_property->set_data_type(PropertyConfigProto::DataType::BOOLEAN);
- boolean_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ boolean_property->set_data_type(TYPE_BOOLEAN);
+ boolean_property->set_cardinality(CARDINALITY_REQUIRED);
boolean_property->mutable_string_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
+ TERM_MATCH_EXACT);
boolean_property->mutable_string_indexing_config()->set_tokenizer_type(
- StringIndexingConfig::TokenizerType::PLAIN);
+ TOKENIZER_PLAIN);
// Create a bytes property with a string_indexing_config
auto bytes_property = type_with_non_string_properties.add_properties();
bytes_property->set_property_name("bytes");
- bytes_property->set_data_type(PropertyConfigProto::DataType::BYTES);
- bytes_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ bytes_property->set_data_type(TYPE_BYTES);
+ bytes_property->set_cardinality(CARDINALITY_REQUIRED);
bytes_property->mutable_string_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
+ TERM_MATCH_EXACT);
bytes_property->mutable_string_indexing_config()->set_tokenizer_type(
- StringIndexingConfig::TokenizerType::PLAIN);
+ TOKENIZER_PLAIN);
// Create a document property with a string_indexing_config
auto document_property = type_with_non_string_properties.add_properties();
document_property->set_property_name("document");
- document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ document_property->set_data_type(TYPE_DOCUMENT);
document_property->set_schema_type(empty_type.schema_type());
- document_property->set_cardinality(
- PropertyConfigProto::Cardinality::REQUIRED);
+ document_property->set_cardinality(CARDINALITY_REQUIRED);
document_property->mutable_string_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
+ TERM_MATCH_EXACT);
document_property->mutable_string_indexing_config()->set_tokenizer_type(
- StringIndexingConfig::TokenizerType::PLAIN);
+ TOKENIZER_PLAIN);
// Setup classes to create the section manager
SchemaUtil::TypeConfigMap type_config_map;
@@ -435,9 +580,109 @@ TEST_F(SectionManagerTest,
.Build();
// Extracts sections from 'Schema' document
- ICING_ASSERT_OK_AND_ASSIGN(auto sections,
+ ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group,
+ section_manager->ExtractSections(document));
+ EXPECT_THAT(section_group.string_sections, IsEmpty());
+ EXPECT_THAT(section_group.integer_sections, IsEmpty());
+}
+
+TEST_F(SectionManagerTest,
+ NonIntegerFieldsWithIntegerIndexingConfigDontCreateSections) {
+ // Create a schema for an empty document.
+ SchemaTypeConfigProto empty_type;
+ empty_type.set_schema_type("EmptySchema");
+
+ // Create a schema with all the non-integer fields
+ SchemaTypeConfigProto type_with_non_integer_properties;
+ type_with_non_integer_properties.set_schema_type("Schema");
+
+ // Create a string property with an integer_indexing_config
+ auto string_property = type_with_non_integer_properties.add_properties();
+ string_property->set_property_name("string");
+ string_property->set_data_type(TYPE_STRING);
+ string_property->set_cardinality(CARDINALITY_REQUIRED);
+ string_property->mutable_integer_indexing_config()->set_numeric_match_type(
+ NUMERIC_MATCH_RANGE);
+
+ // Create a double property with an integer_indexing_config
+ auto double_property = type_with_non_integer_properties.add_properties();
+ double_property->set_property_name("double");
+ double_property->set_data_type(TYPE_DOUBLE);
+ double_property->set_cardinality(CARDINALITY_REQUIRED);
+ double_property->mutable_integer_indexing_config()->set_numeric_match_type(
+ NUMERIC_MATCH_RANGE);
+
+ // Create a boolean property with an integer_indexing_config
+ auto boolean_property = type_with_non_integer_properties.add_properties();
+ boolean_property->set_property_name("boolean");
+ boolean_property->set_data_type(TYPE_BOOLEAN);
+ boolean_property->set_cardinality(CARDINALITY_REQUIRED);
+ boolean_property->mutable_integer_indexing_config()->set_numeric_match_type(
+ NUMERIC_MATCH_RANGE);
+
+ // Create a bytes property with an integer_indexing_config
+ auto bytes_property = type_with_non_integer_properties.add_properties();
+ bytes_property->set_property_name("bytes");
+ bytes_property->set_data_type(TYPE_BYTES);
+ bytes_property->set_cardinality(CARDINALITY_REQUIRED);
+ bytes_property->mutable_integer_indexing_config()->set_numeric_match_type(
+ NUMERIC_MATCH_RANGE);
+
+ // Create a document property with an integer_indexing_config
+ auto document_property = type_with_non_integer_properties.add_properties();
+ document_property->set_property_name("document");
+ document_property->set_data_type(TYPE_DOCUMENT);
+ document_property->set_schema_type(empty_type.schema_type());
+ document_property->set_cardinality(CARDINALITY_REQUIRED);
+ document_property->mutable_integer_indexing_config()->set_numeric_match_type(
+ NUMERIC_MATCH_RANGE);
+
+ // Setup classes to create the section manager
+ SchemaUtil::TypeConfigMap type_config_map;
+ type_config_map.emplace(type_with_non_integer_properties.schema_type(),
+ type_with_non_integer_properties);
+ type_config_map.emplace(empty_type.schema_type(), empty_type);
+
+ // DynamicTrieKeyMapper uses 3 internal arrays for bookkeeping. Give each one
+ // 128KiB so the total DynamicTrieKeyMapper should get 384KiB
+ int key_mapper_size = 3 * 128 * 1024;
+ std::string dir = GetTestTempDir() + "/non_integer_fields";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, dir,
+ key_mapper_size));
+ ICING_ASSERT_OK(schema_type_mapper->Put(
+ type_with_non_integer_properties.schema_type(), /*schema_type_id=*/0));
+ ICING_ASSERT_OK(schema_type_mapper->Put(empty_type.schema_type(),
+ /*schema_type_id=*/1));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto section_manager,
+ SectionManager::Create(type_config_map, schema_type_mapper.get()));
+
+ // Create an empty document to be nested
+ DocumentProto empty_document = DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema(empty_type.schema_type())
+ .Build();
+
+ // Create a document that follows "Schema"
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "uri2")
+ .SetSchema(type_with_non_integer_properties.schema_type())
+ .AddStringProperty("string", "abc")
+ .AddDoubleProperty("double", 0.2)
+ .AddBooleanProperty("boolean", true)
+ .AddBytesProperty("bytes", "attachment bytes")
+ .AddDocumentProperty("document", empty_document)
+ .Build();
+
+ // Extracts sections from 'Schema' document
+ ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group,
section_manager->ExtractSections(document));
- EXPECT_THAT(sections.size(), Eq(0));
+ EXPECT_THAT(section_group.string_sections, IsEmpty());
+ EXPECT_THAT(section_group.integer_sections, IsEmpty());
}
TEST_F(SectionManagerTest, AssignSectionsRecursivelyForDocumentFields) {
@@ -447,12 +692,19 @@ TEST_F(SectionManagerTest, AssignSectionsRecursivelyForDocumentFields) {
auto string_property = document_type.add_properties();
string_property->set_property_name("string");
- string_property->set_data_type(PropertyConfigProto::DataType::STRING);
- string_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ string_property->set_data_type(TYPE_STRING);
+ string_property->set_cardinality(CARDINALITY_REQUIRED);
string_property->mutable_string_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
+ TERM_MATCH_EXACT);
string_property->mutable_string_indexing_config()->set_tokenizer_type(
- StringIndexingConfig::TokenizerType::PLAIN);
+ TOKENIZER_PLAIN);
+
+ auto integer_property = document_type.add_properties();
+ integer_property->set_property_name("integer");
+ integer_property->set_data_type(TYPE_INT64);
+ integer_property->set_cardinality(CARDINALITY_REQUIRED);
+ integer_property->mutable_integer_indexing_config()->set_numeric_match_type(
+ NUMERIC_MATCH_RANGE);
// Create the outer schema which has the document property.
SchemaTypeConfigProto type;
@@ -460,10 +712,9 @@ TEST_F(SectionManagerTest, AssignSectionsRecursivelyForDocumentFields) {
auto document_property = type.add_properties();
document_property->set_property_name("document");
- document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ document_property->set_data_type(TYPE_DOCUMENT);
document_property->set_schema_type(document_type.schema_type());
- document_property->set_cardinality(
- PropertyConfigProto::Cardinality::REQUIRED);
+ document_property->set_cardinality(CARDINALITY_REQUIRED);
// Opt into recursing into the document fields.
document_property->mutable_document_indexing_config()
@@ -474,6 +725,7 @@ TEST_F(SectionManagerTest, AssignSectionsRecursivelyForDocumentFields) {
.SetKey("icing", "uri1")
.SetSchema(document_type.schema_type())
.AddStringProperty("string", "foo")
+ .AddInt64Property("integer", 123)
.Build();
// Create the outer document that holds the inner document
@@ -509,10 +761,11 @@ TEST_F(SectionManagerTest, AssignSectionsRecursivelyForDocumentFields) {
SectionManager::Create(type_config_map, schema_type_mapper.get()));
// Extracts sections from 'Schema' document; there should be the 1 string
- // property inside the document.
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections,
+ // property and 1 integer property inside the document.
+ ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group,
section_manager->ExtractSections(outer_document));
- EXPECT_THAT(sections, SizeIs(1));
+ EXPECT_THAT(section_group.string_sections, SizeIs(1));
+ EXPECT_THAT(section_group.integer_sections, SizeIs(1));
}
TEST_F(SectionManagerTest, DontAssignSectionsRecursivelyForDocumentFields) {
@@ -522,12 +775,19 @@ TEST_F(SectionManagerTest, DontAssignSectionsRecursivelyForDocumentFields) {
auto string_property = document_type.add_properties();
string_property->set_property_name("string");
- string_property->set_data_type(PropertyConfigProto::DataType::STRING);
- string_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ string_property->set_data_type(TYPE_STRING);
+ string_property->set_cardinality(CARDINALITY_REQUIRED);
string_property->mutable_string_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
+ TERM_MATCH_EXACT);
string_property->mutable_string_indexing_config()->set_tokenizer_type(
- StringIndexingConfig::TokenizerType::PLAIN);
+ TOKENIZER_PLAIN);
+
+ auto integer_property = document_type.add_properties();
+ integer_property->set_property_name("integer");
+ integer_property->set_data_type(TYPE_INT64);
+ integer_property->set_cardinality(CARDINALITY_REQUIRED);
+ integer_property->mutable_integer_indexing_config()->set_numeric_match_type(
+ NUMERIC_MATCH_RANGE);
// Create the outer schema which has the document property.
SchemaTypeConfigProto type;
@@ -535,10 +795,9 @@ TEST_F(SectionManagerTest, DontAssignSectionsRecursivelyForDocumentFields) {
auto document_property = type.add_properties();
document_property->set_property_name("document");
- document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ document_property->set_data_type(TYPE_DOCUMENT);
document_property->set_schema_type(document_type.schema_type());
- document_property->set_cardinality(
- PropertyConfigProto::Cardinality::REQUIRED);
+ document_property->set_cardinality(CARDINALITY_REQUIRED);
// Opt into recursing into the document fields.
document_property->mutable_document_indexing_config()
@@ -549,6 +808,7 @@ TEST_F(SectionManagerTest, DontAssignSectionsRecursivelyForDocumentFields) {
.SetKey("icing", "uri1")
.SetSchema(document_type.schema_type())
.AddStringProperty("string", "foo")
+ .AddInt64Property("integer", 123)
.Build();
// Create the outer document that holds the inner document
@@ -585,10 +845,13 @@ TEST_F(SectionManagerTest, DontAssignSectionsRecursivelyForDocumentFields) {
// Extracts sections from 'Schema' document; there won't be any since we
// didn't recurse into the document to see the inner string property
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections,
+ ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group,
section_manager->ExtractSections(outer_document));
- EXPECT_THAT(sections, IsEmpty());
+ EXPECT_THAT(section_group.string_sections, IsEmpty());
+ EXPECT_THAT(section_group.integer_sections, IsEmpty());
}
+} // namespace
+
} // namespace lib
} // namespace icing
diff --git a/icing/schema/section.h b/icing/schema/section.h
index 34c8c58..241095b 100644
--- a/icing/schema/section.h
+++ b/icing/schema/section.h
@@ -54,11 +54,14 @@ struct SectionMetadata {
// A unique id of property within a type config
SectionId id;
- // How strings should be tokenized. It is invalid for a section to have
- // tokenizer == 'NONE'.
+ // Indexable data type of this section. E.g. STRING, INT64.
+ PropertyConfigProto::DataType::Code data_type;
+
+ // How strings should be tokenized. It is invalid for a string section
+ // (data_type == 'STRING') to have tokenizer == 'NONE'.
StringIndexingConfig::TokenizerType::Code tokenizer;
- // How tokens in this section should be matched.
+ // How tokens in a string section should be matched.
//
// TermMatchType::UNKNOWN:
// Terms will not match anything
@@ -70,30 +73,68 @@ struct SectionMetadata {
// Terms will be only stored as an exact match, "fool" only matches "fool"
TermMatchType::Code term_match_type = TermMatchType::UNKNOWN;
- SectionMetadata(SectionId id_in, TermMatchType::Code term_match_type_in,
- StringIndexingConfig::TokenizerType::Code tokenizer,
- std::string&& path_in)
+ // How tokens in a numeric section should be matched.
+ //
+ // NumericMatchType::UNKNOWN:
+ // Contents will not match anything. It is invalid for a numeric section
+ // (data_type == 'INT64') to have numeric_match_type == 'UNKNOWN'.
+ //
+ // NumericMatchType::RANGE:
+ // Contents will be matched by a range query.
+ IntegerIndexingConfig::NumericMatchType::Code numeric_match_type;
+
+ explicit SectionMetadata(
+ SectionId id_in, PropertyConfigProto::DataType::Code data_type_in,
+ StringIndexingConfig::TokenizerType::Code tokenizer,
+ TermMatchType::Code term_match_type_in,
+ IntegerIndexingConfig::NumericMatchType::Code numeric_match_type_in,
+ std::string&& path_in)
: path(std::move(path_in)),
id(id_in),
+ data_type(data_type_in),
tokenizer(tokenizer),
- term_match_type(term_match_type_in) {}
+ term_match_type(term_match_type_in),
+ numeric_match_type(numeric_match_type_in) {}
+
+ SectionMetadata(const SectionMetadata& other) = default;
+ SectionMetadata& operator=(const SectionMetadata& other) = default;
+
+ SectionMetadata(SectionMetadata&& other) = default;
+ SectionMetadata& operator=(SectionMetadata&& other) = default;
bool operator==(const SectionMetadata& rhs) const {
- return path == rhs.path && id == rhs.id && tokenizer == rhs.tokenizer &&
- term_match_type == rhs.term_match_type;
+ return path == rhs.path && id == rhs.id && data_type == rhs.data_type &&
+ tokenizer == rhs.tokenizer &&
+ term_match_type == rhs.term_match_type &&
+ numeric_match_type == rhs.numeric_match_type;
}
};
// Section is an icing internal concept similar to document property but with
// extra metadata. The content can be a value or the combination of repeated
-// values of a property.
+// values of a property, and the type of content is specified by template.
+//
+// Current supported types:
+// - std::string_view (PropertyConfigProto::DataType::STRING)
+// - int64_t (PropertyConfigProto::DataType::INT64)
+template <typename T>
struct Section {
SectionMetadata metadata;
- std::vector<std::string_view> content;
+ std::vector<T> content;
- Section(SectionMetadata&& metadata_in,
- std::vector<std::string_view>&& content_in)
+ explicit Section(SectionMetadata&& metadata_in, std::vector<T>&& content_in)
: metadata(std::move(metadata_in)), content(std::move(content_in)) {}
+
+ PropertyConfigProto::DataType::Code data_type() const {
+ return metadata.data_type;
+ }
+};
+
+// Groups sections by content type. Callers can access only the section types
+// they need and avoid iterating over the others.
+struct SectionGroup {
+ std::vector<Section<std::string_view>> string_sections;
+ std::vector<Section<int64_t>> integer_sections;
};
} // namespace lib
diff --git a/icing/scoring/priority-queue-scored-document-hits-ranker.h b/icing/scoring/priority-queue-scored-document-hits-ranker.h
index e0ae4b0..3ef2ae5 100644
--- a/icing/scoring/priority-queue-scored-document-hits-ranker.h
+++ b/icing/scoring/priority-queue-scored-document-hits-ranker.h
@@ -51,7 +51,14 @@ class PriorityQueueScoredDocumentHitsRanker : public ScoredDocumentHitsRanker {
bool operator()(const ScoredDocumentHit& lhs,
const ScoredDocumentHit& rhs) const {
- return is_ascending_ == !(lhs < rhs);
+ // STL comparator requirement: equal MUST return false.
+ // If writing `return is_ascending_ == !(lhs < rhs)`:
+ // - When lhs == rhs, !(lhs < rhs) is true
+ // - If is_ascending_ is true, then we return true for equal case!
+ if (is_ascending_) {
+ return rhs < lhs;
+ }
+ return lhs < rhs;
}
private:
diff --git a/icing/scoring/scored-document-hit.h b/icing/scoring/scored-document-hit.h
index 079ba7e..96ca6aa 100644
--- a/icing/scoring/scored-document-hit.h
+++ b/icing/scoring/scored-document-hit.h
@@ -71,7 +71,14 @@ class ScoredDocumentHitComparator {
bool operator()(const ScoredDocumentHit& lhs,
const ScoredDocumentHit& rhs) const {
- return is_descending_ == !(lhs < rhs);
+ // STL comparator requirement: equal MUST return false.
+ // If writing `return is_descending_ == !(lhs < rhs)`:
+ // - When lhs == rhs, !(lhs < rhs) is true
+ // - If is_descending_ is true, then we return true for equal case!
+ if (is_descending_) {
+ return rhs < lhs;
+ }
+ return lhs < rhs;
}
private:
diff --git a/icing/scoring/scorer.cc b/icing/scoring/scorer.cc
index 5f33e66..14a004e 100644
--- a/icing/scoring/scorer.cc
+++ b/icing/scoring/scorer.cc
@@ -192,6 +192,10 @@ libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Scorer::Create(
case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP:
return std::make_unique<UsageScorer>(
document_store, scoring_spec.rank_by(), default_score);
+ case ScoringSpecProto::RankingStrategy::JOIN_AGGREGATE_SCORE:
+ ICING_LOG(WARNING)
+ << "JOIN_AGGREGATE_SCORE not implemented, falling back to NoScorer";
+ [[fallthrough]];
case ScoringSpecProto::RankingStrategy::NONE:
return std::make_unique<NoScorer>(default_score);
}
diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc
index 1062f50..5432cde 100644
--- a/icing/scoring/scorer_test.cc
+++ b/icing/scoring/scorer_test.cc
@@ -41,12 +41,6 @@ namespace lib {
namespace {
using ::testing::Eq;
-constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
- PropertyConfigProto::DataType::STRING;
-
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
- PropertyConfigProto::Cardinality::REQUIRED;
-
class ScorerTest : public testing::Test {
protected:
ScorerTest()
diff --git a/icing/scoring/scoring-processor_test.cc b/icing/scoring/scoring-processor_test.cc
index ad63a2b..921fc7f 100644
--- a/icing/scoring/scoring-processor_test.cc
+++ b/icing/scoring/scoring-processor_test.cc
@@ -41,12 +41,6 @@ using ::testing::Gt;
using ::testing::IsEmpty;
using ::testing::SizeIs;
-constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
- PropertyConfigProto::DataType::STRING;
-
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-
class ScoringProcessorTest : public testing::Test {
protected:
ScoringProcessorTest()
diff --git a/icing/store/document-id.h b/icing/store/document-id.h
index 3230819..7ea33b8 100644
--- a/icing/store/document-id.h
+++ b/icing/store/document-id.h
@@ -26,7 +26,8 @@ using DocumentId = int32_t;
// We use 22 bits to encode document_ids and use the largest value (2^22 - 1) to
// represent an invalid document_id.
inline constexpr int kDocumentIdBits = 22;
-inline constexpr DocumentId kInvalidDocumentId = (1u << kDocumentIdBits) - 1;
+inline constexpr DocumentId kInvalidDocumentId =
+ (INT32_C(1) << kDocumentIdBits) - 1;
inline constexpr DocumentId kMinDocumentId = 0;
inline constexpr DocumentId kMaxDocumentId = kInvalidDocumentId - 1;
diff --git a/icing/store/document-store_benchmark.cc b/icing/store/document-store_benchmark.cc
index ab9bff1..a4b3a17 100644
--- a/icing/store/document-store_benchmark.cc
+++ b/icing/store/document-store_benchmark.cc
@@ -65,14 +65,6 @@ namespace lib {
namespace {
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-
class DestructibleDirectory {
public:
explicit DestructibleDirectory(const Filesystem& filesystem,
@@ -101,17 +93,18 @@ DocumentProto CreateDocument(const std::string namespace_,
SchemaProto CreateSchema() {
return SchemaBuilder()
- .AddType(
- SchemaTypeConfigBuilder()
- .SetType("email")
- .AddProperty(PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
}
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index e158fdc..7cf951a 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -91,17 +91,6 @@ const NamespaceStorageInfoProto& GetNamespaceStorageInfo(
return std::move(NamespaceStorageInfoProto());
}
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-
-constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
- StringIndexingConfig::TokenizerType::PLAIN;
-
-constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-
-constexpr PropertyConfigProto::DataType::Code TYPE_INT =
- PropertyConfigProto::DataType::INT64;
-
UsageReport CreateUsageReport(std::string name_space, std::string uri,
int64 timestamp_ms,
UsageReport::UsageType usage_type) {
@@ -183,16 +172,16 @@ class DocumentStoreTest : public ::testing::Test {
.AddType(
SchemaTypeConfigBuilder()
.SetType("email")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_,
@@ -2363,7 +2352,7 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) {
.AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
@@ -2597,7 +2586,7 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) {
.AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
@@ -3427,11 +3416,11 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) {
.SetType("email")
.AddProperty(PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.Build();
SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build();
@@ -3476,14 +3465,14 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType("alarm")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("name")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(PropertyConfigBuilder()
.SetName("time")
- .SetDataType(TYPE_INT)
+ .SetDataType(TYPE_INT64)
.SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(email_type_config)
.Build();
@@ -3527,11 +3516,11 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryDoesntUpdateTypeIds) {
.SetType("email")
.AddProperty(PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.Build();
SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build();
@@ -3576,14 +3565,14 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryDoesntUpdateTypeIds) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType("alarm")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("name")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(PropertyConfigBuilder()
.SetName("time")
- .SetDataType(TYPE_INT)
+ .SetDataType(TYPE_INT64)
.SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(email_type_config)
.Build();
@@ -3623,11 +3612,11 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) {
.SetType("email")
.AddProperty(PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.Build();
SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build();
@@ -3689,7 +3678,7 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) {
.SetType("email")
.AddProperty(PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.Build();
schema = SchemaBuilder().AddType(email_type_config).Build();
@@ -3728,11 +3717,11 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) {
.SetType("email")
.AddProperty(PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.AddProperty(PropertyConfigBuilder()
.SetName("body")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.Build();
SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build();
@@ -3794,7 +3783,7 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) {
.SetType("email")
.AddProperty(PropertyConfigBuilder()
.SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL))
.Build();
schema = SchemaBuilder().AddType(email_type_config).Build();
@@ -3828,16 +3817,16 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType("email")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
std::string schema_store_dir = schema_store_dir_ + "_migrate";
@@ -3948,20 +3937,20 @@ TEST_F(DocumentStoreTest, GetDebugInfo) {
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder()
.SetType("email")
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("subject")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL))
- .AddProperty(
- PropertyConfigBuilder()
- .SetName("body")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
- .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
.AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty(
PropertyConfigBuilder()
.SetName("name")
- .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
std::string schema_store_dir = schema_store_dir_ + "_custom";
diff --git a/icing/tokenization/token.h b/icing/tokenization/token.h
index 41fbee0..05d6fe4 100644
--- a/icing/tokenization/token.h
+++ b/icing/tokenization/token.h
@@ -47,7 +47,7 @@ struct Token {
QUERY_RIGHT_PARENTHESES, // Right parentheses
// Types used in URL tokenization
- URL_SCHEME, // "http", "https"
+ URL_SCHEME, // "http", "https", "ftp", "content"
URL_USERNAME,
URL_PASSWORD,
URL_HOST_COMMON_PART, // Hosts are split into two types, common and
diff --git a/icing/tokenization/tokenizer-factory.cc b/icing/tokenization/tokenizer-factory.cc
index dc5cfdf..d120ac8 100644
--- a/icing/tokenization/tokenizer-factory.cc
+++ b/icing/tokenization/tokenizer-factory.cc
@@ -24,6 +24,11 @@
#include "icing/tokenization/raw-query-tokenizer.h"
#include "icing/tokenization/rfc822-tokenizer.h"
#include "icing/tokenization/tokenizer.h"
+
+#ifdef ENABLE_URL_TOKENIZER
+#include "icing/tokenization/url-tokenizer.h"
+#endif // ENABLE_URL_TOKENIZER
+
#include "icing/tokenization/verbatim-tokenizer.h"
#include "icing/util/status-macros.h"
@@ -44,6 +49,12 @@ CreateIndexingTokenizer(StringIndexingConfig::TokenizerType::Code type,
return std::make_unique<VerbatimTokenizer>();
case StringIndexingConfig::TokenizerType::RFC822:
return std::make_unique<Rfc822Tokenizer>();
+// TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export
+// to Android.
+#ifdef ENABLE_URL_TOKENIZER
+ case StringIndexingConfig::TokenizerType::URL:
+ return std::make_unique<UrlTokenizer>();
+#endif // ENABLE_URL_TOKENIZER
case StringIndexingConfig::TokenizerType::NONE:
[[fallthrough]];
default:
diff --git a/icing/util/document-validator_test.cc b/icing/util/document-validator_test.cc
index 46a2679..310494a 100644
--- a/icing/util/document-validator_test.cc
+++ b/icing/util/document-validator_test.cc
@@ -32,6 +32,7 @@ namespace icing {
namespace lib {
namespace {
+
using ::testing::HasSubstr;
// type and property names of EmailMessage
@@ -47,16 +48,6 @@ constexpr char kPropertyEmails[] = "emails";
constexpr char kDefaultNamespace[] = "icing";
constexpr char kDefaultString[] = "This is a string.";
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
- PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
- PropertyConfigProto::Cardinality::REQUIRED;
-constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
- PropertyConfigProto::Cardinality::REPEATED;
-
-constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
- PropertyConfigProto::DataType::STRING;
-
class DocumentValidatorTest : public ::testing::Test {
protected:
DocumentValidatorTest() {}
diff --git a/icing/util/tokenized-document.cc b/icing/util/tokenized-document.cc
index 885e489..e741987 100644
--- a/icing/util/tokenized-document.cc
+++ b/icing/util/tokenized-document.cc
@@ -49,9 +49,11 @@ libtextclassifier3::Status TokenizedDocument::Tokenize(
DocumentValidator validator(schema_store);
ICING_RETURN_IF_ERROR(validator.Validate(document_));
- ICING_ASSIGN_OR_RETURN(std::vector<Section> sections,
+ ICING_ASSIGN_OR_RETURN(SectionGroup section_group,
schema_store->ExtractSections(document_));
- for (const Section& section : sections) {
+ // string sections
+ for (const Section<std::string_view>& section :
+ section_group.string_sections) {
ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
section.metadata.tokenizer, language_segmenter));
diff --git a/proto/icing/proto/schema.proto b/proto/icing/proto/schema.proto
index f960708..d9c43e2 100644
--- a/proto/icing/proto/schema.proto
+++ b/proto/icing/proto/schema.proto
@@ -111,6 +111,21 @@ message StringIndexingConfig {
// original string as an rfc822 token.
// See more here: https://datatracker.ietf.org/doc/html/rfc822
RFC822 = 3;
+
+ // Tokenizes text as a url address. This tokenizes a url string into a
+ // token for each component in the url, as well as any significant
+ // url suffixes. For example,
+ // https://www.google.com/path/subpath?query#ref would be tokenized into a
+ // scheme token "https"; 3 host tokens "www", "google", "com"; 2 path
+ // tokens "path", "subpath"; a query token "query"; a reference token
+ // "ref"; and 3 suffix tokens
+ // "https://www.google.com/path/subpath?query#ref",
+ // "www.google.com/path/subpath?query#ref",
+ // "google.com/path/subpath?query#ref".
+ // Currently only supports tokenization of one url string at a time
+ // i.e. the input string cannot have spaces in the middle, but can have
+ // leading or trailing spaces.
+ URL = 4;
}
}
optional TokenizerType.Code tokenizer_type = 2;
@@ -128,10 +143,31 @@ message DocumentIndexingConfig {
optional bool index_nested_properties = 1;
}
+// Describes how a int64 property should be indexed.
+// Next tag: 3
+message IntegerIndexingConfig {
+ // OPTIONAL: Indicates how the int64 contents of this property should be
+ // matched.
+ //
+ // The default value is UNKNOWN.
+ message NumericMatchType {
+ enum Code {
+ // Contents in this property will not be indexed. Useful if the int64
+ // property type is not indexable.
+ UNKNOWN = 0;
+
+ // Contents in this property should only be returned for queries matching
+ // the range.
+ RANGE = 1;
+ }
+ }
+ optional NumericMatchType.Code numeric_match_type = 1;
+}
+
// Describes the schema of a single property of Documents that belong to a
// specific SchemaTypeConfigProto. These can be considered as a rich, structured
// type for each property of Documents accepted by IcingSearchEngine.
-// Next tag: 7
+// Next tag: 8
message PropertyConfigProto {
// REQUIRED: Name that uniquely identifies a property within an Document of
// a specific SchemaTypeConfigProto.
@@ -208,6 +244,10 @@ message PropertyConfigProto {
// OPTIONAL: Describes how document properties should be indexed.
optional DocumentIndexingConfig document_indexing_config = 6;
+
+ // OPTIONAL: Describes how int64 properties should be indexed. Int64
+ // properties that do not set the indexing config will not be indexed.
+ optional IntegerIndexingConfig integer_indexing_config = 7;
}
// List of all supported types constitutes the schema used by Icing.
diff --git a/proto/icing/proto/scoring.proto b/proto/icing/proto/scoring.proto
index 375e9bd..13861c9 100644
--- a/proto/icing/proto/scoring.proto
+++ b/proto/icing/proto/scoring.proto
@@ -68,6 +68,9 @@ message ScoringSpecProto {
// Ranked by relevance score, currently computed as BM25F score.
RELEVANCE_SCORE = 9;
+
+ // Ranked by the aggregated score of the joined documents.
+ JOIN_AGGREGATE_SCORE = 10;
}
}
optional RankingStrategy.Code rank_by = 1;
diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto
index e3324a3..181c63c 100644
--- a/proto/icing/proto/search.proto
+++ b/proto/icing/proto/search.proto
@@ -27,7 +27,7 @@ option java_multiple_files = true;
option objc_class_prefix = "ICNG";
// Client-supplied specifications on what documents to retrieve.
-// Next tag: 7
+// Next tag: 8
message SearchSpecProto {
// REQUIRED: The "raw" query string that users may type. For example, "cat"
// will search for documents with the term cat in it.
@@ -86,6 +86,10 @@ message SearchSpecProto {
// TODO(b/208654892) Remove this field once EXPERIMENTAL_ICING_ADVANCED_QUERY
// is fully supported.
optional SearchType.Code search_type = 6 [default = ICING_RAW_QUERY];
+
+ // OPTIONAL: If this field is present, join documents based on a nested
+ // SearchSpec.
+ optional JoinSpecProto join_spec = 7;
}
// Client-supplied specifications on what to include/how to format the search
@@ -282,7 +286,7 @@ message SearchResultProto {
optional StatusProto status = 1;
// The Results that matched the query. Empty if there was an error.
- // Next tag: 4
+ // Next tag: 5
message ResultProto {
// Document that matches the SearchSpecProto.
optional DocumentProto document = 1;
@@ -294,6 +298,9 @@ message SearchResultProto {
// The score that the document was ranked by. The meaning of this score is
// determined by ScoringSpecProto.rank_by.
optional double score = 3;
+
+ // The documents that were joined to a parent document.
+ repeated ResultProto joined_results = 4;
}
repeated ResultProto results = 2;
@@ -418,3 +425,50 @@ message SuggestionResponse {
repeated Suggestion suggestions = 2;
}
+
+// Specification for a left outer join.
+//
+// Next tag: 7
+message JoinSpecProto {
+ // A nested SearchSpec that will be used to retrieve joined documents. If you
+ // are only looking to join on Action type documents, you could set a schema
+ // filter in this SearchSpec. This includes the nested search query. See
+ // SearchSpecProto.
+ optional SearchSpecProto nested_search_spec = 1;
+
+ // The equivalent of a primary key in SQL. This is an expression that will be
+ // used to match child documents from the nested search to this document. One
+ // such expression is qualifiedId(). When used, it means the
+ // child_property_expression in the joined documents must be equal to the
+ // qualified id.
+ // TODO(b/256022027) allow for parent_property_expression to be any property
+ // of the parent document.
+ optional string parent_property_expression = 2;
+
+ // The equivalent of a foreign key in SQL. This defines an equality constraint
+ // between a property in a child document and a property in the parent
+ // document. For example, if you want to join Action documents which have an
+ // entityId property containing a fully qualified document id,
+ // child_property_expression can be set to "entityId".
+ // TODO(b/256022027) figure out how to allow this to refer to documents
+ // outside of same pkg+db+ns.
+ optional string child_property_expression = 3;
+
+ // The maximum number of joined documents to join to a parent document.
+ optional int32 max_joined_result_count = 4;
+
+ // The strategy by which to score the aggregation of joined documents. For
+ // example, you might want to know which entity document has the most actions
+ // taken on it. If JOIN_AGGREGATE_SCORE is used in the base SearchSpecProto,
+ // the COUNT value will rank entity documents based on the number of joined
+ // documents.
+ enum AggregationScore {
+ UNDEFINED = 0;
+ COUNT = 1;
+ MIN = 2;
+ AVG = 3;
+ MAX = 4;
+ SUM = 5;
+ }
+ optional AggregationScore aggregation_score_strategy = 5 [default = COUNT];
+}
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index cf3c8f0..55403b4 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=484090353)
+set(synced_AOSP_CL_number=487674301)