diff options
80 files changed, 4321 insertions, 2369 deletions
diff --git a/icing/file/memory-mapped-file-leak_test.cc b/icing/file/memory-mapped-file-leak_test.cc deleted file mode 100644 index ff031df..0000000 --- a/icing/file/memory-mapped-file-leak_test.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "perftools/profiles/collector/heap/alloc_recorder.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "icing/file/filesystem.h" -#include "icing/file/memory-mapped-file.h" -#include "icing/testing/common-matchers.h" -#include "icing/testing/recorder-test-utils.h" -#include "icing/testing/tmp-directory.h" - -namespace icing { -namespace lib { -namespace { - -namespace heap_profile = ::perftools::profiles::collector::heap; - -using testing::Le; - -TEST(MemoryMappedFileTest, MMapMemoryLeak) { - std::string test_dir = GetTestTempDir(); - std::string recorder_dir = test_dir + "/recorder"; - Filesystem filesystem; - ASSERT_TRUE(filesystem.CreateDirectoryRecursively(recorder_dir.c_str())); - - ASSERT_TRUE(heap_profile::AllocRecorderStartWithMmapTracking(recorder_dir)); - { - std::string mmfile_dir = test_dir + "/file"; - ASSERT_TRUE(filesystem.CreateDirectoryRecursively(mmfile_dir.c_str())); - - // Don't use ICING_ASSERT_OK_AND_ASSIGN or matcher IsOk to prevent - // unnecessary implicit heap memory allocation in these macros. 
- libtextclassifier3::StatusOr<MemoryMappedFile> mmfile_or = - MemoryMappedFile::Create(filesystem, mmfile_dir + "/mmfile", - MemoryMappedFile::READ_WRITE_AUTO_SYNC); - ASSERT_TRUE(mmfile_or.ok()); - MemoryMappedFile mmfile = std::move(mmfile_or).ValueOrDie(); - - // How this works: - // We request a 500-byte mapping starting at the 101st byte of the file. - // But(!), mmap only accepts offsets that are multiples of page size. So - // instead mmfile will create a 600-byte mapping starting at the 1st byte of - // file and then return the address of the 101st byte within that mapping. - // For this reason, total bytes and peak bytes will be 600 bytes. - // - // When mmfile goes out of scope it needs to munmap the mapping that it - // created. But, remember that the mapping is larger (600 bytes) than what - // we requested (500 bytes)! So mmfile needs to remember the actual size of - // the mapping, NOT the requested size. Calling munmap with the correct size - // will ensure that total_inuse_bytes is 0 after mmfile goes out of scope. - // Calling munmap with the requested size would still keep 100 bytes of the - // mapping around! - mmfile.Remap(100, 500); - } - heap_profile::AllocRecorderStop(); - - // Mmap only affects bytes measurements. - ProfileInfo profile_info = SummarizeProfileProto(recorder_dir + ".0.pb.gz"); - EXPECT_THAT(profile_info.total_alloc_bytes, Le(600)); - EXPECT_THAT(profile_info.peak_bytes, Le(600)); - EXPECT_THAT(profile_info.inuse_bytes, Le(0)); -} - -} // namespace -} // namespace lib -} // namespace icing diff --git a/icing/index/main/flash-index-storage-header.h b/icing/file/posting_list/flash-index-storage-header.h index 71ec816..6bbf1ba 100644 --- a/icing/index/main/flash-index-storage-header.h +++ b/icing/file/posting_list/flash-index-storage-header.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_ -#define ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_ +#ifndef ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_HEADER_H_ +#define ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_HEADER_H_ #include <cstdint> #include <memory> @@ -119,4 +119,4 @@ static_assert(16 == sizeof(HeaderBlock::Header), } // namespace lib } // namespace icing -#endif // ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_ +#endif // ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_HEADER_H_ diff --git a/icing/index/main/flash-index-storage.cc b/icing/file/posting_list/flash-index-storage.cc index 33dacf9..f74bc55 100644 --- a/icing/index/main/flash-index-storage.cc +++ b/icing/file/posting_list/flash-index-storage.cc @@ -12,23 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/index/main/flash-index-storage.h" +#include "icing/file/posting_list/flash-index-storage.h" #include <sys/types.h> #include <algorithm> #include <cerrno> -#include <cinttypes> #include <cstdint> #include <memory> -#include <unordered_set> #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" -#include "icing/file/memory-mapped-file.h" -#include "icing/index/main/index-block.h" -#include "icing/index/main/posting-list-free.h" -#include "icing/index/main/posting-list-utils.h" +#include "icing/file/posting_list/index-block.h" +#include "icing/file/posting_list/posting-list-common.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/util/logging.h" #include "icing/util/math-util.h" @@ -55,9 +51,9 @@ uint32_t SelectBlockSize() { libtextclassifier3::StatusOr<FlashIndexStorage> FlashIndexStorage::Create( const std::string& index_filename, const Filesystem* filesystem, - bool in_memory) { + PostingListUsedSerializer* serializer, bool in_memory) { ICING_RETURN_ERROR_IF_NULL(filesystem); - FlashIndexStorage storage(index_filename, filesystem, in_memory); + 
FlashIndexStorage storage(index_filename, filesystem, serializer, in_memory); if (!storage.Init()) { return absl_ports::InternalError( "Unable to successfully read header block!"); @@ -67,10 +63,12 @@ libtextclassifier3::StatusOr<FlashIndexStorage> FlashIndexStorage::Create( FlashIndexStorage::FlashIndexStorage(const std::string& index_filename, const Filesystem* filesystem, + PostingListUsedSerializer* serializer, bool has_in_memory_freelists) : index_filename_(index_filename), num_blocks_(0), filesystem_(filesystem), + serializer_(serializer), has_in_memory_freelists_(has_in_memory_freelists) {} FlashIndexStorage::~FlashIndexStorage() { @@ -127,13 +125,16 @@ bool FlashIndexStorage::CreateHeader() { // Work down from the largest posting list that fits in // block_size. We don't care about locality of blocks because this // is a flash index. - for (uint32_t posting_list_bytes = - IndexBlock::CalculateMaxPostingListBytes(block_size); - posting_list_bytes >= posting_list_utils::min_posting_list_size(); + for (uint32_t posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( + block_size, serializer_->GetDataTypeBytes()); + posting_list_bytes >= serializer_->GetMinPostingListSize(); posting_list_bytes /= 2) { uint32_t aligned_posting_list_bytes = - (posting_list_bytes / sizeof(Hit) * sizeof(Hit)); - ICING_VLOG(1) << "Block size " << header_block_->header()->num_index_block_infos << ": " << aligned_posting_list_bytes; + (posting_list_bytes / serializer_->GetDataTypeBytes()) * + serializer_->GetDataTypeBytes(); + ICING_VLOG(1) << "Block size " + << header_block_->header()->num_index_block_infos << ": " + << aligned_posting_list_bytes; // Initialize free list to empty. 
HeaderBlock::Header::IndexBlockInfo* block_info = @@ -167,18 +168,22 @@ bool FlashIndexStorage::OpenHeader(int64_t file_size) { return false; } if (file_size % read_header.header()->block_size != 0) { - ICING_LOG(ERROR) << "Index size " << file_size << " not a multiple of block size " << read_header.header()->block_size; + ICING_LOG(ERROR) << "Index size " << file_size + << " not a multiple of block size " + << read_header.header()->block_size; return false; } if (file_size < static_cast<int64_t>(read_header.header()->block_size)) { - ICING_LOG(ERROR) << "Index size " << file_size << " shorter than block size " << read_header.header()->block_size; + ICING_LOG(ERROR) << "Index size " << file_size + << " shorter than block size " + << read_header.header()->block_size; return false; } if (read_header.header()->block_size % getpagesize() != 0) { ICING_LOG(ERROR) << "Block size " << read_header.header()->block_size - << " is not a multiple of page size " << getpagesize(); + << " is not a multiple of page size " << getpagesize(); return false; } num_blocks_ = file_size / read_header.header()->block_size; @@ -207,11 +212,12 @@ bool FlashIndexStorage::OpenHeader(int64_t file_size) { for (int i = 0; i < header_block_->header()->num_index_block_infos; ++i) { int posting_list_bytes = header_block_->header()->index_block_infos[i].posting_list_bytes; - if (posting_list_bytes % sizeof(Hit) != 0) { - ICING_LOG(ERROR) << "Posting list size misaligned, index " << i - << ", size " + if (posting_list_bytes % serializer_->GetDataTypeBytes() != 0) { + ICING_LOG(ERROR) + << "Posting list size misaligned, index " << i << ", size " << header_block_->header()->index_block_infos[i].posting_list_bytes - << ", hit " << sizeof(Hit) << ", file_size " << file_size; + << ", data_type_bytes " << serializer_->GetDataTypeBytes() + << ", file_size " << file_size; return false; } } @@ -270,7 +276,7 @@ libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::GetIndexBlock( } off_t offset = 
static_cast<off_t>(block_index) * block_size(); return IndexBlock::CreateFromPreexistingIndexBlockRegion( - *filesystem_, index_filename_, offset, block_size()); + *filesystem_, index_filename_, serializer_, offset, block_size()); } libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::CreateIndexBlock( @@ -283,7 +289,8 @@ libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::CreateIndexBlock( } off_t offset = static_cast<off_t>(block_index) * block_size(); return IndexBlock::CreateFromUninitializedRegion( - *filesystem_, index_filename_, offset, block_size(), posting_list_size); + *filesystem_, index_filename_, serializer_, offset, block_size(), + posting_list_size); } int FlashIndexStorage::FindBestIndexBlockInfo( @@ -381,7 +388,8 @@ FlashIndexStorage::AllocateNewPostingList(int block_info_index) { libtextclassifier3::StatusOr<PostingListHolder> FlashIndexStorage::AllocatePostingList(uint32_t min_posting_list_bytes) { - int max_block_size = IndexBlock::CalculateMaxPostingListBytes(block_size()); + int max_block_size = IndexBlock::CalculateMaxPostingListBytes( + block_size(), serializer_->GetDataTypeBytes()); if (min_posting_list_bytes > max_block_size) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Requested posting list size %d exceeds max posting list size %d", diff --git a/icing/index/main/flash-index-storage.h b/icing/file/posting_list/flash-index-storage.h index fceb26f..032bfd2 100644 --- a/icing/index/main/flash-index-storage.h +++ b/icing/file/posting_list/flash-index-storage.h @@ -12,22 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef ICING_INDEX_FLASH_INDEX_STORAGE_H_ -#define ICING_INDEX_FLASH_INDEX_STORAGE_H_ +#ifndef ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_ +#define ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_ #include <cstdint> #include <memory> #include <string> +#include <vector> #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/file/filesystem.h" -#include "icing/index/main/flash-index-storage-header.h" -#include "icing/index/main/index-block.h" -#include "icing/index/main/posting-list-free.h" -#include "icing/index/main/posting-list-identifier.h" -#include "icing/index/main/posting-list-used.h" +#include "icing/file/posting_list/flash-index-storage-header.h" +#include "icing/file/posting_list/index-block.h" +#include "icing/file/posting_list/posting-list-identifier.h" +#include "icing/file/posting_list/posting-list-used.h" #include "icing/legacy/core/icing-packed-pod.h" +#include "icing/proto/debug.pb.h" #include "icing/store/document-id.h" namespace icing { @@ -84,7 +85,7 @@ class FlashIndexStorage { // one from disk. static libtextclassifier3::StatusOr<FlashIndexStorage> Create( const std::string& index_filename, const Filesystem* filesystem, - bool in_memory = true); + PostingListUsedSerializer* serializer, bool in_memory = true); // Retrieve the PostingList referred to by PostingListIdentifier. This posting // list must have been previously allocated by a prior call to @@ -136,7 +137,7 @@ class FlashIndexStorage { return filesystem_->GetDiskUsage(block_fd_.get()); } - // Returns the size of the index file used to contains hits. + // Returns the size of the index file used to contains data. uint64_t GetElementsSize() const { // Element size is the same as disk size excluding the header block. 
return GetDiskUsage() - block_size(); @@ -157,14 +158,19 @@ class FlashIndexStorage { return 1.0 - static_cast<double>(num_blocks_) / kMaxBlockIndex; } + const PostingListUsedSerializer* serializer() const { return serializer_; } + PostingListUsedSerializer* serializer() { return serializer_; } + libtextclassifier3::Status Reset(); // TODO(b/222349894) Convert the string output to a protocol buffer instead. void GetDebugInfo(DebugInfoVerbosity::Code verbosity, std::string* out) const; private: - FlashIndexStorage(const std::string& index_filename, - const Filesystem* filesystem, bool has_in_memory_freelists); + explicit FlashIndexStorage(const std::string& index_filename, + const Filesystem* filesystem, + PostingListUsedSerializer* serializer, + bool has_in_memory_freelists); // Init the index from persistence. Create if file does not exist. We do not // erase corrupt files. @@ -281,10 +287,12 @@ class FlashIndexStorage { const Filesystem* filesystem_; // not owned; can't be null + PostingListUsedSerializer* serializer_; // not owned; can't be null + bool has_in_memory_freelists_; }; } // namespace lib } // namespace icing -#endif // ICING_INDEX_FLASH_INDEX_STORAGE_H_ +#endif // ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_ diff --git a/icing/index/main/flash-index-storage_test.cc b/icing/file/posting_list/flash-index-storage_test.cc index 25fcaad..50f21f3 100644 --- a/icing/index/main/flash-index-storage_test.cc +++ b/icing/file/posting_list/flash-index-storage_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "icing/index/main/flash-index-storage.h" +#include "icing/file/posting_list/flash-index-storage.h" #include <unistd.h> @@ -27,6 +27,7 @@ #include "gtest/gtest.h" #include "icing/file/filesystem.h" #include "icing/index/hit/hit.h" +#include "icing/index/main/posting-list-used-hit-serializer.h" #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" @@ -49,9 +50,13 @@ class FlashIndexStorageTest : public testing::Test { test_dir_ = GetTestTempDir() + "/test_dir"; file_name_ = test_dir_ + "/test_file.idx.index"; ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str())); + + // TODO(b/249829533): test different serializers + serializer_ = std::make_unique<PostingListUsedHitSerializer>(); } void TearDown() override { + serializer_.reset(); ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str())); } @@ -59,6 +64,7 @@ class FlashIndexStorageTest : public testing::Test { std::string test_dir_; std::string file_name_; Filesystem filesystem_; + std::unique_ptr<PostingListUsedHitSerializer> serializer_; }; TEST_F(FlashIndexStorageTest, CorruptHeader) { @@ -66,13 +72,13 @@ TEST_F(FlashIndexStorageTest, CorruptHeader) { // Create the header file ICING_ASSERT_OK_AND_ASSIGN( FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name_, &filesystem_)); + FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get())); } { // Read the valid header - should pass ICING_ASSERT_OK_AND_ASSIGN( FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name_, &filesystem_)); + FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get())); } { // Corrupt the header file by changing pl_bytes @@ -84,8 +90,9 @@ TEST_F(FlashIndexStorageTest, CorruptHeader) { { // Read the header file - should fail because pl_bytes is not divisible // by sizeof(Hit), which is 5 as of writing - ASSERT_THAT(FlashIndexStorage::Create(file_name_, &filesystem_), - 
StatusIs(libtextclassifier3::StatusCode::INTERNAL)); + ASSERT_THAT( + FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } { // Correct the pl_bytes header alignment @@ -98,7 +105,7 @@ TEST_F(FlashIndexStorageTest, CorruptHeader) { // Read the valid header - should pass ICING_ASSERT_OK_AND_ASSIGN( FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name_, &filesystem_)); + FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get())); } // Delete the file @@ -110,7 +117,7 @@ TEST_F(FlashIndexStorageTest, EmptyStorage) { // Create the header file ICING_ASSERT_OK_AND_ASSIGN( FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name_, &filesystem_)); + FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get())); // An 'empty' FlashIndexStorage should have: // 1. One block allocated for the header EXPECT_THAT(flash_index_storage.num_blocks(), Eq(1)); @@ -126,7 +133,7 @@ TEST_F(FlashIndexStorageTest, EmptyStorage) { // Read the valid header. All functions should return the same values. ICING_ASSERT_OK_AND_ASSIGN( FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name_, &filesystem_)); + FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get())); EXPECT_THAT(flash_index_storage.num_blocks(), Eq(1)); EXPECT_THAT(flash_index_storage.empty(), IsTrue()); EXPECT_THAT(flash_index_storage.get_last_indexed_docid(), @@ -140,7 +147,7 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) { // Create the header file ICING_ASSERT_OK_AND_ASSIGN( FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name_, &filesystem_)); + FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get())); { // 1. Request a PL that is 1/2 block size. Remember that block size also // includes the BlockHeader. 
The BlockHeader isn't publicly visible, so we @@ -165,9 +172,10 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) { Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100), Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)}; for (const Hit& hit : hits1) { - ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit)); + ICING_ASSERT_OK( + serializer_->PrependHit(&posting_list_holder1.posting_list, hit)); } - EXPECT_THAT(posting_list_holder1.posting_list.GetHits(), + EXPECT_THAT(serializer_->GetHits(&posting_list_holder1.posting_list), IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); // 2. Get another PL. This should be on the same flash block. There should @@ -188,9 +196,10 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) { Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100), Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)}; for (const Hit& hit : hits2) { - ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit)); + ICING_ASSERT_OK( + serializer_->PrependHit(&posting_list_holder2.posting_list, hit)); } - EXPECT_THAT(posting_list_holder2.posting_list.GetHits(), + EXPECT_THAT(serializer_->GetHits(&posting_list_holder2.posting_list), IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend()))); // 3. Now, free the first posting list. This should add it to the free list @@ -214,7 +223,7 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) { EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index())); // Make sure this pl is empty. The hits that used to be there should be // gone. 
- EXPECT_THAT(posting_list_holder3.posting_list.GetHits(), + EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list), IsOkAndHolds(IsEmpty())); std::vector<Hit> hits3 = { Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62), @@ -222,9 +231,10 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) { Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12), Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)}; for (const Hit& hit : hits3) { - ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit)); + ICING_ASSERT_OK( + serializer_->PrependHit(&posting_list_holder3.posting_list, hit)); } - EXPECT_THAT(posting_list_holder3.posting_list.GetHits(), + EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list), IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend()))); } EXPECT_THAT(flash_index_storage.GetDiskUsage(), @@ -235,7 +245,8 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) { // Create the header file ICING_ASSERT_OK_AND_ASSIGN( FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name_, &filesystem_, /*in_memory=*/false)); + FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get(), + /*in_memory=*/false)); { // 1. Request a PL that is 1/2 block size. Remember that block size also @@ -261,9 +272,10 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) { Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100), Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)}; for (const Hit& hit : hits1) { - ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit)); + ICING_ASSERT_OK( + serializer_->PrependHit(&posting_list_holder1.posting_list, hit)); } - EXPECT_THAT(posting_list_holder1.posting_list.GetHits(), + EXPECT_THAT(serializer_->GetHits(&posting_list_holder1.posting_list), IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); // 2. Get another PL. This should be on the same flash block. 
There should @@ -284,9 +296,10 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) { Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100), Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)}; for (const Hit& hit : hits2) { - ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit)); + ICING_ASSERT_OK( + serializer_->PrependHit(&posting_list_holder2.posting_list, hit)); } - EXPECT_THAT(posting_list_holder2.posting_list.GetHits(), + EXPECT_THAT(serializer_->GetHits(&posting_list_holder2.posting_list), IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend()))); // 3. Now, free the first posting list. This should add it to the free list @@ -310,7 +323,7 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) { EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index())); // Make sure this pl is empty. The hits that used to be there should be // gone. - EXPECT_THAT(posting_list_holder3.posting_list.GetHits(), + EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list), IsOkAndHolds(IsEmpty())); std::vector<Hit> hits3 = { Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62), @@ -318,9 +331,10 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) { Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12), Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)}; for (const Hit& hit : hits3) { - ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit)); + ICING_ASSERT_OK( + serializer_->PrependHit(&posting_list_holder3.posting_list, hit)); } - EXPECT_THAT(posting_list_holder3.posting_list.GetHits(), + EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list), IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend()))); } EXPECT_THAT(flash_index_storage.GetDiskUsage(), @@ -334,17 +348,18 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) { // Create the header file ICING_ASSERT_OK_AND_ASSIGN( FlashIndexStorage flash_index_storage, - 
FlashIndexStorage::Create(file_name_, &filesystem_)); + FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get())); { // 1. Request a PL that is 1/2 block size. Remember that block size also // includes the BlockHeader. The BlockHeader isn't publicly visible, so we // subtract 100 bytes to be sure. AllocatePostingList will round up from // kHalfBlockPostingListSize to whatever the correct size is. - half_block_posting_list_size = (flash_index_storage.block_size() - 100) / 2; - ICING_ASSERT_OK_AND_ASSIGN( - PostingListHolder posting_list_holder1, - flash_index_storage.AllocatePostingList(half_block_posting_list_size)); + half_block_posting_list_size = + (flash_index_storage.block_size() - 100) / 2; + ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder posting_list_holder1, + flash_index_storage.AllocatePostingList( + half_block_posting_list_size)); // We expect: // 1. FlashIndexStorage will return a valid id. id1 = posting_list_holder1.id; @@ -359,16 +374,17 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) { Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100), Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)}; for (const Hit& hit : hits1) { - ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit)); + ICING_ASSERT_OK( + serializer_->PrependHit(&posting_list_holder1.posting_list, hit)); } - EXPECT_THAT(posting_list_holder1.posting_list.GetHits(), + EXPECT_THAT(serializer_->GetHits(&posting_list_holder1.posting_list), IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); // 2. Get another PL. This should be on the same flash block. There should // be no allocation. - ICING_ASSERT_OK_AND_ASSIGN( - PostingListHolder posting_list_holder2, - flash_index_storage.AllocatePostingList(half_block_posting_list_size)); + ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder posting_list_holder2, + flash_index_storage.AllocatePostingList( + half_block_posting_list_size)); // We expect: // 1. 
FlashIndexStorage will return a valid id. EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue()); @@ -382,17 +398,19 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) { Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100), Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)}; for (const Hit& hit : hits2) { - ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit)); + ICING_ASSERT_OK( + serializer_->PrependHit(&posting_list_holder2.posting_list, hit)); } - EXPECT_THAT(posting_list_holder2.posting_list.GetHits(), + EXPECT_THAT(serializer_->GetHits(&posting_list_holder2.posting_list), IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend()))); - // 3. Now, free the first posting list. This should add it to the free list + // 3. Now, free the first posting list. This should add it to the free + // list flash_index_storage.FreePostingList(std::move(posting_list_holder1)); } EXPECT_THAT(flash_index_storage.GetDiskUsage(), - Eq(2 * flash_index_storage.block_size())); + Eq(2 * flash_index_storage.block_size())); // 4. The FlashIndexStorage should go out of scope and flush the in-memory // posting list to disk } @@ -401,14 +419,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) { // Recreate the flash index. ICING_ASSERT_OK_AND_ASSIGN( FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name_, &filesystem_)); + FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get())); { // 5. Request another posting list. This should NOT grow the index because // the first posting list is free. - ICING_ASSERT_OK_AND_ASSIGN( - PostingListHolder posting_list_holder3, - flash_index_storage.AllocatePostingList(half_block_posting_list_size)); + ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder posting_list_holder3, + flash_index_storage.AllocatePostingList( + half_block_posting_list_size)); // We expect: // 1. FlashIndexStorage will return a valid id. 
EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue()); @@ -422,7 +440,7 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) { EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index())); // Make sure this pl is empty. The hits that used to be there should be // gone. - EXPECT_THAT(posting_list_holder3.posting_list.GetHits(), + EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list), IsOkAndHolds(IsEmpty())); std::vector<Hit> hits3 = { Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62), @@ -430,13 +448,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) { Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12), Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)}; for (const Hit& hit : hits3) { - ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit)); + ICING_ASSERT_OK( + serializer_->PrependHit(&posting_list_holder3.posting_list, hit)); } - EXPECT_THAT(posting_list_holder3.posting_list.GetHits(), + EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list), IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend()))); } EXPECT_THAT(flash_index_storage.GetDiskUsage(), - Eq(2 * flash_index_storage.block_size())); + Eq(2 * flash_index_storage.block_size())); } } @@ -444,7 +463,7 @@ TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) { // Create the header file ICING_ASSERT_OK_AND_ASSIGN( FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name_, &filesystem_)); + FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get())); { // 1. Request a PL that is 1/2 block size. Remember that block size also // includes the BlockHeader. 
The BlockHeader isn't publicly visible, so we @@ -471,9 +490,10 @@ TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) { Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100), Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)}; for (const Hit& hit : hits1) { - ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit)); + ICING_ASSERT_OK( + serializer_->PrependHit(&posting_list_holder1.posting_list, hit)); } - EXPECT_THAT(posting_list_holder1.posting_list.GetHits(), + EXPECT_THAT(serializer_->GetHits(&posting_list_holder1.posting_list), IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); // 2. Get a PL that is 1/4 block size. Even though a 1/4 block PL could @@ -497,9 +517,10 @@ TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) { Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100), Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)}; for (const Hit& hit : hits2) { - ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit)); + ICING_ASSERT_OK( + serializer_->PrependHit(&posting_list_holder2.posting_list, hit)); } - EXPECT_THAT(posting_list_holder2.posting_list.GetHits(), + EXPECT_THAT(serializer_->GetHits(&posting_list_holder2.posting_list), IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend()))); // 3. Request another 1/4 block-size posting list. This should NOT grow the @@ -526,7 +547,7 @@ TEST_F(FlashIndexStorageTest, AllocateTooLargePostingList) { // Create the header file ICING_ASSERT_OK_AND_ASSIGN( FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name_, &filesystem_)); + FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get())); // Request a PL that is 2x block size. 
const int kDoubleBlockSize = flash_index_storage.block_size() * 2; diff --git a/icing/index/main/index-block.cc b/icing/file/posting_list/index-block.cc index fe989c7..1b9982e 100644 --- a/icing/index/main/index-block.cc +++ b/icing/file/posting_list/index-block.cc @@ -12,19 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/index/main/index-block.h" +#include "icing/file/posting_list/index-block.h" -#include <algorithm> -#include <cinttypes> -#include <limits> +#include <sys/types.h> + +#include <cstdint> +#include <memory> +#include <string_view> #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/file/memory-mapped-file.h" -#include "icing/index/main/posting-list-free.h" -#include "icing/index/main/posting-list-utils.h" +#include "icing/file/posting_list/posting-list-common.h" +#include "icing/file/posting_list/posting-list-free.h" +#include "icing/file/posting_list/posting-list-utils.h" #include "icing/legacy/core/icing-string-util.h" -#include "icing/util/math-util.h" +#include "icing/util/logging.h" #include "icing/util/status-macros.h" namespace icing { @@ -32,35 +35,30 @@ namespace lib { namespace { -libtextclassifier3::Status ValidatePostingListBytes(uint32_t posting_list_bytes, - uint32_t block_size) { - if (posting_list_bytes > - IndexBlock::CalculateMaxPostingListBytes(block_size) || - !posting_list_utils::IsValidPostingListSize(posting_list_bytes)) { +libtextclassifier3::Status ValidatePostingListBytes( + PostingListUsedSerializer* serializer, uint32_t posting_list_bytes, + uint32_t block_size) { + if (posting_list_bytes > IndexBlock::CalculateMaxPostingListBytes( + block_size, serializer->GetDataTypeBytes()) || + !posting_list_utils::IsValidPostingListSize( + posting_list_bytes, serializer->GetDataTypeBytes(), + serializer->GetMinPostingListSize())) { return 
absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Requested posting list size %d is illegal for a flash block with max " "posting list size of %d", posting_list_bytes, - IndexBlock::CalculateMaxPostingListBytes(block_size))); + IndexBlock::CalculateMaxPostingListBytes( + block_size, serializer->GetDataTypeBytes()))); } return libtextclassifier3::Status::OK; } } // namespace -uint32_t IndexBlock::ApproximateFullPostingListHitsForBlock( - uint32_t block_size, int posting_list_index_bits) { - // Assume 50% compressed and most don't have term frequencies. - uint32_t bytes_per_hit = sizeof(Hit::Value) / 2; - return (block_size - sizeof(BlockHeader)) / - ((1u << posting_list_index_bits) * bytes_per_hit); -} - libtextclassifier3::StatusOr<IndexBlock> -IndexBlock::CreateFromPreexistingIndexBlockRegion(const Filesystem& filesystem, - std::string_view file_path, - off_t offset, - uint32_t block_size) { +IndexBlock::CreateFromPreexistingIndexBlockRegion( + const Filesystem& filesystem, std::string_view file_path, + PostingListUsedSerializer* serializer, off_t offset, uint32_t block_size) { if (block_size < sizeof(BlockHeader)) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Provided block_size %d is too small to fit even the BlockHeader!", @@ -71,15 +69,16 @@ IndexBlock::CreateFromPreexistingIndexBlockRegion(const Filesystem& filesystem, filesystem, file_path, MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); ICING_RETURN_IF_ERROR(mmapped_file.Remap(offset, block_size)); - IndexBlock block(std::move(mmapped_file)); - ICING_RETURN_IF_ERROR( - ValidatePostingListBytes(block.get_posting_list_bytes(), block_size)); + IndexBlock block(serializer, std::move(mmapped_file)); + ICING_RETURN_IF_ERROR(ValidatePostingListBytes( + serializer, block.get_posting_list_bytes(), block_size)); return block; } libtextclassifier3::StatusOr<IndexBlock> IndexBlock::CreateFromUninitializedRegion(const Filesystem& filesystem, std::string_view file_path, + 
PostingListUsedSerializer* serializer, off_t offset, uint32_t block_size, uint32_t posting_list_bytes) { if (block_size < sizeof(BlockHeader)) { @@ -88,13 +87,13 @@ IndexBlock::CreateFromUninitializedRegion(const Filesystem& filesystem, block_size)); } ICING_RETURN_IF_ERROR( - ValidatePostingListBytes(posting_list_bytes, block_size)); + ValidatePostingListBytes(serializer, posting_list_bytes, block_size)); ICING_ASSIGN_OR_RETURN(MemoryMappedFile mmapped_file, MemoryMappedFile::Create( filesystem, file_path, MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); ICING_RETURN_IF_ERROR(mmapped_file.Remap(offset, block_size)); - IndexBlock block(std::move(mmapped_file)); + IndexBlock block(serializer, std::move(mmapped_file)); // Safe to ignore the return value of Reset. Reset returns an error if // posting_list_bytes is invalid, but this function ensures that // posting_list_bytes is valid thanks to the call to ValidatePostingListBytes @@ -103,17 +102,19 @@ IndexBlock::CreateFromUninitializedRegion(const Filesystem& filesystem, return block; } -IndexBlock::IndexBlock(MemoryMappedFile&& mmapped_block) +IndexBlock::IndexBlock(PostingListUsedSerializer* serializer, + MemoryMappedFile&& mmapped_block) : header_(reinterpret_cast<BlockHeader*>(mmapped_block.mutable_region())), posting_lists_start_ptr_(mmapped_block.mutable_region() + sizeof(BlockHeader)), block_size_in_bytes_(mmapped_block.region_size()), + serializer_(serializer), mmapped_block_( std::make_unique<MemoryMappedFile>(std::move(mmapped_block))) {} libtextclassifier3::Status IndexBlock::Reset(int posting_list_bytes) { ICING_RETURN_IF_ERROR(ValidatePostingListBytes( - posting_list_bytes, mmapped_block_->region_size())); + serializer_, posting_list_bytes, mmapped_block_->region_size())); header_->free_list_posting_list_index = kInvalidPostingListIndex; header_->next_block_index = kInvalidBlockIndex; header_->posting_list_bytes = posting_list_bytes; @@ -140,7 +141,8 @@ 
IndexBlock::GetAllocatedPostingList(PostingListIndex posting_list_index) { posting_list_index, max_num_posting_lists())); } return PostingListUsed::CreateFromPreexistingPostingListUsedRegion( - get_posting_list_ptr(posting_list_index), get_posting_list_bytes()); + serializer_, get_posting_list_ptr(posting_list_index), + get_posting_list_bytes()); } libtextclassifier3::StatusOr<PostingListIndex> @@ -159,7 +161,9 @@ IndexBlock::AllocatePostingList() { // always return OK and ValueOrDie is safe to call. auto posting_list_or = PostingListFree::CreateFromPreexistingPostingListFreeRegion( - get_posting_list_ptr(posting_list_index), get_posting_list_bytes()); + get_posting_list_ptr(posting_list_index), get_posting_list_bytes(), + serializer_->GetDataTypeBytes(), + serializer_->GetMinPostingListSize()); PostingListFree plfree = std::move(posting_list_or).ValueOrDie(); header_->free_list_posting_list_index = plfree.get_next_posting_list_index(); @@ -172,7 +176,8 @@ IndexBlock::AllocatePostingList() { // Make it a used posting list. PostingListUsed::CreateFromUnitializedRegion( - get_posting_list_ptr(posting_list_index), get_posting_list_bytes()); + serializer_, get_posting_list_ptr(posting_list_index), + get_posting_list_bytes()); return posting_list_index; } @@ -188,7 +193,8 @@ void IndexBlock::FreePostingList(PostingListIndex posting_list_index) { // So CreateFromUninitializedRegion will always return OK and ValueOrDie is // safe to call. auto posting_list_or = PostingListFree::CreateFromUnitializedRegion( - get_posting_list_ptr(posting_list_index), get_posting_list_bytes()); + get_posting_list_ptr(posting_list_index), get_posting_list_bytes(), + serializer_->GetDataTypeBytes(), serializer_->GetMinPostingListSize()); PostingListFree plfree = std::move(posting_list_or).ValueOrDie(); // Put at the head of the list. 
diff --git a/icing/index/main/index-block.h b/icing/file/posting_list/index-block.h index 8a7aa16..589f155 100644 --- a/icing/index/main/index-block.h +++ b/icing/file/posting_list/index-block.h @@ -12,30 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_INDEX_MAIN_INDEX_BLOCK_H_ -#define ICING_INDEX_MAIN_INDEX_BLOCK_H_ +#ifndef ICING_FILE_POSTING_LIST_INDEX_BLOCK_H_ +#define ICING_FILE_POSTING_LIST_INDEX_BLOCK_H_ -#include <sys/mman.h> +#include <sys/types.h> -#include <algorithm> -#include <cstring> -#include <limits> +#include <cstdint> #include <memory> -#include <string> -#include <unordered_set> -#include <vector> +#include <string_view> #include "icing/file/memory-mapped-file.h" -#include "icing/index/hit/hit.h" -#include "icing/index/main/posting-list-free.h" -#include "icing/index/main/posting-list-used.h" +#include "icing/file/posting_list/posting-list-common.h" +#include "icing/file/posting_list/posting-list-used.h" #include "icing/legacy/index/icing-bit-util.h" namespace icing { namespace lib { -inline constexpr uint32_t kInvalidBlockIndex = 0; - // This class is used to manage I/O to a single flash block and to manage the // division of that flash block into PostingLists. It provides an interface to // allocate, free and read posting lists. @@ -51,17 +44,12 @@ class IndexBlock { public: // What is the maximum posting list size in bytes that can be stored in this // block. 
- static uint32_t CalculateMaxPostingListBytes(uint32_t block_size_in_bytes) { - return (block_size_in_bytes - sizeof(BlockHeader)) / sizeof(Hit) * - sizeof(Hit); + static uint32_t CalculateMaxPostingListBytes(uint32_t block_size_in_bytes, + uint32_t data_type_bytes) { + return (block_size_in_bytes - sizeof(BlockHeader)) / data_type_bytes * + data_type_bytes; } - // For a given min number of bits needed to store PostingListIndex for a - // block of "block_size", return the approximate number of hits that a full - // posting list in this block could accomodate. - static uint32_t ApproximateFullPostingListHitsForBlock( - uint32_t block_size, int posting_list_index_bits); - // Create an IndexBlock to reference the previously used region of the // mmapped_file starting at offset with size block_size // @@ -74,6 +62,7 @@ class IndexBlock { static libtextclassifier3::StatusOr<IndexBlock> CreateFromPreexistingIndexBlockRegion(const Filesystem& filesystem, std::string_view file_path, + PostingListUsedSerializer* serializer, off_t offset, uint32_t block_size); // Create an IndexBlock to reference an uninitialized region of the @@ -88,8 +77,9 @@ class IndexBlock { // max_posting_list_bytes(size). // - INTERNAL_ERROR if unable to mmap the region [offset, offset+block_size) static libtextclassifier3::StatusOr<IndexBlock> CreateFromUninitializedRegion( - const Filesystem& filesystem, std::string_view file_path, off_t offset, - uint32_t block_size, uint32_t posting_list_bytes); + const Filesystem& filesystem, std::string_view file_path, + PostingListUsedSerializer* serializer, off_t offset, uint32_t block_size, + uint32_t posting_list_bytes); IndexBlock(const IndexBlock&) = delete; IndexBlock& operator=(const IndexBlock&) = delete; @@ -174,7 +164,8 @@ class IndexBlock { private: // Assumes that mmapped_file already has established a valid mapping to the // requested block. 
- explicit IndexBlock(MemoryMappedFile&& mmapped_block); + explicit IndexBlock(PostingListUsedSerializer* serializer, + MemoryMappedFile&& mmapped_block); // Resets IndexBlock to hold posting lists of posting_list_bytes size and adds // all posting lists to the free list. @@ -212,6 +203,8 @@ class IndexBlock { char* posting_lists_start_ptr_; uint32_t block_size_in_bytes_; + PostingListUsedSerializer* serializer_; // Does not own. + // MemoryMappedFile used to interact with the underlying flash block. std::unique_ptr<MemoryMappedFile> mmapped_block_; }; @@ -219,4 +212,4 @@ class IndexBlock { } // namespace lib } // namespace icing -#endif // ICING_INDEX_MAIN_INDEX_BLOCK_H_ +#endif // ICING_FILE_POSTING_LIST_INDEX_BLOCK_H_ diff --git a/icing/index/main/index-block_test.cc b/icing/file/posting_list/index-block_test.cc index 322918d..775858d 100644 --- a/icing/index/main/index-block_test.cc +++ b/icing/file/posting_list/index-block_test.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "icing/index/main/index-block.h" +#include "icing/file/posting_list/index-block.h" #include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/file/filesystem.h" -#include "icing/file/memory-mapped-file.h" -#include "icing/index/main/posting-list-used.h" +#include "icing/file/posting_list/posting-list-used.h" +#include "icing/index/main/posting-list-used-hit-serializer.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" @@ -28,53 +28,57 @@ namespace lib { namespace { +using ::testing::ElementsAreArray; +using ::testing::Eq; + static constexpr int kBlockSize = 4096; -bool CreateFileWithSize(const Filesystem& filesystem, const std::string& file, - int size) { - size_t parent_dir_end = file.find_last_of('/'); - if (parent_dir_end == std::string::npos) { - return false; +class IndexBlockTest : public ::testing::Test { + protected: + void SetUp() override { + test_dir_ = GetTestTempDir() + "/flash"; + flash_file_ = test_dir_ + "/0"; + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str())); + + // Grow the file by one block for the IndexBlock to use. 
+ ASSERT_TRUE(filesystem_.Grow(flash_file_.c_str(), kBlockSize)); + + // TODO: test different serializers + serializer_ = std::make_unique<PostingListUsedHitSerializer>(); } - std::string file_dir = file.substr(0, parent_dir_end); - return filesystem.CreateDirectoryRecursively(file_dir.c_str()) && - filesystem.Grow(file.c_str(), size); -} -using ::testing::ElementsAreArray; -using ::testing::Eq; + void TearDown() override { + serializer_.reset(); + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str())); + } -TEST(IndexBlockTest, CreateFromUninitializedRegionProducesEmptyBlock) { - constexpr int kPostingListBytes = 20; + std::string test_dir_; + std::string flash_file_; + Filesystem filesystem_; + std::unique_ptr<PostingListUsedHitSerializer> serializer_; +}; - Filesystem filesystem; - std::string flash_file = GetTestTempDir() + "/flash/0"; - // Grow the file by one block for the IndexBlock to use. - ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize)); +TEST_F(IndexBlockTest, CreateFromUninitializedRegionProducesEmptyBlock) { + constexpr int kPostingListBytes = 20; { // Create an IndexBlock from this newly allocated file block. ICING_ASSERT_OK_AND_ASSIGN( IndexBlock block, IndexBlock::CreateFromUninitializedRegion( - filesystem, flash_file, /*offset=*/0, kBlockSize, - kPostingListBytes)); + filesystem_, flash_file_, serializer_.get(), + /*offset=*/0, kBlockSize, kPostingListBytes)); EXPECT_TRUE(block.has_free_posting_lists()); } } -TEST(IndexBlockTest, SizeAccessorsWorkCorrectly) { +TEST_F(IndexBlockTest, SizeAccessorsWorkCorrectly) { constexpr int kPostingListBytes1 = 20; - Filesystem filesystem; - std::string flash_file = GetTestTempDir() + "/flash/0"; - // Grow the file by one block for the IndexBlock to use. - ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize)); - // Create an IndexBlock from this newly allocated file block. 
- ICING_ASSERT_OK_AND_ASSIGN( - IndexBlock block, IndexBlock::CreateFromUninitializedRegion( - filesystem, flash_file, /*offset=*/0, kBlockSize, - kPostingListBytes1)); + ICING_ASSERT_OK_AND_ASSIGN(IndexBlock block, + IndexBlock::CreateFromUninitializedRegion( + filesystem_, flash_file_, serializer_.get(), + /*offset=*/0, kBlockSize, kPostingListBytes1)); EXPECT_THAT(block.get_posting_list_bytes(), Eq(kPostingListBytes1)); // There should be (4096 - 12) / 20 = 204 posting lists // (sizeof(BlockHeader)==12). We can store a PostingListIndex of 203 in only 8 @@ -85,9 +89,10 @@ TEST(IndexBlockTest, SizeAccessorsWorkCorrectly) { constexpr int kPostingListBytes2 = 200; // Create an IndexBlock from this newly allocated file block. - ICING_ASSERT_OK_AND_ASSIGN(block, IndexBlock::CreateFromUninitializedRegion( - filesystem, flash_file, /*offset=*/0, - kBlockSize, kPostingListBytes2)); + ICING_ASSERT_OK_AND_ASSIGN( + block, IndexBlock::CreateFromUninitializedRegion( + filesystem_, flash_file_, serializer_.get(), /*offset=*/0, + kBlockSize, kPostingListBytes2)); EXPECT_THAT(block.get_posting_list_bytes(), Eq(kPostingListBytes2)); // There should be (4096 - 12) / 200 = 20 posting lists // (sizeof(BlockHeader)==12). We can store a PostingListIndex of 19 in only 5 @@ -96,14 +101,9 @@ TEST(IndexBlockTest, SizeAccessorsWorkCorrectly) { EXPECT_THAT(block.posting_list_index_bits(), Eq(5)); } -TEST(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) { +TEST_F(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) { constexpr int kPostingListBytes = 2000; - Filesystem filesystem; - std::string flash_file = GetTestTempDir() + "/flash/0"; - // Grow the file by one block for the IndexBlock to use. 
- ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize)); - std::vector<Hit> test_hits{ Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency), Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency), @@ -116,7 +116,7 @@ TEST(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) { // Create an IndexBlock from this newly allocated file block. ICING_ASSERT_OK_AND_ASSIGN( IndexBlock block, IndexBlock::CreateFromUninitializedRegion( - filesystem, flash_file, + filesystem_, flash_file_, serializer_.get(), /*offset=*/0, /*block_size=*/kBlockSize, kPostingListBytes)); // Add hits to the first posting list. @@ -124,33 +124,30 @@ TEST(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) { ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used, block.GetAllocatedPostingList(allocated_index)); for (const Hit& hit : test_hits) { - ICING_ASSERT_OK(pl_used.PrependHit(hit)); + ICING_ASSERT_OK(serializer_->PrependHit(&pl_used, hit)); } - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray( - test_hits.rbegin(), test_hits.rend()))); + EXPECT_THAT( + serializer_->GetHits(&pl_used), + IsOkAndHolds(ElementsAreArray(test_hits.rbegin(), test_hits.rend()))); } { // Create an IndexBlock from the previously allocated file block. 
ICING_ASSERT_OK_AND_ASSIGN( - IndexBlock block, - IndexBlock::CreateFromPreexistingIndexBlockRegion( - filesystem, flash_file, /*offset=*/0, kBlockSize)); + IndexBlock block, IndexBlock::CreateFromPreexistingIndexBlockRegion( + filesystem_, flash_file_, serializer_.get(), + /*offset=*/0, kBlockSize)); ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used, block.GetAllocatedPostingList(allocated_index)); - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray( - test_hits.rbegin(), test_hits.rend()))); + EXPECT_THAT( + serializer_->GetHits(&pl_used), + IsOkAndHolds(ElementsAreArray(test_hits.rbegin(), test_hits.rend()))); EXPECT_TRUE(block.has_free_posting_lists()); } } -TEST(IndexBlockTest, IndexBlockMultiplePostingLists) { +TEST_F(IndexBlockTest, IndexBlockMultiplePostingLists) { constexpr int kPostingListBytes = 2000; - Filesystem filesystem; - std::string flash_file = GetTestTempDir() + "/flash/0"; - // Grow the file by one block for the IndexBlock to use. - ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize)); - std::vector<Hit> hits_in_posting_list1{ Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency), Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency), @@ -171,8 +168,8 @@ TEST(IndexBlockTest, IndexBlockMultiplePostingLists) { // Create an IndexBlock from this newly allocated file block. ICING_ASSERT_OK_AND_ASSIGN( IndexBlock block, IndexBlock::CreateFromUninitializedRegion( - filesystem, flash_file, /*offset=*/0, kBlockSize, - kPostingListBytes)); + filesystem_, flash_file_, serializer_.get(), + /*offset=*/0, kBlockSize, kPostingListBytes)); // Add hits to the first posting list. 
ICING_ASSERT_OK_AND_ASSIGN(allocated_index_1, block.AllocatePostingList()); @@ -180,9 +177,9 @@ TEST(IndexBlockTest, IndexBlockMultiplePostingLists) { PostingListUsed pl_used_1, block.GetAllocatedPostingList(allocated_index_1)); for (const Hit& hit : hits_in_posting_list1) { - ICING_ASSERT_OK(pl_used_1.PrependHit(hit)); + ICING_ASSERT_OK(serializer_->PrependHit(&pl_used_1, hit)); } - EXPECT_THAT(pl_used_1.GetHits(), + EXPECT_THAT(serializer_->GetHits(&pl_used_1), IsOkAndHolds(ElementsAreArray(hits_in_posting_list1.rbegin(), hits_in_posting_list1.rend()))); @@ -192,9 +189,9 @@ TEST(IndexBlockTest, IndexBlockMultiplePostingLists) { PostingListUsed pl_used_2, block.GetAllocatedPostingList(allocated_index_2)); for (const Hit& hit : hits_in_posting_list2) { - ICING_ASSERT_OK(pl_used_2.PrependHit(hit)); + ICING_ASSERT_OK(serializer_->PrependHit(&pl_used_2, hit)); } - EXPECT_THAT(pl_used_2.GetHits(), + EXPECT_THAT(serializer_->GetHits(&pl_used_2), IsOkAndHolds(ElementsAreArray(hits_in_posting_list2.rbegin(), hits_in_posting_list2.rend()))); @@ -205,19 +202,19 @@ TEST(IndexBlockTest, IndexBlockMultiplePostingLists) { { // Create an IndexBlock from the previously allocated file block. 
ICING_ASSERT_OK_AND_ASSIGN( - IndexBlock block, - IndexBlock::CreateFromPreexistingIndexBlockRegion( - filesystem, flash_file, /*offset=*/0, kBlockSize)); + IndexBlock block, IndexBlock::CreateFromPreexistingIndexBlockRegion( + filesystem_, flash_file_, serializer_.get(), + /*offset=*/0, kBlockSize)); ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used_1, block.GetAllocatedPostingList(allocated_index_1)); - EXPECT_THAT(pl_used_1.GetHits(), + EXPECT_THAT(serializer_->GetHits(&pl_used_1), IsOkAndHolds(ElementsAreArray(hits_in_posting_list1.rbegin(), hits_in_posting_list1.rend()))); ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used_2, block.GetAllocatedPostingList(allocated_index_2)); - EXPECT_THAT(pl_used_2.GetHits(), + EXPECT_THAT(serializer_->GetHits(&pl_used_2), IsOkAndHolds(ElementsAreArray(hits_in_posting_list2.rbegin(), hits_in_posting_list2.rend()))); EXPECT_THAT(block.AllocatePostingList(), @@ -226,19 +223,14 @@ TEST(IndexBlockTest, IndexBlockMultiplePostingLists) { } } -TEST(IndexBlockTest, IndexBlockReallocatingPostingLists) { +TEST_F(IndexBlockTest, IndexBlockReallocatingPostingLists) { constexpr int kPostingListBytes = 2000; - Filesystem filesystem; - std::string flash_file = GetTestTempDir() + "/flash/0"; - // Grow the file by one block for the IndexBlock to use. - ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize)); - // Create an IndexBlock from this newly allocated file block. - ICING_ASSERT_OK_AND_ASSIGN( - IndexBlock block, - IndexBlock::CreateFromUninitializedRegion( - filesystem, flash_file, /*offset=*/0, kBlockSize, kPostingListBytes)); + ICING_ASSERT_OK_AND_ASSIGN(IndexBlock block, + IndexBlock::CreateFromUninitializedRegion( + filesystem_, flash_file_, serializer_.get(), + /*offset=*/0, kBlockSize, kPostingListBytes)); // Add hits to the first posting list. 
std::vector<Hit> hits_in_posting_list1{ @@ -253,9 +245,9 @@ TEST(IndexBlockTest, IndexBlockReallocatingPostingLists) { ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used_1, block.GetAllocatedPostingList(allocated_index_1)); for (const Hit& hit : hits_in_posting_list1) { - ICING_ASSERT_OK(pl_used_1.PrependHit(hit)); + ICING_ASSERT_OK(serializer_->PrependHit(&pl_used_1, hit)); } - EXPECT_THAT(pl_used_1.GetHits(), + EXPECT_THAT(serializer_->GetHits(&pl_used_1), IsOkAndHolds(ElementsAreArray(hits_in_posting_list1.rbegin(), hits_in_posting_list1.rend()))); @@ -272,9 +264,9 @@ TEST(IndexBlockTest, IndexBlockReallocatingPostingLists) { ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used_2, block.GetAllocatedPostingList(allocated_index_2)); for (const Hit& hit : hits_in_posting_list2) { - ICING_ASSERT_OK(pl_used_2.PrependHit(hit)); + ICING_ASSERT_OK(serializer_->PrependHit(&pl_used_2, hit)); } - EXPECT_THAT(pl_used_2.GetHits(), + EXPECT_THAT(serializer_->GetHits(&pl_used_2), IsOkAndHolds(ElementsAreArray(hits_in_posting_list2.rbegin(), hits_in_posting_list2.rend()))); @@ -298,9 +290,9 @@ TEST(IndexBlockTest, IndexBlockReallocatingPostingLists) { ICING_ASSERT_OK_AND_ASSIGN(pl_used_1, block.GetAllocatedPostingList(allocated_index_3)); for (const Hit& hit : hits_in_posting_list3) { - ICING_ASSERT_OK(pl_used_1.PrependHit(hit)); + ICING_ASSERT_OK(serializer_->PrependHit(&pl_used_1, hit)); } - EXPECT_THAT(pl_used_1.GetHits(), + EXPECT_THAT(serializer_->GetHits(&pl_used_1), IsOkAndHolds(ElementsAreArray(hits_in_posting_list3.rbegin(), hits_in_posting_list3.rend()))); EXPECT_THAT(block.AllocatePostingList(), @@ -308,22 +300,17 @@ TEST(IndexBlockTest, IndexBlockReallocatingPostingLists) { EXPECT_FALSE(block.has_free_posting_lists()); } -TEST(IndexBlockTest, IndexBlockNextBlockIndex) { +TEST_F(IndexBlockTest, IndexBlockNextBlockIndex) { constexpr int kPostingListBytes = 2000; constexpr int kSomeBlockIndex = 22; - Filesystem filesystem; - std::string flash_file = GetTestTempDir() + 
"/flash/0"; - // Grow the file by one block for the IndexBlock to use. - ASSERT_TRUE(CreateFileWithSize(filesystem, flash_file, kBlockSize)); - { // Create an IndexBlock from this newly allocated file block and set the // next block index. ICING_ASSERT_OK_AND_ASSIGN( IndexBlock block, IndexBlock::CreateFromUninitializedRegion( - filesystem, flash_file, /*offset=*/0, kBlockSize, - kPostingListBytes)); + filesystem_, flash_file_, serializer_.get(), + /*offset=*/0, kBlockSize, kPostingListBytes)); EXPECT_THAT(block.next_block_index(), Eq(kInvalidBlockIndex)); block.set_next_block_index(kSomeBlockIndex); EXPECT_THAT(block.next_block_index(), Eq(kSomeBlockIndex)); @@ -332,9 +319,9 @@ TEST(IndexBlockTest, IndexBlockNextBlockIndex) { // Create an IndexBlock from this previously allocated file block and make // sure that next_block_index is still set properly. ICING_ASSERT_OK_AND_ASSIGN( - IndexBlock block, - IndexBlock::CreateFromPreexistingIndexBlockRegion( - filesystem, flash_file, /*offset=*/0, kBlockSize)); + IndexBlock block, IndexBlock::CreateFromPreexistingIndexBlockRegion( + filesystem_, flash_file_, serializer_.get(), + /*offset=*/0, kBlockSize)); EXPECT_THAT(block.next_block_index(), Eq(kSomeBlockIndex)); } { @@ -342,8 +329,8 @@ TEST(IndexBlockTest, IndexBlockNextBlockIndex) { // reset the next_block_index to kInvalidBlockIndex. 
ICING_ASSERT_OK_AND_ASSIGN( IndexBlock block, IndexBlock::CreateFromUninitializedRegion( - filesystem, flash_file, /*offset=*/0, kBlockSize, - kPostingListBytes)); + filesystem_, flash_file_, serializer_.get(), + /*offset=*/0, kBlockSize, kPostingListBytes)); EXPECT_THAT(block.next_block_index(), Eq(kInvalidBlockIndex)); } } diff --git a/icing/file/posting_list/posting-list-common.h b/icing/file/posting_list/posting-list-common.h new file mode 100644 index 0000000..cbe2ddf --- /dev/null +++ b/icing/file/posting_list/posting-list-common.h @@ -0,0 +1,35 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_COMMON_H_ +#define ICING_FILE_POSTING_LIST_POSTING_LIST_COMMON_H_ + +#include <cstdint> + +namespace icing { +namespace lib { + +// A FlashIndexBlock can contain multiple posting lists. This specifies which +// PostingList in the FlashIndexBlock we want to refer to. 
+using PostingListIndex = int32_t; +inline constexpr PostingListIndex kInvalidPostingListIndex = ~0U; + +inline constexpr uint32_t kNumSpecialData = 2; + +inline constexpr uint32_t kInvalidBlockIndex = 0; + +} // namespace lib +} // namespace icing + +#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_COMMON_H_ diff --git a/icing/index/main/posting-list-free.h b/icing/file/posting_list/posting-list-free.h index 75b99d7..073e344 100644 --- a/icing/index/main/posting-list-free.h +++ b/icing/file/posting_list/posting-list-free.h @@ -12,30 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_INDEX_MAIN_POSTING_LIST_FREE_H_ -#define ICING_INDEX_MAIN_POSTING_LIST_FREE_H_ - -#include <sys/mman.h> +#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_FREE_H_ +#define ICING_FILE_POSTING_LIST_POSTING_LIST_FREE_H_ #include <cstdint> #include <cstring> #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" -#include "icing/index/hit/hit.h" -#include "icing/index/main/posting-list-utils.h" +#include "icing/file/posting_list/posting-list-common.h" +#include "icing/file/posting_list/posting-list-utils.h" #include "icing/legacy/core/icing-string-util.h" -#include "icing/util/logging.h" #include "icing/util/status-macros.h" namespace icing { namespace lib { -// A FlashIndexBlock can contain multiple posting lists. This specifies which -// PostingList in the FlashIndexBlock we want to refer to. -using PostingListIndex = int32_t; -inline constexpr PostingListIndex kInvalidPostingListIndex = ~0U; - // A posting list in the index block's free list. // // We re-use the first sizeof(PostingListIndex) bytes of the posting list @@ -51,14 +43,17 @@ class PostingListFree { // // RETURNS: // - A valid PostingListFree on success - // - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size() - // || size_in_bytes % sizeof(Hit) != 0. 
+ // - INVALID_ARGUMENT if posting_list_utils::IsValidPostingListSize check + // fails // - FAILED_PRECONDITION if posting_list_buffer is null static libtextclassifier3::StatusOr<PostingListFree> - CreateFromPreexistingPostingListFreeRegion(void *posting_list_buffer, - uint32_t size_in_bytes) { + CreateFromPreexistingPostingListFreeRegion(void* posting_list_buffer, + uint32_t size_in_bytes, + uint32_t data_type_bytes, + uint32_t min_posting_list_size) { ICING_RETURN_ERROR_IF_NULL(posting_list_buffer); - if (!posting_list_utils::IsValidPostingListSize(size_in_bytes)) { + if (!posting_list_utils::IsValidPostingListSize( + size_in_bytes, data_type_bytes, min_posting_list_size)) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Requested posting list size %d is invalid!", size_in_bytes)); } @@ -74,15 +69,17 @@ class PostingListFree { // // RETURNS: // - A valid PostingListFree on success - // - INVALID_ARGUMENT if size_in_bytes < min_size() || size_in_bytes % - // sizeof(Hit) != 0. 
+ // - INVALID_ARGUMENT if posting_list_utils::IsValidPostingListSize check + // fails // - FAILED_PRECONDITION if posting_list_buffer is null static libtextclassifier3::StatusOr<PostingListFree> - CreateFromUnitializedRegion(void *posting_list_buffer, - uint32_t size_in_bytes) { + CreateFromUnitializedRegion(void* posting_list_buffer, uint32_t size_in_bytes, + uint32_t data_type_bytes, + uint32_t min_posting_list_size) { ICING_ASSIGN_OR_RETURN(PostingListFree posting_list_free, CreateFromPreexistingPostingListFreeRegion( - posting_list_buffer, size_in_bytes)); + posting_list_buffer, size_in_bytes, + data_type_bytes, min_posting_list_size)); posting_list_free.Clear(); return posting_list_free; } @@ -101,8 +98,8 @@ class PostingListFree { } private: - PostingListFree(void *posting_list_buffer, uint32_t size_in_bytes) - : posting_list_buffer_(static_cast<uint8_t *>(posting_list_buffer)), + explicit PostingListFree(void* posting_list_buffer, uint32_t size_in_bytes) + : posting_list_buffer_(static_cast<uint8_t*>(posting_list_buffer)), size_in_bytes_(size_in_bytes) {} // Reset the current free posting list as unchained free posting list so that @@ -114,16 +111,11 @@ class PostingListFree { // A byte array of size size_in_bytes_. The first sizeof(PostingListIndex) // bytes which will store the next posting list index, the rest are unused and // can be anything. 
- uint8_t *posting_list_buffer_; + uint8_t* posting_list_buffer_; [[maybe_unused]] uint32_t size_in_bytes_; - - static_assert(sizeof(PostingListIndex) <= - posting_list_utils::min_posting_list_size(), - "PostingListIndex must be small enough to fit in a " - "minimum-sized Posting List."); }; } // namespace lib } // namespace icing -#endif // ICING_INDEX_MAIN_POSTING_LIST_FREE_H_ +#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_FREE_H_ diff --git a/icing/index/main/posting-list-free_test.cc b/icing/file/posting_list/posting-list-free_test.cc index a152934..99e3cf5 100644 --- a/icing/index/main/posting-list-free_test.cc +++ b/icing/file/posting_list/posting-list-free_test.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/index/main/posting-list-free.h" +#include "icing/file/posting_list/posting-list-free.h" #include <cstdint> #include <memory> #include "icing/text_classifier/lib3/utils/base/status.h" #include "gtest/gtest.h" -#include "icing/index/main/posting-list-utils.h" +#include "icing/index/main/posting-list-used-hit-serializer.h" #include "icing/testing/common-matchers.h" namespace icing { @@ -27,55 +27,76 @@ namespace lib { namespace { +// TODO(b/249829533): test different serializers + TEST(PostingListTest, PostingListFree) { + PostingListUsedHitSerializer serializer; static const size_t kHitsSize = 2551 * sizeof(Hit); std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize); ICING_ASSERT_OK_AND_ASSIGN( PostingListFree pl_free, PostingListFree::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf.get()), kHitsSize)); + static_cast<void *>(hits_buf.get()), kHitsSize, + serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize())); EXPECT_EQ(pl_free.get_next_posting_list_index(), kInvalidPostingListIndex); } TEST(PostingListTest, PostingListTooSmallInvalidArgument) { - static const size_t kHitSizeTooSmall = - 
posting_list_utils::min_posting_list_size() - sizeof(Hit); + PostingListUsedHitSerializer serializer; + const size_t kHitSizeTooSmall = + serializer.GetMinPostingListSize() - sizeof(Hit); std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitSizeTooSmall); - EXPECT_THAT(PostingListFree::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf.get()), kHitSizeTooSmall), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT(PostingListFree::CreateFromPreexistingPostingListFreeRegion( - static_cast<void *>(hits_buf.get()), kHitSizeTooSmall), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT( + PostingListFree::CreateFromUnitializedRegion( + static_cast<void *>(hits_buf.get()), kHitSizeTooSmall, + serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT( + PostingListFree::CreateFromPreexistingPostingListFreeRegion( + static_cast<void *>(hits_buf.get()), kHitSizeTooSmall, + serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(PostingListTest, PostingListNotAlignedInvalidArgument) { - static const size_t kHitSizeNotAligned = - posting_list_utils::min_posting_list_size() + 1; + PostingListUsedHitSerializer serializer; + const size_t kHitSizeNotAligned = serializer.GetMinPostingListSize() + 1; std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitSizeNotAligned); - EXPECT_THAT(PostingListFree::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf.get()), kHitSizeNotAligned), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT(PostingListFree::CreateFromPreexistingPostingListFreeRegion( - static_cast<void *>(hits_buf.get()), kHitSizeNotAligned), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT( + PostingListFree::CreateFromUnitializedRegion( + static_cast<void 
*>(hits_buf.get()), kHitSizeNotAligned, + serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT( + PostingListFree::CreateFromPreexistingPostingListFreeRegion( + static_cast<void *>(hits_buf.get()), kHitSizeNotAligned, + serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(PostingListTest, PostingListNullBufferFailedPrecondition) { - static const size_t kHitSize = posting_list_utils::min_posting_list_size(); - EXPECT_THAT(PostingListFree::CreateFromUnitializedRegion( - /*posting_list_buffer=*/nullptr, kHitSize), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); - EXPECT_THAT(PostingListFree::CreateFromPreexistingPostingListFreeRegion( - /*posting_list_buffer=*/nullptr, kHitSize), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + PostingListUsedHitSerializer serializer; + const size_t kHitSize = serializer.GetMinPostingListSize(); + + // nullptr posting_list_buffer + EXPECT_THAT( + PostingListFree::CreateFromUnitializedRegion( + /*posting_list_buffer=*/nullptr, kHitSize, + serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT( + PostingListFree::CreateFromPreexistingPostingListFreeRegion( + /*posting_list_buffer=*/nullptr, kHitSize, + serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } TEST(PostingListTest, PostingListFreePreexistingRegion) { + PostingListUsedHitSerializer serializer; constexpr PostingListIndex kOtherPostingListIndex = 12; static const size_t kHitsSize = 2551 * sizeof(Hit); @@ -85,7 +106,8 @@ TEST(PostingListTest, PostingListFreePreexistingRegion) { ICING_ASSERT_OK_AND_ASSIGN( PostingListFree pl_free, PostingListFree::CreateFromUnitializedRegion( - static_cast<void 
*>(hits_buf.get()), kHitsSize)); + static_cast<void *>(hits_buf.get()), kHitsSize, + serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize())); pl_free.set_next_posting_list_index(kOtherPostingListIndex); EXPECT_EQ(pl_free.get_next_posting_list_index(), kOtherPostingListIndex); } @@ -95,12 +117,14 @@ TEST(PostingListTest, PostingListFreePreexistingRegion) { ICING_ASSERT_OK_AND_ASSIGN( PostingListFree pl_free, PostingListFree::CreateFromPreexistingPostingListFreeRegion( - static_cast<void *>(hits_buf.get()), kHitsSize)); + static_cast<void *>(hits_buf.get()), kHitsSize, + serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize())); EXPECT_EQ(pl_free.get_next_posting_list_index(), kOtherPostingListIndex); } } TEST(PostingListTest, PostingListFreeUninitializedRegion) { + PostingListUsedHitSerializer serializer; constexpr PostingListIndex kOtherPostingListIndex = 12; static const size_t kHitsSize = 2551 * sizeof(Hit); @@ -110,7 +134,8 @@ TEST(PostingListTest, PostingListFreeUninitializedRegion) { ICING_ASSERT_OK_AND_ASSIGN( PostingListFree pl_free, PostingListFree::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf.get()), kHitsSize)); + static_cast<void *>(hits_buf.get()), kHitsSize, + serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize())); pl_free.set_next_posting_list_index(kOtherPostingListIndex); EXPECT_EQ(pl_free.get_next_posting_list_index(), kOtherPostingListIndex); } @@ -120,7 +145,8 @@ TEST(PostingListTest, PostingListFreeUninitializedRegion) { ICING_ASSERT_OK_AND_ASSIGN( PostingListFree pl_free, PostingListFree::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf.get()), kHitsSize)); + static_cast<void *>(hits_buf.get()), kHitsSize, + serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize())); EXPECT_EQ(pl_free.get_next_posting_list_index(), kInvalidPostingListIndex); } } diff --git a/icing/index/main/posting-list-identifier.cc b/icing/file/posting_list/posting-list-identifier.cc index 1cdac65..4491c38 
100644 --- a/icing/index/main/posting-list-identifier.cc +++ b/icing/file/posting_list/posting-list-identifier.cc @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/index/main/posting-list-identifier.h" +#include "icing/file/posting_list/posting-list-identifier.h" + +#include "icing/file/posting_list/posting-list-common.h" namespace icing { namespace lib { diff --git a/icing/index/main/posting-list-identifier.h b/icing/file/posting_list/posting-list-identifier.h index 4953865..05c7ce5 100644 --- a/icing/index/main/posting-list-identifier.h +++ b/icing/file/posting_list/posting-list-identifier.h @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_INDEX_POSTING_LIST_IDENTIFIER_H_ -#define ICING_INDEX_POSTING_LIST_IDENTIFIER_H_ +#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_IDENTIFIER_H_ +#define ICING_FILE_POSTING_LIST_POSTING_LIST_IDENTIFIER_H_ -#include "icing/index/main/index-block.h" -#include "icing/index/main/posting-list-free.h" +#include <cstdint> + +#include "icing/file/posting_list/posting-list-common.h" #include "icing/legacy/index/icing-bit-util.h" namespace icing { @@ -62,9 +63,9 @@ class PostingListIdentifier { // 2. posting_list_index - the index of this posting list within the block // 3. posting_list_index_bits - the number of bits needed to encode the // largest posting_list_index that this block can have. 
- PostingListIdentifier(uint32_t block_index, - PostingListIndex posting_list_index, - int posting_list_index_bits) { + explicit PostingListIdentifier(uint32_t block_index, + PostingListIndex posting_list_index, + int posting_list_index_bits) { val_ = 0; BITFIELD_OR(val_, /*offset=*/0, /*len=*/posting_list_index_bits, /*val=*/static_cast<uint64_t>(posting_list_index)); @@ -113,4 +114,4 @@ class PostingListIdentifier { } // namespace lib } // namespace icing -#endif // ICING_INDEX_POSTING_LIST_IDENTIFIER_H_ +#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_IDENTIFIER_H_ diff --git a/icing/file/posting_list/posting-list-used.cc b/icing/file/posting_list/posting-list-used.cc new file mode 100644 index 0000000..370b9c7 --- /dev/null +++ b/icing/file/posting_list/posting-list-used.cc @@ -0,0 +1,56 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/file/posting_list/posting-list-used.h" + +#include <cstdint> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/file/posting_list/posting-list-utils.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +libtextclassifier3::StatusOr<PostingListUsed> +PostingListUsed::CreateFromPreexistingPostingListUsedRegion( + PostingListUsedSerializer* serializer, void* posting_list_buffer, + uint32_t size_in_bytes) { + ICING_RETURN_ERROR_IF_NULL(serializer); + ICING_RETURN_ERROR_IF_NULL(posting_list_buffer); + + if (!posting_list_utils::IsValidPostingListSize( + size_in_bytes, serializer->GetDataTypeBytes(), + serializer->GetMinPostingListSize())) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Requested posting list size %d is invalid!", size_in_bytes)); + } + return PostingListUsed(posting_list_buffer, size_in_bytes); +} + +libtextclassifier3::StatusOr<PostingListUsed> +PostingListUsed::CreateFromUnitializedRegion( + PostingListUsedSerializer* serializer, void* posting_list_buffer, + uint32_t size_in_bytes) { + ICING_ASSIGN_OR_RETURN(PostingListUsed posting_list_used, + CreateFromPreexistingPostingListUsedRegion( + serializer, posting_list_buffer, size_in_bytes)); + serializer->Clear(&posting_list_used); + return posting_list_used; +} + +} // namespace lib +} // namespace icing diff --git a/icing/file/posting_list/posting-list-used.h b/icing/file/posting_list/posting-list-used.h new file mode 100644 index 0000000..ec4b067 --- /dev/null +++ b/icing/file/posting_list/posting-list-used.h @@ -0,0 +1,143 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_USED_H_ +#define ICING_FILE_POSTING_LIST_POSTING_LIST_USED_H_ + +#include <cstdint> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" + +namespace icing { +namespace lib { + +class PostingListUsed; + +// Interface for PostingListUsed data serialization and deserialization. +// - It contains several common methods used by lower level of posting list +// management related classes (e.g. FlashIndexStorage, IndexBlock, +// PostingListUsed, etc). +// - Higher level classes (e.g. MainIndex) create their desired serializers +// according to the data type they're dealing with, and pass the instance down +// to all posting list management related classes. +// - Data specific methods can also be implemented in each serializer. They +// won't be used by posting list management related classes, but higher level +// classes are able to call it and deal with the specific data type. +// +// E.g. main index stores 'Hit' data into posting lists. +// - MainIndex creates PostingListUsedHitSerializer instance and uses hit data +// related methods to serialize/deserialize Hit data to/from posting lists. +// - FlashIndexStorage, IndexBlock, PostingListUsed use the serializer created +// by MainIndex, but hold the reference/pointer in the interface format +// (PostingListUsedSerializer) and only use common interface methods to manage +// posting list. 
+class PostingListUsedSerializer { + public: + virtual ~PostingListUsedSerializer() = default; + + // Returns byte size of the data type. + virtual uint32_t GetDataTypeBytes() const = 0; + + // Returns minimum posting list size allowed. + // + // Note that min posting list size should also be large enough to store a + // single PostingListIndex (for posting list management usage), so we have to + // add static_assert in each serializer implementation. + // E.g. + // static constexpr uint32_t kMinPostingListSize = kSpecialHitsSize; + // static_assert(sizeof(PostingListIndex) <= kMinPostingListSize, ""); + virtual uint32_t GetMinPostingListSize() const = 0; + + // Returns minimum size of posting list that can fit these used bytes + // (see MoveFrom). + virtual uint32_t GetMinPostingListSizeToFit( + const PostingListUsed* posting_list_used) const = 0; + + // Returns bytes used by actual data. + virtual uint32_t GetBytesUsed( + const PostingListUsed* posting_list_used) const = 0; + + // Clears the posting list. It is usually used for initializing a newly + // allocated (or reclaimed from free posting list chain) posting list. + virtual void Clear(PostingListUsed* posting_list_used) const = 0; + + // Moves contents from posting list 'src' to 'dst'. Clears 'src'. + // + // RETURNS: + // - OK on success + // - INVALID_ARGUMENT if 'src' is not valid or 'src' is too large to fit in + // 'dst'. + // - FAILED_PRECONDITION if 'dst' posting list is in a corrupted state. + virtual libtextclassifier3::Status MoveFrom(PostingListUsed* dst, + PostingListUsed* src) const = 0; +}; + +// A posting list with data in it. Layout depends on the serializer. +class PostingListUsed { + public: + // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes. + // 'Preexisting' means that posting_list_buffer was previously modified by + // another instance of PostingListUsed. + // + // Caller owns the data buffer and must not free it while using a + // PostingListUsed. 
+ // + // RETURNS: + // - A valid PostingListUsed if successful + // - INVALID_ARGUMENT if posting_list_utils::IsValidPostingListSize check + // fails + // - FAILED_PRECONDITION if serializer or posting_list_buffer is null + static libtextclassifier3::StatusOr<PostingListUsed> + CreateFromPreexistingPostingListUsedRegion( + PostingListUsedSerializer* serializer, void* posting_list_buffer, + uint32_t size_in_bytes); + + // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes + // and initializes the content of the buffer so that the returned + // PostingListUsed is empty. + // + // Caller owns the posting_list_buffer buffer and must not free it while using + // a PostingListUsed. + // + // RETURNS: + // - A valid PostingListUsed if successful + // - INVALID_ARGUMENT if posting_list_utils::IsValidPostingListSize check + // fails + // - FAILED_PRECONDITION if serializer or posting_list_buffer is null + static libtextclassifier3::StatusOr<PostingListUsed> + CreateFromUnitializedRegion(PostingListUsedSerializer* serializer, + void* posting_list_buffer, + uint32_t size_in_bytes); + + uint8_t* posting_list_buffer() { return posting_list_buffer_; } + const uint8_t* posting_list_buffer() const { return posting_list_buffer_; } + + uint32_t size_in_bytes() const { return size_in_bytes_; } + + private: + explicit PostingListUsed(void* posting_list_buffer, uint32_t size_in_bytes) + : posting_list_buffer_(static_cast<uint8_t*>(posting_list_buffer)), + size_in_bytes_(size_in_bytes) {} + + // A byte array of size size_in_bytes_ containing encoded data for this + // posting list. + uint8_t* posting_list_buffer_; // does not own! 
+ uint32_t size_in_bytes_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_USED_H_ diff --git a/icing/index/main/posting-list-utils.cc b/icing/file/posting_list/posting-list-utils.cc index b734767..2adbc26 100644 --- a/icing/index/main/posting-list-utils.cc +++ b/icing/file/posting_list/posting-list-utils.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/index/main/posting-list-utils.h" +#include "icing/file/posting_list/posting-list-utils.h" #include "icing/legacy/index/icing-bit-util.h" #include "icing/util/logging.h" @@ -22,27 +22,28 @@ namespace lib { namespace posting_list_utils { -bool IsValidPostingListSize(uint32_t size_in_bytes) { - // size must be sizeof(Hit) aligned. Otherwise, we can have serious +bool IsValidPostingListSize(uint32_t size_in_bytes, uint32_t data_type_bytes, + uint32_t min_posting_list_size) { + // size must be data_type_bytes aligned. Otherwise, we can have serious // wasted space in the worst case. - if (size_in_bytes % sizeof(Hit) != 0) { - ICING_LOG(ERROR) << "Size " << size_in_bytes << " hit " << sizeof(Hit); + if (size_in_bytes % data_type_bytes != 0) { + ICING_LOG(ERROR) << "Size " << size_in_bytes << " data " << data_type_bytes; return false; } // Must be able to store the min information. - if (size_in_bytes < min_posting_list_size()) { + if (size_in_bytes < min_posting_list_size) { ICING_LOG(ERROR) << "Size " << size_in_bytes << " is less than min size " - << min_posting_list_size(); + << min_posting_list_size; return false; } - // We re-use the first two hits as pointers into the posting list - // so the posting list size must fit in sizeof(Hit). - if (BitsToStore(size_in_bytes) > sizeof(Hit::Value) * 8) { + // We re-use the first two data as pointers into the posting list + // so the posting list size must fit in data_type_bytes. 
+ if (BitsToStore(size_in_bytes) > data_type_bytes * 8) { ICING_LOG(ERROR) << "Posting list size must be small enough to store the offset in " - << sizeof(Hit::Value) * 8 << " bytes."; + << data_type_bytes << " bytes."; return false; } diff --git a/icing/index/main/posting-list-utils.h b/icing/file/posting_list/posting-list-utils.h index 77537a7..6a1e28c 100644 --- a/icing/index/main/posting-list-utils.h +++ b/icing/file/posting_list/posting-list-utils.h @@ -12,34 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_INDEX_MAIN_POSTING_LIST_UTILS_H_ -#define ICING_INDEX_MAIN_POSTING_LIST_UTILS_H_ +#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_UTILS_H_ +#define ICING_FILE_POSTING_LIST_POSTING_LIST_UTILS_H_ #include <cstdint> -#include "icing/index/hit/hit.h" - namespace icing { namespace lib { namespace posting_list_utils { -// Represents the byte length of the two special hits described -// in the private section of posting-list-used.h. 
-inline constexpr uint32_t kNumSpecialHits = 2; -inline constexpr uint32_t kSpecialHitsSize = sizeof(Hit) * kNumSpecialHits; - -constexpr uint32_t min_posting_list_size() { return kSpecialHitsSize; } - // For a posting list size to be valid, it must: -// 1) be sizeof(Hit) aligned +// 1) be data_type_bytes aligned // 2) be equal to or larger than min_posting_list_size -// 3) be small enough to be encoded within a single Hit (5 bytes) -bool IsValidPostingListSize(uint32_t size_in_bytes); +// 3) be small enough to be encoded within a single data (data_type_bytes) +bool IsValidPostingListSize(uint32_t size_in_bytes, uint32_t data_type_bytes, + uint32_t min_posting_list_size); } // namespace posting_list_utils } // namespace lib } // namespace icing -#endif // ICING_INDEX_MAIN_POSTING_LIST_UTILS_H_ +#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_UTILS_H_ diff --git a/icing/icing-search-engine-with-icu-file_test.cc b/icing/icing-search-engine-with-icu-file_test.cc index 1012b47..39f9df0 100644 --- a/icing/icing-search-engine-with-icu-file_test.cc +++ b/icing/icing-search-engine-with-icu-file_test.cc @@ -34,17 +34,10 @@ namespace icing { namespace lib { namespace { + using ::icing::lib::portable_equals_proto::EqualsProto; using ::testing::Eq; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = - PropertyConfigProto::Cardinality::REQUIRED; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; - -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; - std::string GetTestBaseDir() { return GetTestTempDir() + "/icing_with_icu_files"; } @@ -79,7 +72,7 @@ TEST(IcingSearchEngineWithIcuFileTest, ShouldInitialize) { .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REQUIRED))) .Build(); 
EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK)); @@ -94,7 +87,7 @@ TEST(IcingSearchEngineWithIcuFileTest, ShouldIndexAndSearch) { .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REQUIRED))) .Build(); ASSERT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK)); diff --git a/icing/icing-search-engine_backwards_compatibility_test.cc b/icing/icing-search-engine_backwards_compatibility_test.cc index 2574313..b9233cb 100644 --- a/icing/icing-search-engine_backwards_compatibility_test.cc +++ b/icing/icing-search-engine_backwards_compatibility_test.cc @@ -41,12 +41,6 @@ namespace { using ::icing::lib::portable_equals_proto::EqualsProto; using ::testing::Eq; -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; - // For mocking purpose, we allow tests to provide a custom Filesystem. 
class TestIcingSearchEngine : public IcingSearchEngine { public: @@ -141,16 +135,16 @@ TEST_F(IcingSearchEngineBackwardsCompatibilityTest, SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType("email") - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); // Make sure our schema is still the same as we expect. If not, there's @@ -281,16 +275,16 @@ TEST_F(IcingSearchEngineBackwardsCompatibilityTest, MigrateToLargerScale) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType("email") - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); // Make sure our schema is still the same as we expect. 
If not, there's diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc index 6448ee1..2cf19ad 100644 --- a/icing/icing-search-engine_fuzz_test.cc +++ b/icing/icing-search-engine_fuzz_test.cc @@ -34,14 +34,6 @@ namespace icing { namespace lib { namespace { -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = - PropertyConfigProto::Cardinality::REQUIRED; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; - -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; - IcingSearchEngineOptions Setup() { IcingSearchEngineOptions icing_options; icing_options.set_base_dir(GetTestTempDir() + "/icing"); @@ -86,7 +78,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REQUIRED))) .Build(); icing.SetSchema(schema_proto); diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc index bca83dc..7a60101 100644 --- a/icing/icing-search-engine_test.cc +++ b/icing/icing-search-engine_test.cc @@ -97,21 +97,6 @@ constexpr std::string_view kIpsumText = "vehicula posuere vitae, convallis eu lorem. 
Donec semper augue eu nibh " "placerat semper."; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = - PropertyConfigProto::Cardinality::REQUIRED; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = - PropertyConfigProto::Cardinality::REPEATED; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE = - StringIndexingConfig::TokenizerType::NONE; - -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; -constexpr TermMatchType::Code MATCH_NONE = TermMatchType::UNKNOWN; - PortableFileBackedProtoLog<DocumentWrapper>::Header ReadDocumentLogHeader( Filesystem filesystem, const std::string& file_path) { PortableFileBackedProtoLog<DocumentWrapper>::Header header; @@ -219,51 +204,55 @@ SchemaProto CreateMessageSchema() { .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REQUIRED))) .Build(); } SchemaProto CreateEmailSchema() { return SchemaBuilder() - .AddType( - SchemaTypeConfigBuilder() - .SetType("Email") - .AddProperty(PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REQUIRED)) - .AddProperty(PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + 
.SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) .Build(); } SchemaProto CreatePersonAndEmailSchema() { return SchemaBuilder() - .AddType( - SchemaTypeConfigBuilder() - .SetType("Person") - .AddProperty(PropertyConfigBuilder() - .SetName("name") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty(PropertyConfigBuilder() - .SetName("emailAddress") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("emailAddress") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .AddType( SchemaTypeConfigBuilder() .SetType("Email") - .AddProperty(PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty(PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName("sender") .SetDataTypeDocument( @@ -1285,10 +1274,11 @@ TEST_F(IcingSearchEngineTest, SchemaTypeConfigProto person_proto = SchemaTypeConfigBuilder() .SetType("Person") - .AddProperty(PropertyConfigBuilder() - .SetName("name") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + 
PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) .Build(); SchemaProto nested_schema = SchemaBuilder() @@ -1301,11 +1291,11 @@ TEST_F(IcingSearchEngineTest, "Person", /*index_nested_properties=*/true) .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); SetSchemaResultProto set_schema_result = icing.SetSchema(nested_schema); @@ -1373,11 +1363,11 @@ TEST_F(IcingSearchEngineTest, "Person", /*index_nested_properties=*/false) .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); set_schema_result = icing.SetSchema(no_nested_schema); @@ -1415,16 +1405,16 @@ TEST_F(IcingSearchEngineTest, SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType("Email") - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + 
.SetCardinality(CARDINALITY_OPTIONAL))) .Build(); SetSchemaResultProto set_schema_result = @@ -1472,7 +1462,7 @@ TEST_F(IcingSearchEngineTest, .AddType(SchemaTypeConfigBuilder().SetType("Email").AddProperty( PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); @@ -1507,16 +1497,16 @@ TEST_F( SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType("Email") - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); SetSchemaResultProto set_schema_result = @@ -1564,16 +1554,16 @@ TEST_F( SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType("Email") - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("to") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("to") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); set_schema_result = icing.SetSchema( @@ -1608,25 +1598,26 @@ 
TEST_F(IcingSearchEngineTest, ForceSetSchemaIncompatibleNestedDocsAreDeleted) { .SetDataTypeDocument("Person", /*index_nested_properties=*/true) .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty(PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) .Build(); SchemaProto nested_schema = SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType("Person") - .AddProperty( - PropertyConfigBuilder() - .SetName("name") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("company") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("company") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(email_schema_type) .Build(); @@ -1681,7 +1672,7 @@ TEST_F(IcingSearchEngineTest, ForceSetSchemaIncompatibleNestedDocsAreDeleted) { .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( PropertyConfigBuilder() .SetName("name") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(email_schema_type) .Build(); @@ -6833,7 +6824,7 @@ TEST_F(IcingSearchEngineTest, .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( PropertyConfigBuilder() .SetName("unindexedField") - .SetDataTypeString(MATCH_NONE, TOKENIZER_NONE) + .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE) .SetCardinality(CARDINALITY_REQUIRED))) .Build(); 
ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); @@ -7294,16 +7285,16 @@ TEST_F(IcingSearchEngineTest, .AddType( SchemaTypeConfigBuilder() .SetType("Message") - .AddProperty( - PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REQUIRED)) - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); // Write the marker file std::string marker_filepath = @@ -8148,7 +8139,7 @@ TEST_F(IcingSearchEngineTest, SnippetErrorTest) { .AddType(SchemaTypeConfigBuilder().SetType("Generic").AddProperty( PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REPEATED))) .Build(); ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); @@ -8257,7 +8248,7 @@ TEST_F(IcingSearchEngineTest, CJKSnippetTest) { // Search and request snippet matching but no windowing. SearchSpecProto search_spec; search_spec.set_query("走"); - search_spec.set_term_match_type(MATCH_PREFIX); + search_spec.set_term_match_type(TERM_MATCH_PREFIX); ResultSpecProto result_spec; result_spec.mutable_snippet_spec()->set_num_to_snippet( @@ -8328,7 +8319,7 @@ TEST_F(IcingSearchEngineTest, InvalidToEmptyQueryTest) { // Search and request snippet matching but no windowing. 
SearchSpecProto search_spec; search_spec.set_query("?"); - search_spec.set_term_match_type(MATCH_PREFIX); + search_spec.set_term_match_type(TERM_MATCH_PREFIX); ScoringSpecProto scoring_spec; ResultSpecProto result_spec; @@ -8394,7 +8385,7 @@ TEST_F(IcingSearchEngineTest, EmojiSnippetTest) { // Search and request snippet matching but no windowing. SearchSpecProto search_spec; search_spec.set_query("🐟"); - search_spec.set_term_match_type(MATCH_PREFIX); + search_spec.set_term_match_type(TERM_MATCH_PREFIX); ResultSpecProto result_spec; result_spec.mutable_snippet_spec()->set_num_to_snippet(1); @@ -8455,7 +8446,7 @@ TEST_F(IcingSearchEngineTest, PutDocumentIndexFailureDeletion) { // Make sure that the document isn't searchable. SearchSpecProto search_spec; search_spec.set_query("foo"); - search_spec.set_term_match_type(MATCH_PREFIX); + search_spec.set_term_match_type(TERM_MATCH_PREFIX); SearchResultProto search_results = icing.Search(search_spec, ScoringSpecProto::default_instance(), @@ -9103,12 +9094,12 @@ TEST_F(IcingSearchEngineTest, .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REQUIRED))) .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( PropertyConfigBuilder() .SetName("name") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(SchemaTypeConfigBuilder() .SetType("Email") @@ -9118,11 +9109,11 @@ TEST_F(IcingSearchEngineTest, "Person", /*index_nested_properties=*/true) .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + 
.SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); @@ -9176,7 +9167,7 @@ TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_SchemaTypeNotFound) { .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REQUIRED))) .Build(); ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); @@ -9312,12 +9303,12 @@ TEST_F(IcingSearchEngineTest, .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REQUIRED))) .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( PropertyConfigBuilder() .SetName("name") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(SchemaTypeConfigBuilder() .SetType("Email") @@ -9327,11 +9318,11 @@ TEST_F(IcingSearchEngineTest, "Person", /*index_nested_properties=*/true) .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); @@ -9391,12 +9382,12 @@ TEST_F(IcingSearchEngineTest, .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, 
TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REQUIRED))) .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( PropertyConfigBuilder() .SetName("name") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(SchemaTypeConfigBuilder() .SetType("Email") @@ -9406,11 +9397,11 @@ TEST_F(IcingSearchEngineTest, "Person", /*index_nested_properties=*/true) .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); @@ -9447,7 +9438,7 @@ TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_OrderByTermFrequency) { .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REQUIRED))) .Build(); ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); @@ -9697,49 +9688,49 @@ TEST_F(IcingSearchEngineTest, IcingShouldWorkFor64Sections) { .AddType(SchemaTypeConfigBuilder() // Person has 4 sections. 
.SetType("Person") - .AddProperty( - PropertyConfigBuilder() - .SetName("firstName") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("lastName") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("emailAddress") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("phoneNumber") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("firstName") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("lastName") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("emailAddress") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("phoneNumber") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(SchemaTypeConfigBuilder() // Email has 16 sections. 
.SetType("Email") - .AddProperty( - PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("date") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("time") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("date") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("time") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty( PropertyConfigBuilder() .SetName("sender") diff --git a/icing/index/hit/hit.cc b/icing/index/hit/hit.cc index ce1c366..493e62b 100644 --- a/icing/index/hit/hit.cc +++ b/icing/index/hit/hit.cc @@ -35,9 +35,20 @@ enum FlagOffset { kHasTermFrequency = 2, kNumFlags = 3, }; + +static_assert(kDocumentIdBits + kSectionIdBits + kNumFlags < + sizeof(Hit::Value) * 8, + "Hit::kInvalidValue contains risky value and we should have at " + "least one unused bit to avoid potential bugs. 
Please follow the " + "process mentioned in hit.h to correct the value of " + "Hit::kInvalidValue and remove this static_assert afterwards."); + static_assert(kDocumentIdBits + kSectionIdBits + kNumFlags <= sizeof(Hit::Value) * 8, "HitOverflow"); +static_assert(kDocumentIdBits == 22, ""); +static_assert(kSectionIdBits == 6, ""); +static_assert(kNumFlags == 3, ""); inline DocumentId InvertDocumentId(DocumentId document_id) { static_assert(kMaxDocumentId <= (std::numeric_limits<DocumentId>::max() - 1), @@ -52,6 +63,31 @@ inline DocumentId InvertDocumentId(DocumentId document_id) { } // namespace +BasicHit::BasicHit(SectionId section_id, DocumentId document_id) { + // Values are stored so that when sorted, they appear in document_id + // descending, section_id ascending, order. So inverted document_id appears in + // the most significant bits, followed by (uninverted) section_id. + Value temp_value = 0; + bit_util::BitfieldSet(/*new_value=*/InvertDocumentId(document_id), + /*lsb_offset=*/kSectionIdBits, /*len=*/kDocumentIdBits, + /*value_out=*/&temp_value); + bit_util::BitfieldSet(/*new_value=*/section_id, /*lsb_offset=*/0, + /*len=*/kSectionIdBits, /*value_out=*/&temp_value); + value_ = temp_value; +} + +DocumentId BasicHit::document_id() const { + DocumentId inverted_document_id = bit_util::BitfieldGet( + value_, /*lsb_offset=*/kSectionIdBits, /*len=*/kDocumentIdBits); + // Undo the document_id inversion. 
+ return InvertDocumentId(inverted_document_id); +} + +SectionId BasicHit::section_id() const { + return bit_util::BitfieldGet(value_, /*lsb_offset=*/0, + /*len=*/kSectionIdBits); +} + Hit::Hit(SectionId section_id, DocumentId document_id, Hit::TermFrequency term_frequency, bool is_in_prefix_section, bool is_prefix_hit) diff --git a/icing/index/hit/hit.h b/icing/index/hit/hit.h index 35c9238..111b320 100644 --- a/icing/index/hit/hit.h +++ b/icing/index/hit/hit.h @@ -26,6 +26,54 @@ namespace icing { namespace lib { +// BasicHit is a specific encoding that refers to content within a document. A +// basic hit consists of: +// - a DocumentId +// - a SectionId +// referring to the document and section that the hit corresponds to. +// +// The hit is the most basic unit of the index and, when grouped together by +// term, can be used to encode what terms appear in what documents. +// +// BasicHit is for indices (e.g. numeric index) that don't require term +// frequency. +class BasicHit { + public: + // The datatype used to encode BasicHit information: the document_id and + // section_id. + using Value = uint32_t; + + // WARNING: Changing this value will invalidate any pre-existing posting lists + // on user devices. + // + // kInvalidValue contains: + // - 0 for unused bits. Note that unused bits are always 0 for both valid and + // invalid BasicHit values. + // - Inverted kInvalidDocumentId + // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has + // already invalidated the value. In fact, we currently use all 2^6 section + // ids and there is no "invalid section id", so it doesn't matter what + // SectionId we set for kInvalidValue. 
+ static constexpr Value kInvalidValue = 0; + + explicit BasicHit(SectionId section_id, DocumentId document_id); + + explicit BasicHit() : value_(kInvalidValue) {} + + bool is_valid() const { return value_ != kInvalidValue; } + Value value() const { return value_; } + DocumentId document_id() const; + SectionId section_id() const; + + bool operator<(const BasicHit& h2) const { return value_ < h2.value_; } + bool operator==(const BasicHit& h2) const { return value_ == h2.value_; } + + private: + // Value bits layout: 4 unused + 22 document_id + 6 section id. + Value value_; +} __attribute__((packed)); +static_assert(sizeof(BasicHit) == 4, ""); + // Hit is a specific encoding that refers to content within a document. A hit // consists of: // - a DocumentId @@ -36,7 +84,8 @@ namespace lib { // - whether the Hit does not appear exactly in the document, but instead // represents a term that is a prefix of a term in the document // - whether the Hit came from a section that has prefix expansion enabled -// and a term frequency for the hit. +// and a term frequency for the hit. +// // The hit is the most basic unit of the index and, when grouped together by // term, can be used to encode what terms appear in what documents. class Hit { @@ -47,6 +96,26 @@ class Hit { // WARNING: Changing this value will invalidate any pre-existing posting lists // on user devices. + // + // WARNING: + // - Hit::kInvalidValue should contain inverted kInvalidDocumentId, which is + // b'00...0. However, currently we set it as UINT32_MAX and actually it + // contains b'11...1, which is the inverted document_id 0. + // - It means Hit::kInvalidValue contains valid (document_id, section_id, + // flags), so we potentially cannot distinguish if a Hit is invalid or not. + // The invalidity is an essential feature for posting list since we use it + // to determine the state of the posting list. 
+ // - The reason why it won't break the current posting list is because the + // unused bit(s) are set as 1 for Hit::kInvalidValue and 0 for all valid + // Hits. In other words, the unused bit(s) are actually serving as "invalid + // flag". + // - If we want to exhaust all unused bits in the future, then we have to + // change Hit::kInvalidValue to set the inverted document_id section + // correctly (b'00...0, refer to BasicHit::kInvalidValue as an example). + // - Also this problem is guarded by static_assert in hit.cc. If exhausting + // all unused bits, then the static_assert will detect and fail. We can + // safely remove the static_assert check after following the above process + // to resolve the incorrect Hit::kInvalidValue issue. static constexpr Value kInvalidValue = std::numeric_limits<Value>::max(); // Docs are sorted in reverse, and 0 is never used as the inverted // DocumentId (because it is the inverse of kInvalidValue), so it is always @@ -91,7 +160,7 @@ class Hit { private: // Value and TermFrequency must be in this order. - // Value bits layout: 5 unused + 20 document_id + 4 section id + 3 flags. + // Value bits layout: 1 unused + 22 document_id + 6 section id + 3 flags. 
Value value_; TermFrequency term_frequency_; } __attribute__((packed)); diff --git a/icing/index/hit/hit_test.cc b/icing/index/hit/hit_test.cc index d47ca37..0086d91 100644 --- a/icing/index/hit/hit_test.cc +++ b/icing/index/hit/hit_test.cc @@ -26,6 +26,7 @@ namespace { using ::testing::ElementsAre; using ::testing::Eq; +using ::testing::Ge; using ::testing::IsFalse; using ::testing::IsTrue; using ::testing::Lt; @@ -35,6 +36,63 @@ static constexpr DocumentId kSomeDocumentId = 24; static constexpr SectionId kSomeSectionid = 5; static constexpr Hit::TermFrequency kSomeTermFrequency = 57; +TEST(BasicHitTest, Accessors) { + BasicHit h1(kSomeSectionid, kSomeDocumentId); + EXPECT_THAT(h1.document_id(), Eq(kSomeDocumentId)); + EXPECT_THAT(h1.section_id(), Eq(kSomeSectionid)); +} + +TEST(BasicHitTest, Invalid) { + BasicHit default_invalid; + EXPECT_THAT(default_invalid.is_valid(), IsFalse()); + + // Also make sure the invalid BasicHit contains an invalid document id. + EXPECT_THAT(default_invalid.document_id(), Eq(kInvalidDocumentId)); + EXPECT_THAT(default_invalid.section_id(), Eq(kMinSectionId)); +} + +TEST(BasicHitTest, Valid) { + BasicHit maximum_document_id_hit(kSomeSectionid, kMaxDocumentId); + EXPECT_THAT(maximum_document_id_hit.is_valid(), IsTrue()); + + BasicHit maximum_section_id_hit(kMaxSectionId, kSomeDocumentId); + EXPECT_THAT(maximum_section_id_hit.is_valid(), IsTrue()); + + BasicHit minimum_document_id_hit(kSomeSectionid, kMinDocumentId); + EXPECT_THAT(minimum_document_id_hit.is_valid(), IsTrue()); + + BasicHit minimum_section_id_hit(kMinSectionId, kSomeDocumentId); + EXPECT_THAT(minimum_section_id_hit.is_valid(), IsTrue()); + + BasicHit all_maximum_hit(kMaxSectionId, kMaxDocumentId); + EXPECT_THAT(all_maximum_hit.is_valid(), IsTrue()); + + BasicHit all_minimum_hit(kMinSectionId, kMinDocumentId); + EXPECT_THAT(all_minimum_hit.is_valid(), IsTrue()); + + // We use invalid BasicHit for std::lower_bound. 
Verify that value of the + // smallest valid BasicHit (which contains kMinSectionId, kMaxDocumentId) is + // >= BasicHit::kInvalidValue. + BasicHit smallest_hit(kMinSectionId, kMaxDocumentId); + ASSERT_THAT(smallest_hit.is_valid(), IsTrue()); + EXPECT_THAT(smallest_hit.value(), Ge(BasicHit::kInvalidValue)); +} + +TEST(BasicHitTest, Comparison) { + BasicHit hit(/*section_id=*/1, /*document_id=*/243); + // DocumentIds are sorted in ascending order. So a hit with a lower + // document_id should be considered greater than one with a higher + // document_id. + BasicHit higher_document_id_hit(/*section_id=*/1, /*document_id=*/2409); + BasicHit higher_section_id_hit(/*section_id=*/15, /*document_id=*/243); + + std::vector<BasicHit> hits{hit, higher_document_id_hit, + higher_section_id_hit}; + std::sort(hits.begin(), hits.end()); + EXPECT_THAT(hits, + ElementsAre(higher_document_id_hit, hit, higher_section_id_hit)); +} + TEST(HitTest, HasTermFrequencyFlag) { Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency); EXPECT_THAT(h1.has_term_frequency(), IsFalse()); @@ -101,6 +159,17 @@ TEST(HitTest, Valid) { Hit minimum_section_id_hit(0, kSomeDocumentId, kSomeTermFrequency); EXPECT_THAT(minimum_section_id_hit.is_valid(), IsTrue()); + + // We use Hit with value Hit::kMaxDocumentIdSortValue for std::lower_bound in + // the lite index. Verify that the value of the smallest valid Hit (which + // contains kMinSectionId, kMaxDocumentId and 3 flags = false) is >= + // Hit::kMaxDocumentIdSortValue. 
+ Hit smallest_hit(kMinSectionId, kMaxDocumentId, Hit::kDefaultTermFrequency); + ASSERT_THAT(smallest_hit.is_valid(), IsTrue()); + ASSERT_THAT(smallest_hit.has_term_frequency(), IsFalse()); + ASSERT_THAT(smallest_hit.is_prefix_hit(), IsFalse()); + ASSERT_THAT(smallest_hit.is_in_prefix_section(), IsFalse()); + EXPECT_THAT(smallest_hit.value(), Ge(Hit::kMaxDocumentIdSortValue)); } TEST(HitTest, Comparison) { diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc index a1dacde..cfeda31 100644 --- a/icing/index/index-processor.cc +++ b/icing/index/index-processor.cc @@ -89,6 +89,8 @@ libtextclassifier3::Status IndexProcessor::IndexDocument( [[fallthrough]]; case StringIndexingConfig::TokenizerType::RFC822: [[fallthrough]]; + case StringIndexingConfig::TokenizerType::URL: + [[fallthrough]]; case StringIndexingConfig::TokenizerType::PLAIN: std::string normalized_term = normalizer_.NormalizeTerm(token); status = editor.BufferTerm(normalized_term.c_str()); diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index e961b0e..3c848d3 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -94,6 +94,12 @@ constexpr std::string_view kNestedProperty = "nested"; constexpr std::string_view kExactVerbatimProperty = "verbatimExact"; constexpr std::string_view kPrefixedVerbatimProperty = "verbatimPrefixed"; constexpr std::string_view kRfc822Property = "rfc822"; +// TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export +// to Android. 
+#ifdef ENABLE_URL_TOKENIZER +constexpr std::string_view kExactUrlProperty = "urlExact"; +constexpr std::string_view kPrefixedUrlProperty = "urlPrefixed"; +#endif // ENABLE_URL_TOKENIZER constexpr DocumentId kDocumentId0 = 0; constexpr DocumentId kDocumentId1 = 1; @@ -103,8 +109,15 @@ constexpr SectionId kPrefixedSectionId = 1; constexpr SectionId kRepeatedSectionId = 2; constexpr SectionId kRfc822SectionId = 3; constexpr SectionId kNestedSectionId = 4; +#ifdef ENABLE_URL_TOKENIZER +constexpr SectionId kUrlExactSectionId = 5; +constexpr SectionId kUrlPrefixedSectionId = 6; +constexpr SectionId kExactVerbatimSectionId = 7; +constexpr SectionId kPrefixedVerbatimSectionId = 8; +#else // !ENABLE_URL_TOKENIZER constexpr SectionId kExactVerbatimSectionId = 5; constexpr SectionId kPrefixedVerbatimSectionId = 6; +#endif // ENABLE_URL_TOKENIZER using Cardinality = PropertyConfigProto::Cardinality; using DataType = PropertyConfigProto::DataType; @@ -113,25 +126,10 @@ using ::testing::Eq; using ::testing::IsEmpty; using ::testing::Test; -constexpr PropertyConfigProto::DataType::Code TYPE_STRING = - PropertyConfigProto::DataType::STRING; -constexpr PropertyConfigProto::DataType::Code TYPE_BYTES = - PropertyConfigProto::DataType::BYTES; - -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = - PropertyConfigProto::Cardinality::REPEATED; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM = - StringIndexingConfig::TokenizerType::VERBATIM; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_RFC822 = - StringIndexingConfig::TokenizerType::RFC822; - -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; +#ifdef 
ENABLE_URL_TOKENIZER +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_URL = + StringIndexingConfig::TokenizerType::URL; +#endif // ENABLE_URL_TOKENIZER class IndexProcessorTest : public Test { protected: @@ -169,16 +167,16 @@ class IndexProcessorTest : public Test { .AddType( SchemaTypeConfigBuilder() .SetType(kFakeType) - .AddProperty( - PropertyConfigBuilder() - .SetName(kExactProperty) - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName(kPrefixedProperty) - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kExactProperty) + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPrefixedProperty) + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName(kUnindexedProperty1) .SetDataType(TYPE_STRING) @@ -187,26 +185,38 @@ class IndexProcessorTest : public Test { .SetName(kUnindexedProperty2) .SetDataType(TYPE_BYTES) .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kRepeatedProperty) + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName(kExactVerbatimProperty) + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_VERBATIM) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPrefixedVerbatimProperty) + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_VERBATIM) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName(kRfc822Property) + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_RFC822) + .SetCardinality(CARDINALITY_REPEATED)) +#ifdef ENABLE_URL_TOKENIZER .AddProperty( 
PropertyConfigBuilder() - .SetName(kRepeatedProperty) - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED)) - .AddProperty( - PropertyConfigBuilder() - .SetName(kExactVerbatimProperty) - .SetDataTypeString(MATCH_EXACT, TOKENIZER_VERBATIM) - .SetCardinality(CARDINALITY_REPEATED)) - .AddProperty( - PropertyConfigBuilder() - .SetName(kPrefixedVerbatimProperty) - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_VERBATIM) + .SetName(kExactUrlProperty) + .SetDataTypeString(MATCH_EXACT, TOKENIZER_URL) .SetCardinality(CARDINALITY_REPEATED)) .AddProperty( PropertyConfigBuilder() - .SetName(kRfc822Property) - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_RFC822) + .SetName(kPrefixedUrlProperty) + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_URL) .SetCardinality(CARDINALITY_REPEATED)) +#endif // ENABLE_URL_TOKENIZER .AddProperty( PropertyConfigBuilder() .SetName(kSubProperty) @@ -216,11 +226,11 @@ class IndexProcessorTest : public Test { .AddType( SchemaTypeConfigBuilder() .SetType(kNestedType) - .AddProperty( - PropertyConfigBuilder() - .SetName(kNestedProperty) - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName(kNestedProperty) + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); ICING_ASSERT_OK(schema_store_->SetSchema(schema)); @@ -1075,6 +1085,191 @@ TEST_F(IndexProcessorTest, Rfc822PropertyNoMatch) { EXPECT_THAT(hits, IsEmpty()); } +#ifdef ENABLE_URL_TOKENIZER +TEST_F(IndexProcessorTest, ExactUrlProperty) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kExactUrlProperty), + "http://www.google.com") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + 
EXPECT_THAT(tokenized_document.num_tokens(), 7); + + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); + EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); + + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("google", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + std::vector<DocHitInfoTermFrequencyPair> hits = + GetHitsWithTermFrequency(std::move(itr)); + std::unordered_map<SectionId, Hit::TermFrequency> expected_map{ + {kUrlExactSectionId, 1}}; + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + kDocumentId0, expected_map))); + + ICING_ASSERT_OK_AND_ASSIGN(itr, + index_->GetIterator("http", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + hits = GetHitsWithTermFrequency(std::move(itr)); + expected_map = {{kUrlExactSectionId, 1}}; + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + kDocumentId0, expected_map))); + + ICING_ASSERT_OK_AND_ASSIGN( + itr, index_->GetIterator("www.google.com", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + hits = GetHitsWithTermFrequency(std::move(itr)); + expected_map = {{kUrlExactSectionId, 1}}; + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + kDocumentId0, expected_map))); + + ICING_ASSERT_OK_AND_ASSIGN( + itr, index_->GetIterator("http://www.google.com", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + hits = GetHitsWithTermFrequency(std::move(itr)); + expected_map = {{kUrlExactSectionId, 1}}; + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + kDocumentId0, expected_map))); +} + +TEST_F(IndexProcessorTest, ExactUrlPropertyDoesNotMatchPrefix) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kExactUrlProperty), + "https://mail.google.com/calendar/render") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + 
TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(tokenized_document.num_tokens(), 8); + + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); + EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("co", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); + std::vector<DocHitInfoTermFrequencyPair> hits = + GetHitsWithTermFrequency(std::move(itr)); + EXPECT_THAT(hits, IsEmpty()); + + ICING_ASSERT_OK_AND_ASSIGN(itr, + index_->GetIterator("mail.go", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + hits = GetHitsWithTermFrequency(std::move(itr)); + EXPECT_THAT(hits, IsEmpty()); + + ICING_ASSERT_OK_AND_ASSIGN( + itr, index_->GetIterator("mail.google.com", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + hits = GetHitsWithTermFrequency(std::move(itr)); + EXPECT_THAT(hits, IsEmpty()); +} + +TEST_F(IndexProcessorTest, PrefixUrlProperty) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPrefixedUrlProperty), + "http://www.google.com") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(tokenized_document.num_tokens(), 7); + + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); + EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); + + // "goo" is a prefix of "google" and "google.com" + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("goo", kSectionIdMaskAll, TermMatchType::PREFIX)); + std::vector<DocHitInfoTermFrequencyPair> hits = + GetHitsWithTermFrequency(std::move(itr)); + std::unordered_map<SectionId, Hit::TermFrequency> expected_map{ + 
{kUrlPrefixedSectionId, 1}}; + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + kDocumentId0, expected_map))); + + // "http" is a prefix of "http" and "http://www.google.com" + ICING_ASSERT_OK_AND_ASSIGN(itr, index_->GetIterator("http", kSectionIdMaskAll, + TermMatchType::PREFIX)); + hits = GetHitsWithTermFrequency(std::move(itr)); + expected_map = {{kUrlPrefixedSectionId, 1}}; + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + kDocumentId0, expected_map))); + + // "www.go" is a prefix of "www.google.com" + ICING_ASSERT_OK_AND_ASSIGN( + itr, + index_->GetIterator("www.go", kSectionIdMaskAll, TermMatchType::PREFIX)); + hits = GetHitsWithTermFrequency(std::move(itr)); + expected_map = {{kUrlPrefixedSectionId, 1}}; + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + kDocumentId0, expected_map))); +} + +TEST_F(IndexProcessorTest, PrefixUrlPropertyNoMatch) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPrefixedUrlProperty), + "https://mail.google.com/calendar/render") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(tokenized_document.num_tokens(), 8); + + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); + EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); + + // no token starts with "gle", so we should have no hits + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("gle", kSectionIdMaskAll, TermMatchType::PREFIX)); + std::vector<DocHitInfoTermFrequencyPair> hits = + GetHitsWithTermFrequency(std::move(itr)); + EXPECT_THAT(hits, IsEmpty()); + + ICING_ASSERT_OK_AND_ASSIGN( + itr, + index_->GetIterator("w.goo", kSectionIdMaskAll, TermMatchType::PREFIX)); + hits = 
GetHitsWithTermFrequency(std::move(itr)); + EXPECT_THAT(hits, IsEmpty()); + + // tokens have separators removed, so no hits here + ICING_ASSERT_OK_AND_ASSIGN(itr, index_->GetIterator(".com", kSectionIdMaskAll, + TermMatchType::PREFIX)); + hits = GetHitsWithTermFrequency(std::move(itr)); + EXPECT_THAT(hits, IsEmpty()); + + ICING_ASSERT_OK_AND_ASSIGN( + itr, index_->GetIterator("calendar/render", kSectionIdMaskAll, + TermMatchType::PREFIX)); + hits = GetHitsWithTermFrequency(std::move(itr)); + EXPECT_THAT(hits, IsEmpty()); +} +#endif // ENABLE_URL_TOKENIZER + } // namespace } // namespace lib diff --git a/icing/index/iterator/doc-hit-info-iterator-and.cc b/icing/index/iterator/doc-hit-info-iterator-and.cc index 6bde8e6..3b7ede9 100644 --- a/icing/index/iterator/doc-hit-info-iterator-and.cc +++ b/icing/index/iterator/doc-hit-info-iterator-and.cc @@ -55,11 +55,12 @@ std::unique_ptr<DocHitInfoIterator> CreateAndIterator( if (iterators.size() <= kBinaryAndIteratorPerformanceThreshold && iterators.size() >= kMinBinaryIterators) { // Accumulate the iterators that need to be ANDed together. 
- iterator = std::move(iterators.at(0)); - for (size_t i = 1; i < iterators.size(); ++i) { + iterator = std::move(iterators.at(iterators.size() - 1)); + for (int i = iterators.size() - 2; i >= 0; --i) { std::unique_ptr<DocHitInfoIterator> temp_iterator = std::move(iterator); iterator = std::make_unique<DocHitInfoIteratorAnd>( - std::move(temp_iterator), std::move(iterators[i])); + /*short_it=*/std::move(iterators[i]), + /*long_it=*/std::move(temp_iterator)); } } else { // If the vector is too small, the AndNary iterator can handle it and return diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc index 485f85b..e80d8f0 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc @@ -48,14 +48,6 @@ using ::testing::ElementsAreArray; using ::testing::Eq; using ::testing::IsEmpty; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; - -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; - class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test { protected: DocHitInfoIteratorSectionRestrictTest() @@ -74,7 +66,7 @@ class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test { .AddProperty( PropertyConfigBuilder() .SetName(indexed_property_) - .SetDataTypeString(MATCH_EXACT, + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); diff --git a/icing/index/main/doc-hit-info-iterator-term-main.cc b/icing/index/main/doc-hit-info-iterator-term-main.cc index 4bd87aa..098a450 100644 --- a/icing/index/main/doc-hit-info-iterator-term-main.cc +++ b/icing/index/main/doc-hit-info-iterator-term-main.cc @@ -20,12 +20,13 @@ 
#include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" +#include "icing/file/posting_list/posting-list-identifier.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/main/posting-list-accessor.h" -#include "icing/index/main/posting-list-identifier.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/util/logging.h" #include "icing/util/status-macros.h" namespace icing { diff --git a/icing/index/main/main-index-merger.cc b/icing/index/main/main-index-merger.cc index f49dc74..c26a6d7 100644 --- a/icing/index/main/main-index-merger.cc +++ b/icing/index/main/main-index-merger.cc @@ -20,10 +20,11 @@ #include <unordered_map> #include "icing/absl_ports/canonical_errors.h" +#include "icing/file/posting_list/index-block.h" #include "icing/index/lite/term-id-hit-pair.h" -#include "icing/index/main/index-block.h" #include "icing/index/term-id-codec.h" #include "icing/legacy/core/icing-string-util.h" +#include "icing/util/logging.h" #include "icing/util/status-macros.h" namespace icing { diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc index 5de92d0..1c61bfa 100644 --- a/icing/index/main/main-index.cc +++ b/icing/index/main/main-index.cc @@ -22,13 +22,16 @@ #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" #include "icing/file/destructible-directory.h" -#include "icing/index/main/index-block.h" +#include "icing/file/posting_list/posting-list-common.h" +#include "icing/index/main/posting-list-used-hit-serializer.h" #include "icing/index/term-id-codec.h" #include "icing/index/term-property-id.h" +#include "icing/legacy/core/icing-string-util.h" #include "icing/legacy/index/icing-dynamic-trie.h" #include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" +#include "icing/util/logging.h" 
#include "icing/util/status-macros.h" namespace icing { @@ -94,7 +97,9 @@ MainIndex::MainIndex(const std::string& index_directory, const IcingFilesystem* icing_filesystem) : base_dir_(index_directory), filesystem_(filesystem), - icing_filesystem_(icing_filesystem) {} + icing_filesystem_(icing_filesystem), + posting_list_used_hit_serializer_( + std::make_unique<PostingListUsedHitSerializer>()) {} libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> MainIndex::Create( const std::string& index_directory, const Filesystem* filesystem, @@ -115,7 +120,8 @@ libtextclassifier3::Status MainIndex::Init() { std::string flash_index_file = base_dir_ + "/main_index"; ICING_ASSIGN_OR_RETURN( FlashIndexStorage flash_index, - FlashIndexStorage::Create(flash_index_file, filesystem_)); + FlashIndexStorage::Create(flash_index_file, filesystem_, + posting_list_used_hit_serializer_.get())); flash_index_storage_ = std::make_unique<FlashIndexStorage>(std::move(flash_index)); @@ -161,9 +167,11 @@ MainIndex::GetAccessorForExactTerm(const std::string& term) { return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( "Term %s is not present in main lexicon.", term.c_str())); } - ICING_ASSIGN_OR_RETURN(PostingListAccessor accessor, - PostingListAccessor::CreateFromExisting( - flash_index_storage_.get(), posting_list_id)); + ICING_ASSIGN_OR_RETURN( + PostingListAccessor accessor, + PostingListAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_used_hit_serializer_.get(), + posting_list_id)); return std::make_unique<PostingListAccessor>(std::move(accessor)); } @@ -193,9 +201,11 @@ MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) { } PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id)); - ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor, - PostingListAccessor::CreateFromExisting( - flash_index_storage_.get(), posting_list_id)); + ICING_ASSIGN_OR_RETURN( + 
PostingListAccessor pl_accessor, + PostingListAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_used_hit_serializer_.get(), + posting_list_id)); GetPrefixAccessorResult result = { std::make_unique<PostingListAccessor>(std::move(pl_accessor)), exact}; return result; @@ -234,9 +244,11 @@ MainIndex::FindTermsByPrefix( PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; memcpy(&posting_list_id, term_iterator.GetValue(), sizeof(posting_list_id)); - ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor, - PostingListAccessor::CreateFromExisting( - flash_index_storage_.get(), posting_list_id)); + ICING_ASSIGN_OR_RETURN( + PostingListAccessor pl_accessor, + PostingListAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_used_hit_serializer_.get(), + posting_list_id)); ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits, pl_accessor.GetNextHitsBatch()); while (!hits.empty()) { @@ -549,7 +561,8 @@ libtextclassifier3::Status MainIndex::AddHits( sizeof(backfill_posting_list_id)); ICING_ASSIGN_OR_RETURN( PostingListAccessor hit_accum, - PostingListAccessor::Create(flash_index_storage_.get())); + PostingListAccessor::Create(flash_index_storage_.get(), + posting_list_used_hit_serializer_.get())); ICING_RETURN_IF_ERROR( AddPrefixBackfillHits(backfill_posting_list_id, &hit_accum)); PostingListAccessor::FinalizeResult result = @@ -583,15 +596,18 @@ libtextclassifier3::Status MainIndex::AddHitsForTerm( return absl_ports::InternalError( "Valid posting list has an invalid block index!"); } - ICING_ASSIGN_OR_RETURN(PostingListAccessor tmp, - PostingListAccessor::CreateFromExisting( - flash_index_storage_.get(), posting_list_id)); + ICING_ASSIGN_OR_RETURN( + PostingListAccessor tmp, + PostingListAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_used_hit_serializer_.get(), + posting_list_id)); pl_accessor = std::make_unique<PostingListAccessor>(std::move(tmp)); } else { // New posting list. 
ICING_ASSIGN_OR_RETURN( PostingListAccessor tmp, - PostingListAccessor::Create(flash_index_storage_.get())); + PostingListAccessor::Create(flash_index_storage_.get(), + posting_list_used_hit_serializer_.get())); pl_accessor = std::make_unique<PostingListAccessor>(std::move(tmp)); } @@ -621,8 +637,9 @@ libtextclassifier3::Status MainIndex::AddPrefixBackfillHits( PostingListAccessor* hit_accum) { ICING_ASSIGN_OR_RETURN( PostingListAccessor backfill_accessor, - PostingListAccessor::CreateFromExisting(flash_index_storage_.get(), - backfill_posting_list_id)); + PostingListAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_used_hit_serializer_.get(), + backfill_posting_list_id)); std::vector<Hit> backfill_hits; ICING_ASSIGN_OR_RETURN(std::vector<Hit> tmp, backfill_accessor.GetNextHitsBatch()); @@ -760,7 +777,9 @@ libtextclassifier3::StatusOr<DocumentId> MainIndex::TransferAndAddHits( ICING_ASSIGN_OR_RETURN( PostingListAccessor hit_accum, - PostingListAccessor::Create(new_index->flash_index_storage_.get())); + PostingListAccessor::Create( + new_index->flash_index_storage_.get(), + new_index->posting_list_used_hit_serializer_.get())); for (auto itr = new_hits.rbegin(); itr != new_hits.rend(); ++itr) { ICING_RETURN_IF_ERROR(hit_accum.PrependHit(*itr)); } @@ -806,9 +825,11 @@ libtextclassifier3::Status MainIndex::TransferIndex( << "Got invalid posting_list_id from previous main index"; continue; } - ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor, - PostingListAccessor::CreateFromExisting( - flash_index_storage_.get(), posting_list_id)); + ICING_ASSIGN_OR_RETURN( + PostingListAccessor pl_accessor, + PostingListAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_used_hit_serializer_.get(), + posting_list_id)); ICING_ASSIGN_OR_RETURN( DocumentId curr_largest_document_id, TransferAndAddHits(document_id_old_to_new, term_itr.GetKey(), diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h index aa3fc38..e257a77 
100644 --- a/icing/index/main/main-index.h +++ b/icing/index/main/main-index.h @@ -20,9 +20,10 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/file/filesystem.h" +#include "icing/file/posting_list/flash-index-storage.h" #include "icing/index/lite/term-id-hit-pair.h" -#include "icing/index/main/flash-index-storage.h" #include "icing/index/main/posting-list-accessor.h" +#include "icing/index/main/posting-list-used-hit-serializer.h" #include "icing/index/term-id-codec.h" #include "icing/index/term-metadata.h" #include "icing/legacy/index/icing-dynamic-trie.h" @@ -203,8 +204,9 @@ class MainIndex { const std::vector<DocumentId>& document_id_old_to_new); private: - MainIndex(const std::string& index_directory, const Filesystem* filesystem, - const IcingFilesystem* icing_filesystem); + explicit MainIndex(const std::string& index_directory, + const Filesystem* filesystem, + const IcingFilesystem* icing_filesystem); libtextclassifier3::Status Init(); @@ -323,6 +325,8 @@ class MainIndex { std::string base_dir_; const Filesystem* filesystem_; const IcingFilesystem* icing_filesystem_; + std::unique_ptr<PostingListUsedHitSerializer> + posting_list_used_hit_serializer_; std::unique_ptr<FlashIndexStorage> flash_index_storage_; std::unique_ptr<IcingDynamicTrie> main_lexicon_; }; diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc index bfda014..92601e7 100644 --- a/icing/index/main/main-index_test.cc +++ b/icing/index/main/main-index_test.cc @@ -22,7 +22,6 @@ #include "icing/index/lite/term-id-hit-pair.h" #include "icing/index/main/doc-hit-info-iterator-term-main.h" #include "icing/index/main/main-index-merger.h" -#include "icing/index/main/main-index.h" #include "icing/index/term-id-codec.h" #include "icing/index/term-property-id.h" #include "icing/legacy/index/icing-dynamic-trie.h" diff --git a/icing/index/main/posting-list-accessor.cc 
b/icing/index/main/posting-list-accessor.cc index 93b7b0b..06ab0a1 100644 --- a/icing/index/main/posting-list-accessor.cc +++ b/icing/index/main/posting-list-accessor.cc @@ -14,38 +14,43 @@ #include "icing/index/main/posting-list-accessor.h" +#include <cstdint> #include <memory> +#include <vector> #include "icing/absl_ports/canonical_errors.h" -#include "icing/index/main/flash-index-storage.h" -#include "icing/index/main/index-block.h" -#include "icing/index/main/posting-list-identifier.h" -#include "icing/index/main/posting-list-used.h" +#include "icing/file/posting_list/flash-index-storage.h" +#include "icing/file/posting_list/index-block.h" +#include "icing/file/posting_list/posting-list-identifier.h" +#include "icing/file/posting_list/posting-list-used.h" +#include "icing/index/main/posting-list-used-hit-serializer.h" #include "icing/util/status-macros.h" namespace icing { namespace lib { libtextclassifier3::StatusOr<PostingListAccessor> PostingListAccessor::Create( - FlashIndexStorage *storage) { - uint32_t max_posting_list_bytes = - IndexBlock::CalculateMaxPostingListBytes(storage->block_size()); + FlashIndexStorage *storage, PostingListUsedHitSerializer *serializer) { + uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( + storage->block_size(), serializer->GetDataTypeBytes()); std::unique_ptr<uint8_t[]> posting_list_buffer_array = std::make_unique<uint8_t[]>(max_posting_list_bytes); ICING_ASSIGN_OR_RETURN( PostingListUsed posting_list_buffer, PostingListUsed::CreateFromUnitializedRegion( - posting_list_buffer_array.get(), max_posting_list_bytes)); - return PostingListAccessor(storage, std::move(posting_list_buffer_array), + serializer, posting_list_buffer_array.get(), max_posting_list_bytes)); + return PostingListAccessor(storage, serializer, + std::move(posting_list_buffer_array), std::move(posting_list_buffer)); } libtextclassifier3::StatusOr<PostingListAccessor> PostingListAccessor::CreateFromExisting( - FlashIndexStorage 
*storage, + FlashIndexStorage *storage, PostingListUsedHitSerializer *serializer, PostingListIdentifier existing_posting_list_id) { // Our posting_list_buffer_ will start as empty. - ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor, Create(storage)); + ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor, + Create(storage, serializer)); ICING_ASSIGN_OR_RETURN(PostingListHolder holder, storage->GetPostingList(existing_posting_list_id)); pl_accessor.preexisting_posting_list_ = @@ -64,8 +69,9 @@ PostingListAccessor::GetNextHitsBatch() { "Cannot retrieve hits from a PostingListAccessor that was not created " "from a preexisting posting list."); } - ICING_ASSIGN_OR_RETURN(std::vector<Hit> batch, - preexisting_posting_list_->posting_list.GetHits()); + ICING_ASSIGN_OR_RETURN( + std::vector<Hit> batch, + serializer_->GetHits(&preexisting_posting_list_->posting_list)); uint32_t next_block_index; // Posting lists will only be chained when they are max-sized, in which case // block.next_block_index() will point to the next block for the next posting @@ -95,7 +101,7 @@ libtextclassifier3::Status PostingListAccessor::PrependHit(const Hit &hit) { PostingListUsed &active_pl = (preexisting_posting_list_ != nullptr) ? preexisting_posting_list_->posting_list : posting_list_buffer_; - libtextclassifier3::Status status = active_pl.PrependHit(hit); + libtextclassifier3::Status status = serializer_->PrependHit(&active_pl, hit); if (!absl_ports::IsResourceExhausted(status)) { return status; } @@ -112,7 +118,7 @@ libtextclassifier3::Status PostingListAccessor::PrependHit(const Hit &hit) { // It's fine to explicitly reference posting_list_buffer_ here because there's // no way of reaching this line while preexisting_posting_list_ is still in // use. 
- return posting_list_buffer_.PrependHit(hit); + return serializer_->PrependHit(&posting_list_buffer_, hit); } void PostingListAccessor::FlushPreexistingPostingList() { @@ -127,7 +133,8 @@ void PostingListAccessor::FlushPreexistingPostingList() { // and free this posting list. // // Move will always succeed since posting_list_buffer_ is max_pl_bytes. - posting_list_buffer_.MoveFrom(&preexisting_posting_list_->posting_list); + serializer_->MoveFrom(/*dst=*/&posting_list_buffer_, + /*src=*/&preexisting_posting_list_->posting_list); // Now that all the contents of this posting list have been copied, there's // no more use for it. Make it available to be used for another posting @@ -140,13 +147,14 @@ void PostingListAccessor::FlushPreexistingPostingList() { libtextclassifier3::Status PostingListAccessor::FlushInMemoryPostingList() { // We exceeded max_pl_bytes(). Need to flush posting_list_buffer_ and update // the chain. - uint32_t max_posting_list_bytes = - IndexBlock::CalculateMaxPostingListBytes(storage_->block_size()); + uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( + storage_->block_size(), serializer_->GetDataTypeBytes()); ICING_ASSIGN_OR_RETURN(PostingListHolder holder, storage_->AllocatePostingList(max_posting_list_bytes)); holder.block.set_next_block_index(prev_block_identifier_.block_index()); prev_block_identifier_ = holder.id; - return holder.posting_list.MoveFrom(&posting_list_buffer_); + return serializer_->MoveFrom(/*dst=*/&holder.posting_list, + /*src=*/&posting_list_buffer_); } PostingListAccessor::FinalizeResult PostingListAccessor::Finalize( @@ -158,7 +166,7 @@ PostingListAccessor::FinalizeResult PostingListAccessor::Finalize( accessor.preexisting_posting_list_->id}; return result; } - if (accessor.posting_list_buffer_.BytesUsed() <= 0) { + if (accessor.serializer_->GetBytesUsed(&accessor.posting_list_buffer_) <= 0) { FinalizeResult result = {absl_ports::InvalidArgumentError( "Can't finalize an empty 
PostingListAccessor. " "There's nothing to Finalize!"), @@ -166,10 +174,12 @@ PostingListAccessor::FinalizeResult PostingListAccessor::Finalize( return result; } uint32_t posting_list_bytes = - accessor.posting_list_buffer_.MinPostingListSizeToFit(); + accessor.serializer_->GetMinPostingListSizeToFit( + &accessor.posting_list_buffer_); if (accessor.prev_block_identifier_.is_valid()) { posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( - accessor.storage_->block_size()); + accessor.storage_->block_size(), + accessor.serializer_->GetDataTypeBytes()); } auto holder_or = accessor.storage_->AllocatePostingList(posting_list_bytes); if (!holder_or.ok()) { @@ -189,7 +199,9 @@ PostingListAccessor::FinalizeResult PostingListAccessor::Finalize( // is valid because we created it in-memory. And finally, we know that the // hits from posting_list_buffer_ will fit in editor.posting_list() because we // requested it be at at least posting_list_bytes large. - auto status = holder.posting_list.MoveFrom(&accessor.posting_list_buffer_); + auto status = + accessor.serializer_->MoveFrom(/*dst=*/&holder.posting_list, + /*src=*/&accessor.posting_list_buffer_); if (!status.ok()) { FinalizeResult result = {std::move(status), accessor.prev_block_identifier_}; diff --git a/icing/index/main/posting-list-accessor.h b/icing/index/main/posting-list-accessor.h index e1bb3c0..3f93c3a 100644 --- a/icing/index/main/posting-list-accessor.h +++ b/icing/index/main/posting-list-accessor.h @@ -15,14 +15,17 @@ #ifndef ICING_INDEX_POSTING_LIST_ACCESSOR_H_ #define ICING_INDEX_POSTING_LIST_ACCESSOR_H_ +#include <cstdint> #include <memory> +#include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/file/posting_list/flash-index-storage.h" +#include "icing/file/posting_list/posting-list-identifier.h" +#include "icing/file/posting_list/posting-list-used.h" #include "icing/index/hit/hit.h" -#include 
"icing/index/main/flash-index-storage.h" -#include "icing/index/main/posting-list-identifier.h" -#include "icing/index/main/posting-list-used.h" +#include "icing/index/main/posting-list-used-hit-serializer.h" namespace icing { namespace lib { @@ -48,7 +51,7 @@ class PostingListAccessor { // - On success, a valid instance of PostingListAccessor // - INVALID_ARGUMENT error if storage has an invalid block_size. static libtextclassifier3::StatusOr<PostingListAccessor> Create( - FlashIndexStorage* storage); + FlashIndexStorage* storage, PostingListUsedHitSerializer* serializer); // Create a PostingListAccessor with an existing posting list identified by // existing_posting_list_id. @@ -61,7 +64,7 @@ class PostingListAccessor { // - On success, a valid instance of PostingListAccessor // - INVALID_ARGUMENT if storage has an invalid block_size. static libtextclassifier3::StatusOr<PostingListAccessor> CreateFromExisting( - FlashIndexStorage* storage, + FlashIndexStorage* storage, PostingListUsedHitSerializer* serializer, PostingListIdentifier existing_posting_list_id); // Retrieve the next batch of hits for the posting list chain @@ -109,10 +112,11 @@ class PostingListAccessor { private: explicit PostingListAccessor( - FlashIndexStorage* storage, + FlashIndexStorage* storage, PostingListUsedHitSerializer* serializer, std::unique_ptr<uint8_t[]> posting_list_buffer_array, PostingListUsed posting_list_buffer) : storage_(storage), + serializer_(serializer), prev_block_identifier_(PostingListIdentifier::kInvalid), posting_list_buffer_array_(std::move(posting_list_buffer_array)), posting_list_buffer_(std::move(posting_list_buffer)), @@ -137,6 +141,8 @@ class PostingListAccessor { FlashIndexStorage* storage_; // Does not own. + PostingListUsedHitSerializer* serializer_; // Does not own. + // The PostingListIdentifier of the first max-sized posting list in the // posting list chain or PostingListIdentifier::kInvalid if there is no // posting list chain. 
diff --git a/icing/index/main/posting-list-accessor_test.cc b/icing/index/main/posting-list-accessor_test.cc index a539fe4..3145420 100644 --- a/icing/index/main/posting-list-accessor_test.cc +++ b/icing/index/main/posting-list-accessor_test.cc @@ -19,11 +19,12 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/file/filesystem.h" +#include "icing/file/posting_list/flash-index-storage.h" +#include "icing/file/posting_list/index-block.h" +#include "icing/file/posting_list/posting-list-identifier.h" +#include "icing/file/posting_list/posting-list-used.h" #include "icing/index/hit/hit.h" -#include "icing/index/main/flash-index-storage.h" -#include "icing/index/main/index-block.h" -#include "icing/index/main/posting-list-identifier.h" -#include "icing/index/main/posting-list-used.h" +#include "icing/index/main/posting-list-used-hit-serializer.h" #include "icing/testing/common-matchers.h" #include "icing/testing/hit-test-utils.h" #include "icing/testing/tmp-directory.h" @@ -39,20 +40,45 @@ using ::testing::Eq; using ::testing::Lt; using ::testing::SizeIs; -TEST(PostingListAccessorStorageTest, HitsAddAndRetrieveProperly) { - std::string test_dir = GetTestTempDir() + "/test_dir"; - std::string file_name = test_dir + "/test_file.idx.index"; - Filesystem filesystem; - ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); - ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); +class PostingListAccessorTest : public ::testing::Test { + protected: + void SetUp() override { + test_dir_ = GetTestTempDir() + "/test_dir"; + file_name_ = test_dir_ + "/test_file.idx.index"; - ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name, &filesystem)); + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str())); + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str())); + + serializer_ = std::make_unique<PostingListUsedHitSerializer>(); + + 
ICING_ASSERT_OK_AND_ASSIGN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get())); + flash_index_storage_ = + std::make_unique<FlashIndexStorage>(std::move(flash_index_storage)); + } + + void TearDown() override { + flash_index_storage_.reset(); + serializer_.reset(); + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str())); + } + + Filesystem filesystem_; + std::string test_dir_; + std::string file_name_; + std::unique_ptr<PostingListUsedHitSerializer> serializer_; + std::unique_ptr<FlashIndexStorage> flash_index_storage_; +}; + +TEST_F(PostingListAccessorTest, HitsAddAndRetrieveProperly) { + ICING_ASSERT_OK_AND_ASSIGN( + PostingListAccessor pl_accessor, + PostingListAccessor::Create(flash_index_storage_.get(), + serializer_.get())); // Add some hits! Any hits! std::vector<Hit> hits1 = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1); - ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, - PostingListAccessor::Create(&flash_index_storage)); for (const Hit& hit : hits1) { ICING_ASSERT_OK(pl_accessor.PrependHit(hit)); } @@ -64,23 +90,17 @@ TEST(PostingListAccessorStorageTest, HitsAddAndRetrieveProperly) { // Retrieve some hits. 
ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder, - flash_index_storage.GetPostingList(result.id)); - EXPECT_THAT(pl_holder.posting_list.GetHits(), + flash_index_storage_->GetPostingList(result.id)); + EXPECT_THAT(serializer_->GetHits(&pl_holder.posting_list), IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); EXPECT_THAT(pl_holder.block.next_block_index(), Eq(kInvalidBlockIndex)); } -TEST(PostingListAccessorStorageTest, PreexistingPLKeepOnSameBlock) { - std::string test_dir = GetTestTempDir() + "/test_dir"; - std::string file_name = test_dir + "/test_file.idx.index"; - Filesystem filesystem; - ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); - ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); - - ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name, &filesystem)); - ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, - PostingListAccessor::Create(&flash_index_storage)); +TEST_F(PostingListAccessorTest, PreexistingPLKeepOnSameBlock) { + ICING_ASSERT_OK_AND_ASSIGN( + PostingListAccessor pl_accessor, + PostingListAccessor::Create(flash_index_storage_.get(), + serializer_.get())); // Add a single hit. This will fit in a min-sized posting list. Hit hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency); ICING_ASSERT_OK(pl_accessor.PrependHit(hit1)); @@ -95,8 +115,9 @@ TEST(PostingListAccessorStorageTest, PreexistingPLKeepOnSameBlock) { // at least two hits, so this should NOT cause the previous pl to be // reallocated. 
ICING_ASSERT_OK_AND_ASSIGN( - pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage, - result1.id)); + pl_accessor, + PostingListAccessor::CreateFromExisting(flash_index_storage_.get(), + serializer_.get(), result1.id)); Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/1); ICING_ASSERT_OK(pl_accessor.PrependHit(hit2)); PostingListAccessor::FinalizeResult result2 = @@ -108,22 +129,16 @@ TEST(PostingListAccessorStorageTest, PreexistingPLKeepOnSameBlock) { // The posting list at result2.id should hold all of the hits that have been // added. ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder, - flash_index_storage.GetPostingList(result2.id)); - EXPECT_THAT(pl_holder.posting_list.GetHits(), + flash_index_storage_->GetPostingList(result2.id)); + EXPECT_THAT(serializer_->GetHits(&pl_holder.posting_list), IsOkAndHolds(ElementsAre(hit2, hit1))); } -TEST(PostingListAccessorStorageTest, PreexistingPLReallocateToLargerPL) { - std::string test_dir = GetTestTempDir() + "/test_dir"; - std::string file_name = test_dir + "/test_file.idx.index"; - Filesystem filesystem; - ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); - ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); - - ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name, &filesystem)); - ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, - PostingListAccessor::Create(&flash_index_storage)); +TEST_F(PostingListAccessorTest, PreexistingPLReallocateToLargerPL) { + ICING_ASSERT_OK_AND_ASSIGN( + PostingListAccessor pl_accessor, + PostingListAccessor::Create(flash_index_storage_.get(), + serializer_.get())); // The smallest posting list size is 15 bytes. The first four hits will be // compressed to one byte each and will be able to fit in the 5 byte padded // region. The last hit will fit in one of the special hits. 
The posting list @@ -142,8 +157,9 @@ TEST(PostingListAccessorStorageTest, PreexistingPLReallocateToLargerPL) { // Now let's add some more hits! ICING_ASSERT_OK_AND_ASSIGN( - pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage, - result1.id)); + pl_accessor, + PostingListAccessor::CreateFromExisting(flash_index_storage_.get(), + serializer_.get(), result1.id)); // The current posting list can fit at most 2 more hits. Adding 12 more hits // should result in these hits being moved to a larger posting list. std::vector<Hit> hits2 = CreateHits( @@ -167,22 +183,16 @@ TEST(PostingListAccessorStorageTest, PreexistingPLReallocateToLargerPL) { hits1.push_back(hit); } ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder, - flash_index_storage.GetPostingList(result2.id)); - EXPECT_THAT(pl_holder.posting_list.GetHits(), + flash_index_storage_->GetPostingList(result2.id)); + EXPECT_THAT(serializer_->GetHits(&pl_holder.posting_list), IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); } -TEST(PostingListAccessorStorageTest, MultiBlockChainsBlocksProperly) { - std::string test_dir = GetTestTempDir() + "/test_dir"; - std::string file_name = test_dir + "/test_file.idx.index"; - Filesystem filesystem; - ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); - ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); - - ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name, &filesystem)); - ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, - PostingListAccessor::Create(&flash_index_storage)); +TEST_F(PostingListAccessorTest, MultiBlockChainsBlocksProperly) { + ICING_ASSERT_OK_AND_ASSIGN( + PostingListAccessor pl_accessor, + PostingListAccessor::Create(flash_index_storage_.get(), + serializer_.get())); // Add some hits! Any hits! 
std::vector<Hit> hits1 = CreateHits(/*num_hits=*/5000, /*desired_byte_length=*/1); @@ -202,11 +212,11 @@ TEST(PostingListAccessorStorageTest, MultiBlockChainsBlocksProperly) { // Now let's retrieve them! ICING_ASSERT_OK_AND_ASSIGN( PostingListHolder pl_holder, - flash_index_storage.GetPostingList(second_block_id)); + flash_index_storage_->GetPostingList(second_block_id)); // This pl_holder will only hold a posting list with the hits that didn't fit // on the first block. ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> second_block_hits, - pl_holder.posting_list.GetHits()); + serializer_->GetHits(&pl_holder.posting_list)); ASSERT_THAT(second_block_hits, SizeIs(Lt(hits1.size()))); auto first_block_hits_start = hits1.rbegin() + second_block_hits.size(); EXPECT_THAT(second_block_hits, @@ -219,24 +229,17 @@ TEST(PostingListAccessorStorageTest, MultiBlockChainsBlocksProperly) { PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0, /*posting_list_index_bits=*/0); ICING_ASSERT_OK_AND_ASSIGN(pl_holder, - flash_index_storage.GetPostingList(pl_id)); + flash_index_storage_->GetPostingList(pl_id)); EXPECT_THAT( - pl_holder.posting_list.GetHits(), + serializer_->GetHits(&pl_holder.posting_list), IsOkAndHolds(ElementsAreArray(first_block_hits_start, hits1.rend()))); } -TEST(PostingListAccessorStorageTest, - PreexistingMultiBlockReusesBlocksProperly) { - std::string test_dir = GetTestTempDir() + "/test_dir"; - std::string file_name = test_dir + "/test_file.idx.index"; - Filesystem filesystem; - ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); - ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); - - ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name, &filesystem)); - ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, - PostingListAccessor::Create(&flash_index_storage)); +TEST_F(PostingListAccessorTest, PreexistingMultiBlockReusesBlocksProperly) { + 
ICING_ASSERT_OK_AND_ASSIGN( + PostingListAccessor pl_accessor, + PostingListAccessor::Create(flash_index_storage_.get(), + serializer_.get())); // Add some hits! Any hits! std::vector<Hit> hits1 = CreateHits(/*num_hits=*/5000, /*desired_byte_length=*/1); @@ -254,8 +257,9 @@ TEST(PostingListAccessorStorageTest, // Now add a couple more hits. These should fit on the existing, not full // second block. ICING_ASSERT_OK_AND_ASSIGN( - pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage, - first_add_id)); + pl_accessor, + PostingListAccessor::CreateFromExisting(flash_index_storage_.get(), + serializer_.get(), first_add_id)); std::vector<Hit> hits2 = CreateHits( /*start_docid=*/hits1.back().document_id() + 1, /*num_hits=*/50, /*desired_byte_length=*/1); @@ -273,12 +277,13 @@ TEST(PostingListAccessorStorageTest, for (const Hit& hit : hits2) { hits1.push_back(hit); } - ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder, - flash_index_storage.GetPostingList(second_add_id)); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder pl_holder, + flash_index_storage_->GetPostingList(second_add_id)); // This pl_holder will only hold a posting list with the hits that didn't fit // on the first block. 
ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> second_block_hits, - pl_holder.posting_list.GetHits()); + serializer_->GetHits(&pl_holder.posting_list)); ASSERT_THAT(second_block_hits, SizeIs(Lt(hits1.size()))); auto first_block_hits_start = hits1.rbegin() + second_block_hits.size(); EXPECT_THAT(second_block_hits, @@ -291,39 +296,27 @@ TEST(PostingListAccessorStorageTest, PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0, /*posting_list_index_bits=*/0); ICING_ASSERT_OK_AND_ASSIGN(pl_holder, - flash_index_storage.GetPostingList(pl_id)); + flash_index_storage_->GetPostingList(pl_id)); EXPECT_THAT( - pl_holder.posting_list.GetHits(), + serializer_->GetHits(&pl_holder.posting_list), IsOkAndHolds(ElementsAreArray(first_block_hits_start, hits1.rend()))); } -TEST(PostingListAccessorStorageTest, InvalidHitReturnsInvalidArgument) { - std::string test_dir = GetTestTempDir() + "/test_dir"; - std::string file_name = test_dir + "/test_file.idx.index"; - Filesystem filesystem; - ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); - ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); - - ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name, &filesystem)); - ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, - PostingListAccessor::Create(&flash_index_storage)); +TEST_F(PostingListAccessorTest, InvalidHitReturnsInvalidArgument) { + ICING_ASSERT_OK_AND_ASSIGN( + PostingListAccessor pl_accessor, + PostingListAccessor::Create(flash_index_storage_.get(), + serializer_.get())); Hit invalid_hit; EXPECT_THAT(pl_accessor.PrependHit(invalid_hit), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST(PostingListAccessorStorageTest, HitsNotDecreasingReturnsInvalidArgument) { - std::string test_dir = GetTestTempDir() + "/test_dir"; - std::string file_name = test_dir + "/test_file.idx.index"; - Filesystem filesystem; - 
ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); - ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); - - ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name, &filesystem)); - ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, - PostingListAccessor::Create(&flash_index_storage)); +TEST_F(PostingListAccessorTest, HitsNotDecreasingReturnsInvalidArgument) { + ICING_ASSERT_OK_AND_ASSIGN( + PostingListAccessor pl_accessor, + PostingListAccessor::Create(flash_index_storage_.get(), + serializer_.get())); Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultTermFrequency); ICING_ASSERT_OK(pl_accessor.PrependHit(hit1)); @@ -336,43 +329,32 @@ TEST(PostingListAccessorStorageTest, HitsNotDecreasingReturnsInvalidArgument) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST(PostingListAccessorStorageTest, NewPostingListNoHitsAdded) { - std::string test_dir = GetTestTempDir() + "/test_dir"; - std::string file_name = test_dir + "/test_file.idx.index"; - Filesystem filesystem; - ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); - ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); - - ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name, &filesystem)); - ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, - PostingListAccessor::Create(&flash_index_storage)); +TEST_F(PostingListAccessorTest, NewPostingListNoHitsAdded) { + ICING_ASSERT_OK_AND_ASSIGN( + PostingListAccessor pl_accessor, + PostingListAccessor::Create(flash_index_storage_.get(), + serializer_.get())); PostingListAccessor::FinalizeResult result1 = PostingListAccessor::Finalize(std::move(pl_accessor)); EXPECT_THAT(result1.status, StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST(PostingListAccessorStorageTest, PreexistingPostingListNoHitsAdded) { - std::string test_dir = 
GetTestTempDir() + "/test_dir"; - std::string file_name = test_dir + "/test_file.idx.index"; - Filesystem filesystem; - ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); - ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); - - ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, - FlashIndexStorage::Create(file_name, &filesystem)); - ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, - PostingListAccessor::Create(&flash_index_storage)); +TEST_F(PostingListAccessorTest, PreexistingPostingListNoHitsAdded) { + ICING_ASSERT_OK_AND_ASSIGN( + PostingListAccessor pl_accessor, + PostingListAccessor::Create(flash_index_storage_.get(), + serializer_.get())); Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultTermFrequency); ICING_ASSERT_OK(pl_accessor.PrependHit(hit1)); PostingListAccessor::FinalizeResult result1 = PostingListAccessor::Finalize(std::move(pl_accessor)); ICING_ASSERT_OK(result1.status); - ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor2, - PostingListAccessor::CreateFromExisting( - &flash_index_storage, result1.id)); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListAccessor pl_accessor2, + PostingListAccessor::CreateFromExisting(flash_index_storage_.get(), + serializer_.get(), result1.id)); PostingListAccessor::FinalizeResult result2 = PostingListAccessor::Finalize(std::move(pl_accessor2)); ICING_ASSERT_OK(result2.status); diff --git a/icing/index/main/posting-list-used.cc b/icing/index/main/posting-list-used-hit-serializer.cc index 62e73e5..d45a428 100644 --- a/icing/index/main/posting-list-used.cc +++ b/icing/index/main/posting-list-used-hit-serializer.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019 Google LLC +// Copyright (C) 2022 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -12,17 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/index/main/posting-list-used.h" +#include "icing/index/main/posting-list-used-hit-serializer.h" -#include <algorithm> -#include <cinttypes> #include <cstdint> +#include <cstring> #include <limits> +#include <vector> #include "icing/absl_ports/canonical_errors.h" -#include "icing/index/main/posting-list-utils.h" +#include "icing/file/posting_list/posting-list-common.h" +#include "icing/file/posting_list/posting-list-used.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/legacy/index/icing-bit-util.h" +#include "icing/util/logging.h" #include "icing/util/status-macros.h" namespace icing { @@ -30,97 +32,110 @@ namespace lib { namespace { -uint32_t GetTermFrequencyByteSize(const Hit &hit) { +uint32_t GetTermFrequencyByteSize(const Hit& hit) { return hit.has_term_frequency() ? sizeof(Hit::TermFrequency) : 0; } } // namespace -libtextclassifier3::StatusOr<PostingListUsed> -PostingListUsed::CreateFromPreexistingPostingListUsedRegion( - void *posting_list_buffer, uint32_t size_in_bytes) { - ICING_RETURN_ERROR_IF_NULL(posting_list_buffer); - if (!posting_list_utils::IsValidPostingListSize(size_in_bytes)) { - return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "Requested posting list size %d is invalid!", size_in_bytes)); - } - return PostingListUsed(posting_list_buffer, size_in_bytes); +uint32_t PostingListUsedHitSerializer::GetBytesUsed( + const PostingListUsed* posting_list_used) const { + // The special hits will be included if they represent actual hits. If they + // represent the hit offset or the invalid hit sentinel, they are not + // included. 
+ return posting_list_used->size_in_bytes() - + GetStartByteOffset(posting_list_used); } -libtextclassifier3::StatusOr<PostingListUsed> -PostingListUsed::CreateFromUnitializedRegion(void *posting_list_buffer, - uint32_t size_in_bytes) { - ICING_ASSIGN_OR_RETURN(PostingListUsed posting_list_used, - CreateFromPreexistingPostingListUsedRegion( - posting_list_buffer, size_in_bytes)); - posting_list_used.Clear(); - return posting_list_used; +uint32_t PostingListUsedHitSerializer::GetMinPostingListSizeToFit( + const PostingListUsed* posting_list_used) const { + if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) { + // If in either the FULL state or ALMOST_FULL state, this posting list *is* + // the minimum size posting list that can fit these hits. So just return the + // size of the posting list. + return posting_list_used->size_in_bytes(); + } + + // In NOT_FULL status BytesUsed contains no special hits. The minimum sized + // posting list that would be guaranteed to fit these hits would be + // ALMOST_FULL, with kInvalidHit in special_hit(0), the uncompressed Hit in + // special_hit(1) and the n compressed hits in the compressed region. + // BytesUsed contains one uncompressed Hit and n compressed hits. Therefore, + // fitting these hits into a posting list would require BytesUsed plus one + // extra hit. + return GetBytesUsed(posting_list_used) + sizeof(Hit); } -void PostingListUsed::Clear() { - // Safe to ignore return value because size_in_bytes_ a valid argument. - set_start_byte_offset(size_in_bytes_); +void PostingListUsedHitSerializer::Clear( + PostingListUsed* posting_list_used) const { + // Safe to ignore return value because posting_list_used->size_in_bytes() is + // a valid argument. 
+ SetStartByteOffset(posting_list_used, + /*offset=*/posting_list_used->size_in_bytes()); } -libtextclassifier3::Status PostingListUsed::MoveFrom(PostingListUsed *other) { - ICING_RETURN_ERROR_IF_NULL(other); - if (other->MinPostingListSizeToFit() > size_in_bytes_) { +libtextclassifier3::Status PostingListUsedHitSerializer::MoveFrom( + PostingListUsed* dst, PostingListUsed* src) const { + ICING_RETURN_ERROR_IF_NULL(dst); + ICING_RETURN_ERROR_IF_NULL(src); + if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "other->MinPostingListSizeToFit %d must be larger than size %d.", - other->MinPostingListSizeToFit(), size_in_bytes_)); + "src MinPostingListSizeToFit %d must be larger than size %d.", + GetMinPostingListSizeToFit(src), dst->size_in_bytes())); } - if (!IsPostingListValid()) { + if (!IsPostingListValid(dst)) { return absl_ports::FailedPreconditionError( - "This posting list is in an invalid state and can't be used!"); + "Dst posting list is in an invalid state and can't be used!"); } - if (!other->IsPostingListValid()) { + if (!IsPostingListValid(src)) { return absl_ports::InvalidArgumentError( - "Cannot MoveFrom an invalid posting list!"); + "Cannot MoveFrom an invalid src posting list!"); } - // Pop just enough hits that all of other's compressed hits fit in - // this posting_list's compressed area. Then we can memcpy that area. + // Pop just enough hits that all of src's compressed hits fit in + // dst posting_list's compressed area. Then we can memcpy that area. 
std::vector<Hit> hits; - while (other->full() || other->almost_full() || - (size_in_bytes_ - posting_list_utils::kSpecialHitsSize < - other->BytesUsed())) { - if (!other->GetHitsInternal(/*limit=*/1, /*pop=*/true, &hits).ok()) { + while (IsFull(src) || IsAlmostFull(src) || + (dst->size_in_bytes() - kSpecialHitsSize < GetBytesUsed(src))) { + if (!GetHitsInternal(src, /*limit=*/1, /*pop=*/true, &hits).ok()) { return absl_ports::AbortedError( - "Unable to retrieve hits from other posting list."); + "Unable to retrieve hits from src posting list."); } } // memcpy the area and set up start byte offset. - Clear(); - memcpy(posting_list_buffer_ + size_in_bytes_ - other->BytesUsed(), - other->posting_list_buffer_ + other->get_start_byte_offset(), - other->BytesUsed()); - // Because we popped all hits from other outside of the compressed area and we - // guaranteed that other->BytesUsed is less than size_in_bytes_ - + Clear(dst); + memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src), + src->posting_list_buffer() + GetStartByteOffset(src), + GetBytesUsed(src)); + // Because we popped all hits from src outside of the compressed area and we + // guaranteed that GetBytesUsed(src) is less than dst->size_in_bytes() - // kSpecialHitSize. This is guaranteed to be a valid byte offset for the // NOT_FULL state, so ignoring the value is safe. - set_start_byte_offset(size_in_bytes_ - other->BytesUsed()); + SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src)); // Put back remaining hits. for (size_t i = 0; i < hits.size(); i++) { - const Hit &hit = hits[hits.size() - i - 1]; + const Hit& hit = hits[hits.size() - i - 1]; // PrependHit can return either INVALID_ARGUMENT - if hit is invalid or not // less than the previous hit - or RESOURCE_EXHAUSTED. RESOURCE_EXHAUSTED // should be impossible because we've already assured that there is enough // room above. 
- ICING_RETURN_IF_ERROR(PrependHit(hit)); + ICING_RETURN_IF_ERROR(PrependHit(dst, hit)); } - other->Clear(); + Clear(src); return libtextclassifier3::Status::OK; } -uint32_t PostingListUsed::GetPadEnd(uint32_t offset) const { +uint32_t PostingListUsedHitSerializer::GetPadEnd( + const PostingListUsed* posting_list_used, uint32_t offset) const { Hit::Value pad; uint32_t pad_end = offset; - while (pad_end < size_in_bytes_) { - size_t pad_len = VarInt::Decode(posting_list_buffer_ + pad_end, &pad); + while (pad_end < posting_list_used->size_in_bytes()) { + size_t pad_len = VarInt::Decode( + posting_list_used->posting_list_buffer() + pad_end, &pad); if (pad != 0) { // No longer a pad. break; @@ -130,22 +145,24 @@ uint32_t PostingListUsed::GetPadEnd(uint32_t offset) const { return pad_end; } -bool PostingListUsed::PadToEnd(uint32_t start, uint32_t end) { - if (end > size_in_bytes_) { +bool PostingListUsedHitSerializer::PadToEnd(PostingListUsed* posting_list_used, + uint32_t start, + uint32_t end) const { + if (end > posting_list_used->size_in_bytes()) { ICING_LOG(ERROR) << "Cannot pad a region that ends after size!"; return false; } // In VarInt a value of 0 encodes to 0. - memset(posting_list_buffer_ + start, 0, end - start); + memset(posting_list_used->posting_list_buffer() + start, 0, end - start); return true; } -libtextclassifier3::Status PostingListUsed::PrependHitToAlmostFull( - const Hit &hit) { +libtextclassifier3::Status PostingListUsedHitSerializer::PrependHitToAlmostFull( + PostingListUsed* posting_list_used, const Hit& hit) const { // Get delta between first hit and the new hit. Try to fit delta // in the padded area and put new hit at the special position 1. - // Calling ValueOrDie is safe here because 1 < kNumSpecialHits. - Hit cur = get_special_hit(1).ValueOrDie(); + // Calling ValueOrDie is safe here because 1 < kNumSpecialData. 
+ Hit cur = GetSpecialHit(posting_list_used, /*index=*/1).ValueOrDie(); if (cur.value() <= hit.value()) { return absl_ports::InvalidArgumentError( "Hit being prepended must be strictly less than the most recent Hit"); @@ -155,58 +172,62 @@ libtextclassifier3::Status PostingListUsed::PrependHitToAlmostFull( size_t delta_len = VarInt::Encode(delta, delta_buf); uint32_t cur_term_frequency_bytes = GetTermFrequencyByteSize(cur); - uint32_t pad_end = GetPadEnd(posting_list_utils::kSpecialHitsSize); + uint32_t pad_end = GetPadEnd(posting_list_used, + /*offset=*/kSpecialHitsSize); - if (pad_end >= posting_list_utils::kSpecialHitsSize + delta_len + - cur_term_frequency_bytes) { + if (pad_end >= kSpecialHitsSize + delta_len + cur_term_frequency_bytes) { // Pad area has enough space for delta and term_frequency of existing hit // (cur). Write delta at pad_end - delta_len - cur_term_frequency_bytes. - uint8_t *delta_offset = - posting_list_buffer_ + pad_end - delta_len - cur_term_frequency_bytes; + uint8_t* delta_offset = posting_list_used->posting_list_buffer() + pad_end - + delta_len - cur_term_frequency_bytes; memcpy(delta_offset, delta_buf, delta_len); // Now copy term_frequency. Hit::TermFrequency term_frequency = cur.term_frequency(); - uint8_t *term_frequency_offset = delta_offset + delta_len; + uint8_t* term_frequency_offset = delta_offset + delta_len; memcpy(term_frequency_offset, &term_frequency, cur_term_frequency_bytes); // Now first hit is the new hit, at special position 1. Safe to ignore the - // return value because 1 < kNumSpecialHits. - set_special_hit(1, hit); + // return value because 1 < kNumSpecialData. + SetSpecialHit(posting_list_used, /*index=*/1, hit); // Safe to ignore the return value because sizeof(Hit) is a valid argument. - set_start_byte_offset(sizeof(Hit)); + SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit)); } else { // No space for delta. We put the new hit at special position 0 // and go to the full state. 
Safe to ignore the return value because 1 < - // kNumSpecialHits. - set_special_hit(0, hit); + // kNumSpecialData. + SetSpecialHit(posting_list_used, /*index=*/0, hit); } return libtextclassifier3::Status::OK; } -void PostingListUsed::PrependHitToEmpty(const Hit &hit) { +void PostingListUsedHitSerializer::PrependHitToEmpty( + PostingListUsed* posting_list_used, const Hit& hit) const { // First hit to be added. Just add verbatim, no compression. - if (size_in_bytes_ == posting_list_utils::kSpecialHitsSize) { - // Safe to ignore the return value because 1 < kNumSpecialHits - set_special_hit(1, hit); + if (posting_list_used->size_in_bytes() == kSpecialHitsSize) { + // Safe to ignore the return value because 1 < kNumSpecialData + SetSpecialHit(posting_list_used, /*index=*/1, hit); // Safe to ignore the return value because sizeof(Hit) is a valid argument. - set_start_byte_offset(sizeof(Hit)); + SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit)); } else { // Since this is the first hit, size != kSpecialHitsSize and // size % sizeof(Hit) == 0, we know that there is room to fit 'hit' into // the compressed region, so ValueOrDie is safe. - uint32_t offset = PrependHitUncompressed(hit, size_in_bytes_).ValueOrDie(); + uint32_t offset = + PrependHitUncompressed(posting_list_used, hit, + /*offset=*/posting_list_used->size_in_bytes()) + .ValueOrDie(); // Safe to ignore the return value because PrependHitUncompressed is // guaranteed to return a valid offset. - set_start_byte_offset(offset); + SetStartByteOffset(posting_list_used, offset); } } -libtextclassifier3::Status PostingListUsed::PrependHitToNotFull( - const Hit &hit, uint32_t offset) { +libtextclassifier3::Status PostingListUsedHitSerializer::PrependHitToNotFull( + PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const { // First hit in compressed area. It is uncompressed. See if delta // between the first hit and new hit will still fit in the // compressed area. 
- if (offset + sizeof(Hit::Value) > size_in_bytes_) { + if (offset + sizeof(Hit::Value) > posting_list_used->size_in_bytes()) { // The first hit in the compressed region *should* be uncompressed, but // somehow there isn't enough room between offset and the end of the // compressed area to fit an uncompressed hit. This should NEVER happen. @@ -214,7 +235,8 @@ libtextclassifier3::Status PostingListUsed::PrependHitToNotFull( "Posting list is in an invalid state."); } Hit::Value cur_value; - memcpy(&cur_value, posting_list_buffer_ + offset, sizeof(Hit::Value)); + memcpy(&cur_value, posting_list_used->posting_list_buffer() + offset, + sizeof(Hit::Value)); if (cur_value <= hit.value()) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Hit %d being prepended must be strictly less than the most recent " @@ -228,45 +250,49 @@ libtextclassifier3::Status PostingListUsed::PrependHitToNotFull( // offset now points to one past the end of the first hit. offset += sizeof(Hit::Value); - if (posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value) + delta_len + + if (kSpecialHitsSize + sizeof(Hit::Value) + delta_len + hit_term_frequency_bytes <= offset) { // Enough space for delta in compressed area. // Prepend delta. offset -= delta_len; - memcpy(posting_list_buffer_ + offset, delta_buf, delta_len); + memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf, + delta_len); // Prepend new hit with (possibly) its term_frequency. We know that there is // room for 'hit' because of the if statement above, so calling ValueOrDie // is safe. - offset = PrependHitUncompressed(hit, offset).ValueOrDie(); + offset = + PrependHitUncompressed(posting_list_used, hit, offset).ValueOrDie(); // offset is guaranteed to be valid here. So it's safe to ignore the return // value. 
The if above will guarantee that offset >= kSpecialHitSize and < - // size_in_bytes_ because the if ensures that there is enough room between - // offset and kSpecialHitSize to fit the delta of the previous hit, any - // term_frequency and the uncompressed hit. - set_start_byte_offset(offset); - } else if (posting_list_utils::kSpecialHitsSize + delta_len <= offset) { + // posting_list_used->size_in_bytes() because the if ensures that there is + // enough room between offset and kSpecialHitSize to fit the delta of the + // previous hit, any term_frequency and the uncompressed hit. + SetStartByteOffset(posting_list_used, offset); + } else if (kSpecialHitsSize + delta_len <= offset) { // Only have space for delta. The new hit must be put in special // position 1. // Prepend delta. offset -= delta_len; - memcpy(posting_list_buffer_ + offset, delta_buf, delta_len); + memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf, + delta_len); // Prepend pad. Safe to ignore the return value of PadToEnd because offset - // must be less than size_in_bytes_. Otherwise, this function already would - // have returned FAILED_PRECONDITION. - PadToEnd(posting_list_utils::kSpecialHitsSize, offset); + // must be less than posting_list_used->size_in_bytes(). Otherwise, this + // function already would have returned FAILED_PRECONDITION. + PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize, + /*end=*/offset); // Put new hit in special position 1. Safe to ignore return value because 1 - // < kNumSpecialHits. - set_special_hit(1, hit); + // < kNumSpecialData. + SetSpecialHit(posting_list_used, /*index=*/1, hit); // State almost_full. Safe to ignore the return value because sizeof(Hit) is // a valid argument. - set_start_byte_offset(sizeof(Hit)); + SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit)); } else { // Very rare case where delta is larger than sizeof(Hit::Value) // (i.e. varint delta encoding expanded required storage). 
We @@ -277,59 +303,65 @@ libtextclassifier3::Status PostingListUsed::PrependHitToNotFull( // Therefore, offset must be less than kSpecialHitSize + 5. Since posting // list size must be divisible by sizeof(Hit) (5), it is guaranteed that // offset < size_in_bytes, so it is safe to ignore the return value here. - ConsumeTermFrequencyIfPresent(&cur, &offset); + ConsumeTermFrequencyIfPresent(posting_list_used, &cur, &offset); // Safe to ignore the return value of PadToEnd because offset must be less - // than size_in_bytes_. Otherwise, this function already would have returned - // FAILED_PRECONDITION. - PadToEnd(posting_list_utils::kSpecialHitsSize, offset); - // Safe to ignore the return value here because 0 and 1 < kNumSpecialHits. - set_special_hit(1, cur); - set_special_hit(0, hit); + // than posting_list_used->size_in_bytes(). Otherwise, this function + // already would have returned FAILED_PRECONDITION. + PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize, + /*end=*/offset); + // Safe to ignore the return value here because 0 and 1 < kNumSpecialData. + SetSpecialHit(posting_list_used, /*index=*/1, cur); + SetSpecialHit(posting_list_used, /*index=*/0, hit); } return libtextclassifier3::Status::OK; } -libtextclassifier3::Status PostingListUsed::PrependHit(const Hit &hit) { +libtextclassifier3::Status PostingListUsedHitSerializer::PrependHit( + PostingListUsed* posting_list_used, const Hit& hit) const { static_assert(sizeof(Hit::Value) <= sizeof(uint64_t), "Hit::Value cannot be larger than 8 bytes because the delta " "must be able to fit in 8 bytes."); if (!hit.is_valid()) { return absl_ports::InvalidArgumentError("Cannot prepend an invalid hit!"); } - if (!IsPostingListValid()) { + if (!IsPostingListValid(posting_list_used)) { return absl_ports::FailedPreconditionError( "This PostingListUsed is in an invalid state and can't add any hits!"); } - if (full()) { + if (IsFull(posting_list_used)) { // State full: no space left. 
return absl_ports::ResourceExhaustedError("No more room for hits"); - } else if (almost_full()) { - return PrependHitToAlmostFull(hit); - } else if (empty()) { - PrependHitToEmpty(hit); + } else if (IsAlmostFull(posting_list_used)) { + return PrependHitToAlmostFull(posting_list_used, hit); + } else if (IsEmpty(posting_list_used)) { + PrependHitToEmpty(posting_list_used, hit); return libtextclassifier3::Status::OK; } else { - uint32_t offset = get_start_byte_offset(); - return PrependHitToNotFull(hit, offset); + uint32_t offset = GetStartByteOffset(posting_list_used); + return PrependHitToNotFull(posting_list_used, hit, offset); } } -libtextclassifier3::StatusOr<std::vector<Hit>> PostingListUsed::GetHits() - const { +libtextclassifier3::StatusOr<std::vector<Hit>> +PostingListUsedHitSerializer::GetHits( + const PostingListUsed* posting_list_used) const { std::vector<Hit> hits_out; - ICING_RETURN_IF_ERROR(GetHits(&hits_out)); + ICING_RETURN_IF_ERROR(GetHits(posting_list_used, &hits_out)); return hits_out; } -libtextclassifier3::Status PostingListUsed::GetHits( - std::vector<Hit> *hits_out) const { - return GetHitsInternal(/*limit=*/std::numeric_limits<uint32_t>::max(), +libtextclassifier3::Status PostingListUsedHitSerializer::GetHits( + const PostingListUsed* posting_list_used, + std::vector<Hit>* hits_out) const { + return GetHitsInternal(posting_list_used, + /*limit=*/std::numeric_limits<uint32_t>::max(), /*pop=*/false, hits_out); } -libtextclassifier3::Status PostingListUsed::PopFrontHits(uint32_t num_hits) { - if (num_hits == 1 && full()) { +libtextclassifier3::Status PostingListUsedHitSerializer::PopFrontHits( + PostingListUsed* posting_list_used, uint32_t num_hits) const { + if (num_hits == 1 && IsFull(posting_list_used)) { // The PL is in full status which means that we save 2 uncompressed hits in // the 2 special postions. But full status may be reached by 2 different // statuses. 
@@ -383,31 +415,35 @@ libtextclassifier3::Status PostingListUsed::PopFrontHits(uint32_t num_hits) { // Popping 2 hits should never fail because we've just ensured that the // posting list is in the FULL state. - ICING_RETURN_IF_ERROR(GetHitsInternal(/*limit=*/2, /*pop=*/true, &out)); + ICING_RETURN_IF_ERROR( + GetHitsInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out)); // PrependHit should never fail because out[1] is a valid hit less than // previous hits in the posting list and because there's no way that the // posting list could run out of room because it previously stored this hit // AND another hit. - PrependHit(out[1]); + PrependHit(posting_list_used, out[1]); } else if (num_hits > 0) { - return GetHitsInternal(/*limit=*/num_hits, /*pop=*/true, nullptr); + return GetHitsInternal(posting_list_used, /*limit=*/num_hits, /*pop=*/true, + nullptr); } return libtextclassifier3::Status::OK; } -libtextclassifier3::Status PostingListUsed::GetHitsInternal( - uint32_t limit, bool pop, std::vector<Hit> *out) const { +libtextclassifier3::Status PostingListUsedHitSerializer::GetHitsInternal( + const PostingListUsed* posting_list_used, uint32_t limit, bool pop, + std::vector<Hit>* out) const { // Put current uncompressed val here. Hit::Value val = Hit::kInvalidValue; - uint32_t offset = get_start_byte_offset(); + uint32_t offset = GetStartByteOffset(posting_list_used); uint32_t count = 0; // First traverse the first two special positions. - while (count < limit && offset < posting_list_utils::kSpecialHitsSize) { + while (count < limit && offset < kSpecialHitsSize) { // Calling ValueOrDie is safe here because offset / sizeof(Hit) < - // kNumSpecialHits because of the check above. - Hit hit = get_special_hit(offset / sizeof(Hit)).ValueOrDie(); + // kNumSpecialData because of the check above. 
+ Hit hit = GetSpecialHit(posting_list_used, /*index=*/offset / sizeof(Hit)) + .ValueOrDie(); val = hit.value(); if (out != nullptr) { out->push_back(hit); @@ -417,25 +453,26 @@ libtextclassifier3::Status PostingListUsed::GetHitsInternal( } // If special position 1 was set then we need to skip padding. - if (val != Hit::kInvalidValue && - offset == posting_list_utils::kSpecialHitsSize) { - offset = GetPadEnd(offset); + if (val != Hit::kInvalidValue && offset == kSpecialHitsSize) { + offset = GetPadEnd(posting_list_used, offset); } - while (count < limit && offset < size_in_bytes_) { + while (count < limit && offset < posting_list_used->size_in_bytes()) { if (val == Hit::kInvalidValue) { // First hit is in compressed area. Put that in val. - memcpy(&val, posting_list_buffer_ + offset, sizeof(Hit::Value)); + memcpy(&val, posting_list_used->posting_list_buffer() + offset, + sizeof(Hit::Value)); offset += sizeof(Hit::Value); } else { // Now we have delta encoded subsequent hits. Decode and push. uint64_t delta; - offset += VarInt::Decode(posting_list_buffer_ + offset, &delta); + offset += VarInt::Decode( + posting_list_used->posting_list_buffer() + offset, &delta); val += delta; } Hit hit(val); libtextclassifier3::Status status = - ConsumeTermFrequencyIfPresent(&hit, &offset); + ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset); if (!status.ok()) { // This posting list has been corrupted somehow. The first hit of the // posting list claims to have a term frequency, but there's no more room @@ -453,29 +490,32 @@ libtextclassifier3::Status PostingListUsed::GetHitsInternal( } if (pop) { - PostingListUsed *mutable_this = const_cast<PostingListUsed *>(this); + PostingListUsed* mutable_posting_list_used = + const_cast<PostingListUsed*>(posting_list_used); // Modify the posting list so that we pop all hits actually // traversed. 
- if (offset >= posting_list_utils::kSpecialHitsSize && - offset < size_in_bytes_) { + if (offset >= kSpecialHitsSize && + offset < posting_list_used->size_in_bytes()) { // In the compressed area. Pop and reconstruct. offset/val is // the last traversed hit, which we must discard. So move one // more forward. uint64_t delta; - offset += VarInt::Decode(posting_list_buffer_ + offset, &delta); + offset += VarInt::Decode( + posting_list_used->posting_list_buffer() + offset, &delta); val += delta; // Now val is the first hit of the new posting list. - if (posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value) <= offset) { + if (kSpecialHitsSize + sizeof(Hit::Value) <= offset) { // val fits in compressed area. Simply copy. offset -= sizeof(Hit::Value); - memcpy(posting_list_buffer_ + offset, &val, sizeof(Hit::Value)); + memcpy(mutable_posting_list_used->posting_list_buffer() + offset, &val, + sizeof(Hit::Value)); } else { // val won't fit in compressed area. Also see if there is a // term_frequency. Hit hit(val); libtextclassifier3::Status status = - ConsumeTermFrequencyIfPresent(&hit, &offset); + ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset); if (!status.ok()) { // This posting list has been corrupted somehow. The first hit of // the posting list claims to have a term frequency, but there's no @@ -487,20 +527,24 @@ libtextclassifier3::Status PostingListUsed::GetHitsInternal( } return absl_ports::InternalError("Posting list has been corrupted!"); } - // Okay to ignore the return value here because 1 < kNumSpecialHits. - mutable_this->set_special_hit(1, hit); + // Okay to ignore the return value here because 1 < kNumSpecialData. + SetSpecialHit(mutable_posting_list_used, /*index=*/1, hit); // Prepend pad. Safe to ignore the return value of PadToEnd because - // offset must be less than size_in_bytes_ thanks to the if above. 
- mutable_this->PadToEnd(posting_list_utils::kSpecialHitsSize, offset); + // offset must be less than posting_list_used->size_in_bytes() thanks to + // the if above. + PadToEnd(mutable_posting_list_used, + /*start=*/kSpecialHitsSize, + /*end=*/offset); offset = sizeof(Hit); } } // offset is guaranteed to be valid so ignoring the return value of // set_start_byte_offset is safe. It falls into one of four scenarios: - // Scenario 1: the above if was false because offset is not < size_in_bytes_ - // In this case, offset must be == size_in_bytes_ because we reached - // offset by unwinding hits on the posting list. + // Scenario 1: the above if was false because offset is not < + // posting_list_used->size_in_bytes() + // In this case, offset must be == posting_list_used->size_in_bytes() + // because we reached offset by unwinding hits on the posting list. // Scenario 2: offset is < kSpecialHitSize // In this case, offset is guaranteed to be either 0 or sizeof(Hit) // because offset is incremented by sizeof(Hit) within the first while @@ -514,104 +558,91 @@ libtextclassifier3::Status PostingListUsed::GetHitsInternal( // in the posting list is too large to fit as an uncompressed hit in the // in the compressed region. Therefore, it must be stored in a special hit // and offset will be sizeof(Hit). 
- mutable_this->set_start_byte_offset(offset); + SetStartByteOffset(mutable_posting_list_used, offset); } return libtextclassifier3::Status::OK; } -libtextclassifier3::StatusOr<Hit> PostingListUsed::get_special_hit( - uint32_t index) const { +libtextclassifier3::StatusOr<Hit> PostingListUsedHitSerializer::GetSpecialHit( + const PostingListUsed* posting_list_used, uint32_t index) const { static_assert(sizeof(Hit::Value) >= sizeof(uint32_t), "HitTooSmall"); - if (index >= posting_list_utils::kNumSpecialHits || index < 0) { + if (index >= kNumSpecialData || index < 0) { return absl_ports::InvalidArgumentError( "Special hits only exist at indices 0 and 1"); } Hit val; - memcpy(&val, posting_list_buffer_ + index * sizeof(val), sizeof(val)); + memcpy(&val, posting_list_used->posting_list_buffer() + index * sizeof(val), + sizeof(val)); return val; } -bool PostingListUsed::set_special_hit(uint32_t index, const Hit &val) { - if (index >= posting_list_utils::kNumSpecialHits || index < 0) { +bool PostingListUsedHitSerializer::SetSpecialHit( + PostingListUsed* posting_list_used, uint32_t index, const Hit& val) const { + if (index >= kNumSpecialData || index < 0) { ICING_LOG(ERROR) << "Special hits only exist at indices 0 and 1"; return false; } - memcpy(posting_list_buffer_ + index * sizeof(val), &val, sizeof(val)); + memcpy(posting_list_used->posting_list_buffer() + index * sizeof(val), &val, + sizeof(val)); return true; } -uint32_t PostingListUsed::BytesUsed() const { - // The special hits will be included if they represent actual hits. If they - // represent the hit offset or the invalid hit sentinel, they are not - // included. - return size_in_bytes_ - get_start_byte_offset(); -} - -uint32_t PostingListUsed::MinPostingListSizeToFit() const { - if (full() || almost_full()) { - // If in either the FULL state or ALMOST_FULL state, this posting list *is* - // the minimum size posting list that can fit these hits. So just return the - // size of the posting list. 
- return size_in_bytes_; - } - - // In NOT_FULL status BytesUsed contains no special hits. The minimum sized - // posting list that would be guaranteed to fit these hits would be - // ALMOST_FULL, with kInvalidHit in special_hit(0), the uncompressed Hit in - // special_hit(1) and the n compressed hits in the compressed region. - // BytesUsed contains one uncompressed Hit and n compressed hits. Therefore, - // fitting these hits into a posting list would require BytesUsed plus one - // extra hit. - return BytesUsed() + sizeof(Hit); -} - -bool PostingListUsed::IsPostingListValid() const { - if (almost_full()) { +bool PostingListUsedHitSerializer::IsPostingListValid( + const PostingListUsed* posting_list_used) const { + if (IsAlmostFull(posting_list_used)) { // Special Hit 1 should hold a Hit. Calling ValueOrDie is safe because we - // know that 1 < kNumSpecialHits. - if (!get_special_hit(1).ValueOrDie().is_valid()) { + // know that 1 < kNumSpecialData. + if (!GetSpecialHit(posting_list_used, /*index=*/1) + .ValueOrDie() + .is_valid()) { ICING_LOG(ERROR) << "Both special hits cannot be invalid at the same time."; return false; } - } else if (!full()) { + } else if (!IsFull(posting_list_used)) { // NOT_FULL. Special Hit 0 should hold a valid offset. Calling ValueOrDie is - // safe because we know that 0 < kNumSpecialHits. - if (get_special_hit(0).ValueOrDie().value() > size_in_bytes_ || - get_special_hit(0).ValueOrDie().value() < - posting_list_utils::kSpecialHitsSize) { - ICING_LOG(ERROR) << "Hit: " << get_special_hit(0).ValueOrDie().value() - << " size: " << size_in_bytes_ - << " sp size: " << posting_list_utils::kSpecialHitsSize; + // safe because we know that 0 < kNumSpecialData. 
+ if (GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() > + posting_list_used->size_in_bytes() || + GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() < + kSpecialHitsSize) { + ICING_LOG(ERROR) + << "Hit: " + << GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() + << " size: " << posting_list_used->size_in_bytes() + << " sp size: " << kSpecialHitsSize; return false; } } return true; } -uint32_t PostingListUsed::get_start_byte_offset() const { - if (full()) { +uint32_t PostingListUsedHitSerializer::GetStartByteOffset( + const PostingListUsed* posting_list_used) const { + if (IsFull(posting_list_used)) { return 0; - } else if (almost_full()) { + } else if (IsAlmostFull(posting_list_used)) { return sizeof(Hit); } else { // NOT_FULL, calling ValueOrDie is safe because we know that 0 < - // kNumSpecialHits. - return get_special_hit(0).ValueOrDie().value(); + // kNumSpecialData. + return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value(); } } -bool PostingListUsed::set_start_byte_offset(uint32_t offset) { - if (offset > size_in_bytes_) { +bool PostingListUsedHitSerializer::SetStartByteOffset( + PostingListUsed* posting_list_used, uint32_t offset) const { + if (offset > posting_list_used->size_in_bytes()) { ICING_LOG(ERROR) << "offset cannot be a value greater than size " - << size_in_bytes_ << ". offset is " << offset << "."; + << posting_list_used->size_in_bytes() << ". offset is " + << offset << "."; return false; } - if (offset < posting_list_utils::kSpecialHitsSize && offset > sizeof(Hit)) { + if (offset < kSpecialHitsSize && offset > sizeof(Hit)) { ICING_LOG(ERROR) << "offset cannot be a value between (" << sizeof(Hit) - << ", " << posting_list_utils::kSpecialHitsSize - << "). offset is " << offset << "."; + << ", " << kSpecialHitsSize << "). 
offset is " << offset + << "."; return false; } if (offset < sizeof(Hit) && offset != 0) { @@ -619,55 +650,61 @@ bool PostingListUsed::set_start_byte_offset(uint32_t offset) { << "). offset is " << offset << "."; return false; } - if (offset >= posting_list_utils::kSpecialHitsSize) { + if (offset >= kSpecialHitsSize) { // not_full state. Safe to ignore the return value because 0 and 1 are both - // < kNumSpecialHits. - set_special_hit(0, Hit(offset)); - set_special_hit(1, Hit()); + // < kNumSpecialData. + SetSpecialHit(posting_list_used, /*index=*/0, Hit(offset)); + SetSpecialHit(posting_list_used, /*index=*/1, Hit()); } else if (offset == sizeof(Hit)) { // almost_full state. Safe to ignore the return value because 1 is both < - // kNumSpecialHits. - set_special_hit(0, Hit()); + // kNumSpecialData. + SetSpecialHit(posting_list_used, /*index=*/0, Hit()); } // Nothing to do for the FULL state - the offset isn't actually stored // anywhere and both special hits hold valid hits. return true; } -libtextclassifier3::StatusOr<uint32_t> PostingListUsed::PrependHitUncompressed( - const Hit &hit, uint32_t offset) { +libtextclassifier3::StatusOr<uint32_t> +PostingListUsedHitSerializer::PrependHitUncompressed( + PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const { if (hit.has_term_frequency()) { - if (offset < posting_list_utils::kSpecialHitsSize + sizeof(Hit)) { + if (offset < kSpecialHitsSize + sizeof(Hit)) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Not enough room to prepend Hit at offset %d.", offset)); } offset -= sizeof(Hit); - memcpy(posting_list_buffer_ + offset, &hit, sizeof(Hit)); + memcpy(posting_list_used->posting_list_buffer() + offset, &hit, + sizeof(Hit)); } else { - if (offset < posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value)) { + if (offset < kSpecialHitsSize + sizeof(Hit::Value)) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Not enough room to prepend Hit::Value at 
offset %d.", offset)); } offset -= sizeof(Hit::Value); Hit::Value val = hit.value(); - memcpy(posting_list_buffer_ + offset, &val, sizeof(Hit::Value)); + memcpy(posting_list_used->posting_list_buffer() + offset, &val, + sizeof(Hit::Value)); } return offset; } -libtextclassifier3::Status PostingListUsed::ConsumeTermFrequencyIfPresent( - Hit *hit, uint32_t *offset) const { +libtextclassifier3::Status +PostingListUsedHitSerializer::ConsumeTermFrequencyIfPresent( + const PostingListUsed* posting_list_used, Hit* hit, + uint32_t* offset) const { if (!hit->has_term_frequency()) { // No term frequency to consume. Everything is fine. return libtextclassifier3::Status::OK; } - if (*offset + sizeof(Hit::TermFrequency) > size_in_bytes_) { + if (*offset + sizeof(Hit::TermFrequency) > + posting_list_used->size_in_bytes()) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "offset %d must not point past the end of the posting list of size %d.", - *offset, size_in_bytes_)); + *offset, posting_list_used->size_in_bytes())); } Hit::TermFrequency term_frequency; - memcpy(&term_frequency, posting_list_buffer_ + *offset, + memcpy(&term_frequency, posting_list_used->posting_list_buffer() + *offset, sizeof(Hit::TermFrequency)); *hit = Hit(hit->value(), term_frequency); *offset += sizeof(Hit::TermFrequency); diff --git a/icing/index/main/posting-list-used.h b/icing/index/main/posting-list-used-hit-serializer.h index 8944034..70e3e6c 100644 --- a/icing/index/main/posting-list-used.h +++ b/icing/index/main/posting-list-used-hit-serializer.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019 Google LLC +// Copyright (C) 2022 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,104 +12,93 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef ICING_INDEX_MAIN_POSTING_LIST_USED_H_ -#define ICING_INDEX_MAIN_POSTING_LIST_USED_H_ +#ifndef ICING_INDEX_MAIN_POSTING_LIST_USED_HIT_SERIALIZER_H_ +#define ICING_INDEX_MAIN_POSTING_LIST_USED_HIT_SERIALIZER_H_ -#include <sys/mman.h> - -#include <algorithm> -#include <cstring> +#include <cstdint> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/file/posting_list/posting-list-common.h" +#include "icing/file/posting_list/posting-list-used.h" #include "icing/index/hit/hit.h" -#include "icing/index/main/posting-list-utils.h" -#include "icing/util/logging.h" namespace icing { namespace lib { -// A posting list with hits in it. Layout described in comments in -// posting-list-used.cc. -class PostingListUsed { +// A serializer class to serialize hits to PostingListUsed. Layout described in +// comments in posting-list-used-hit-serializer.cc. +class PostingListUsedHitSerializer : public PostingListUsedSerializer { public: - // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes. - // 'Preexisting' means that posting_list_buffer was previously modified by - // another instance of PostingListUsed. - // - // Caller owns the hits buffer and must not free it while using a - // PostingListUsed. - // - // RETURNS: - // - A valid PostingListUsed if successful - // - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size() - // || size_in_bytes % sizeof(Hit) != 0. - // - FAILED_PRECONDITION if posting_list_buffer is null - static libtextclassifier3::StatusOr<PostingListUsed> - CreateFromPreexistingPostingListUsedRegion(void *posting_list_buffer, - uint32_t size_in_bytes); - - // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes - // and initializes the content of the buffer so that the returned - // PostingListUsed is empty. 
- // - // Caller owns the posting_list_buffer buffer and must not free it while using - // a PostingListUsed. - // - // RETURNS: - // - A valid PostingListUsed if successful - // - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size() - // || size_in_bytes % sizeof(Hit) != 0. - // - FAILED_PRECONDITION if posting_list_buffer is null - static libtextclassifier3::StatusOr<PostingListUsed> - CreateFromUnitializedRegion(void *posting_list_buffer, - uint32_t size_in_bytes); - - // Move contents from another posting list. Clears other. + static constexpr uint32_t kSpecialHitsSize = sizeof(Hit) * kNumSpecialData; + + uint32_t GetDataTypeBytes() const override { return sizeof(Hit); } + + uint32_t GetMinPostingListSize() const override { + static constexpr uint32_t kMinPostingListSize = kSpecialHitsSize; + static_assert(sizeof(PostingListIndex) <= kMinPostingListSize, + "PostingListIndex must be small enough to fit in a " + "minimum-sized Posting List."); + + return kMinPostingListSize; + } + + // Min size of posting list that can fit these used bytes (see MoveFrom). + uint32_t GetMinPostingListSizeToFit( + const PostingListUsed* posting_list_used) const override; + + // Returns bytes used by actual hits. + uint32_t GetBytesUsed( + const PostingListUsed* posting_list_used) const override; + + void Clear(PostingListUsed* posting_list_used) const override; + + // Moves contents from posting list 'src' to 'dst'. Clears 'src'. // // RETURNS: - // - OK, if successful - // - INVALID_ARGUMENT if 'other' is not valid or 'other' is too large to fit - // in 'this'. - // - FAILED_PRECONDITION if 'this' posting list is in a corrupted state. - libtextclassifier3::Status MoveFrom(PostingListUsed *other); - - // Min size of posting list that can fit these used bytes. (See - // MoveFrom.) - uint32_t MinPostingListSizeToFit() const; + // - OK on success + // - INVALID_ARGUMENT if 'src' is not valid or 'src' is too large to fit in + // 'dst'. 
+ // - FAILED_PRECONDITION if 'dst' posting list is in a corrupted state. + libtextclassifier3::Status MoveFrom(PostingListUsed* dst, + PostingListUsed* src) const override; // Prepend a hit to the posting list. + // // RETURNS: // - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the - // previously added hit. + // previously added hit. // - RESOURCE_EXHAUSTED if there is no more room to add hit to the posting - // list. - libtextclassifier3::Status PrependHit(const Hit &hit); + // list. + libtextclassifier3::Status PrependHit(PostingListUsed* posting_list_used, + const Hit& hit) const; - // Prepend hits to the posting list. Hits should be sorted in - // descending order (as defined by the less than operator for Hit) + // Prepend hits to the posting list. Hits should be sorted in descending order + // (as defined by the less than operator for Hit) // // Returns the number of hits that could be prepended to the posting list. If // keep_prepended is true, whatever could be prepended is kept, otherwise the // posting list is left in its original state. - template <class T, Hit (*GetHit)(const T &)> - uint32_t PrependHitArray(const T *array, uint32_t num_hits, - bool keep_prepended); + template <class T, Hit (*GetHit)(const T&)> + uint32_t PrependHitArray(PostingListUsed* posting_list_used, const T* array, + uint32_t num_hits, bool keep_prepended) const; // Retrieves the hits stored in the posting list. // // RETURNS: // - On success, a vector of hits sorted by the reverse order of prepending. // - INTERNAL_ERROR if the posting list has been corrupted somehow. - libtextclassifier3::StatusOr<std::vector<Hit>> GetHits() const; + libtextclassifier3::StatusOr<std::vector<Hit>> GetHits( + const PostingListUsed* posting_list_used) const; // Same as GetHits but appends hits to hits_out. // // RETURNS: // - On success, a vector of hits sorted by the reverse order of prepending. // - INTERNAL_ERROR if the posting list has been corrupted somehow. 
- libtextclassifier3::Status GetHits(std::vector<Hit> *hits_out) const; + libtextclassifier3::Status GetHits(const PostingListUsed* posting_list_used, + std::vector<Hit>* hits_out) const; // Undo the last num_hits hits prepended. If num_hits > number of // hits we clear all hits. @@ -117,10 +106,8 @@ class PostingListUsed { // RETURNS: // - OK on success // - INTERNAL_ERROR if the posting list has been corrupted somehow. - libtextclassifier3::Status PopFrontHits(uint32_t num_hits); - - // Returns bytes used by actual hits. - uint32_t BytesUsed() const; + libtextclassifier3::Status PopFrontHits(PostingListUsed* posting_list_used, + uint32_t num_hits) const; private: // Posting list layout formats: @@ -201,71 +188,83 @@ class PostingListUsed { // -+ | 0x07FFF320 |0x07FFF40E,87| 0x000 | 196 | 434 | 125 | 788 // | // +-------------+-------------+---------+----------+---------+------+---------+ - PostingListUsed(void *posting_list_buffer, uint32_t size_in_bytes) - : posting_list_buffer_(static_cast<uint8_t *>(posting_list_buffer)), - size_in_bytes_(size_in_bytes) {} // Helpers to determine what state the posting list is in. 
- bool full() const { - return get_special_hit(0).ValueOrDie().is_valid() && - get_special_hit(1).ValueOrDie().is_valid(); + bool IsFull(const PostingListUsed* posting_list_used) const { + return GetSpecialHit(posting_list_used, /*index=*/0) + .ValueOrDie() + .is_valid() && + GetSpecialHit(posting_list_used, /*index=*/1) + .ValueOrDie() + .is_valid(); } - bool almost_full() const { - return !get_special_hit(0).ValueOrDie().is_valid(); + + bool IsAlmostFull(const PostingListUsed* posting_list_used) const { + return !GetSpecialHit(posting_list_used, /*index=*/0) + .ValueOrDie() + .is_valid(); } - bool empty() const { - return get_special_hit(0).ValueOrDie().value() == size_in_bytes_ && - !get_special_hit(1).ValueOrDie().is_valid(); + + bool IsEmpty(const PostingListUsed* posting_list_used) const { + return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() == + posting_list_used->size_in_bytes() && + !GetSpecialHit(posting_list_used, /*index=*/1) + .ValueOrDie() + .is_valid(); } // Returns false if both special hits are invalid or if the offset value // stored in the special hit is less than kSpecialHitsSize or greater than - // size_in_bytes_. Returns true, otherwise. - bool IsPostingListValid() const; + // posting_list_used->size_in_bytes(). Returns true, otherwise. + bool IsPostingListValid(const PostingListUsed* posting_list_used) const; // Prepend hit to a posting list that is in the ALMOST_FULL state. // RETURNS: // - OK, if successful // - INVALID_ARGUMENT if hit is not less than the previously added hit. - libtextclassifier3::Status PrependHitToAlmostFull(const Hit &hit); + libtextclassifier3::Status PrependHitToAlmostFull( + PostingListUsed* posting_list_used, const Hit& hit) const; // Prepend hit to a posting list that is in the EMPTY state. This will always // succeed because there are no pre-existing hits and no validly constructed // posting list could fail to fit one hit. 
- void PrependHitToEmpty(const Hit &hit); + void PrependHitToEmpty(PostingListUsed* posting_list_used, + const Hit& hit) const; // Prepend hit to a posting list that is in the NOT_FULL state. // RETURNS: // - OK, if successful // - INVALID_ARGUMENT if hit is not less than the previously added hit. - libtextclassifier3::Status PrependHitToNotFull(const Hit &hit, - uint32_t offset); - - // Reset contents to an empty posting list. This *must* be called if the - // posting_list_buffer_ region is uninitialized. - void Clear(); + libtextclassifier3::Status PrependHitToNotFull( + PostingListUsed* posting_list_used, const Hit& hit, + uint32_t offset) const; // Returns either 0 (full state), sizeof(Hit) (almost_full state) or - // a byte offset between kSpecialHitsSize and size_in_bytes_ (inclusive) - // (not_full state). - uint32_t get_start_byte_offset() const; + // a byte offset between kSpecialHitsSize and + // posting_list_used->size_in_bytes() (inclusive) (not_full state). + uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const; // Sets the special hits to properly reflect what offset is (see layout // comment for further details). // - // Returns false if offset > size_in_bytes_ or offset is (kSpecialHitsSize, - // sizeof(Hit)) or offset is (sizeof(Hit), 0). True, otherwise. - bool set_start_byte_offset(uint32_t offset); + // Returns false if offset > posting_list_used->size_in_bytes() or offset is + // (kSpecialHitsSize, sizeof(Hit)) or offset is (sizeof(Hit), 0). True, + // otherwise. + bool SetStartByteOffset(PostingListUsed* posting_list_used, + uint32_t offset) const; // Manipulate padded areas. We never store the same hit value twice // so a delta of 0 is a pad byte. // Returns offset of first non-pad byte. - uint32_t GetPadEnd(uint32_t offset) const; + uint32_t GetPadEnd(const PostingListUsed* posting_list_used, + uint32_t offset) const; // Fill padding between offset start and offset end with 0s. 
- // Returns false if end > size_in_bytes_. True, otherwise. - bool PadToEnd(uint32_t start, uint32_t end); + // Returns false if end > posting_list_used->size_in_bytes(). True, + // otherwise. + bool PadToEnd(PostingListUsed* posting_list_used, uint32_t start, + uint32_t end) const; // Helper for AppendHits/PopFrontHits. Adds limit number of hits to out or all // hits in the posting list if the posting list contains less than limit @@ -279,19 +278,22 @@ class PostingListUsed { // RETURNS: // - OK on success // - INTERNAL_ERROR if the posting list has been corrupted somehow. - libtextclassifier3::Status GetHitsInternal(uint32_t limit, bool pop, - std::vector<Hit> *out) const; + libtextclassifier3::Status GetHitsInternal( + const PostingListUsed* posting_list_used, uint32_t limit, bool pop, + std::vector<Hit>* out) const; // Retrieves the value stored in the index-th special hit. // // RETURNS: // - A valid Hit, on success - // - INVALID_ARGUMENT if index is not less than kNumSpecialHits - libtextclassifier3::StatusOr<Hit> get_special_hit(uint32_t index) const; + // - INVALID_ARGUMENT if index is not less than kNumSpecialData + libtextclassifier3::StatusOr<Hit> GetSpecialHit( + const PostingListUsed* posting_list_used, uint32_t index) const; // Sets the value stored in the index-th special hit to val. If index is not // less than kSpecialHitSize / sizeof(Hit), this has no effect. - bool set_special_hit(uint32_t index, const Hit &val); + bool SetSpecialHit(PostingListUsed* posting_list_used, uint32_t index, + const Hit& val) const; // Prepends hit to the memory region [offset - sizeof(Hit), offset] and // returns the new beginning of the padded region. 
@@ -301,7 +303,8 @@ class PostingListUsed { // - INVALID_ARGUMENT if hit will not fit (uncompressed) between offset and // kSpecialHitsSize libtextclassifier3::StatusOr<uint32_t> PrependHitUncompressed( - const Hit &hit, uint32_t offset); + PostingListUsed* posting_list_used, const Hit& hit, + uint32_t offset) const; // If hit has a term frequency, consumes the term frequency at offset, updates // hit to include the term frequency and updates offset to reflect that the @@ -310,29 +313,25 @@ class PostingListUsed { // RETURNS: // - OK, if successful // - INVALID_ARGUMENT if hit has a term frequency and offset + - // sizeof(Hit::TermFrequency) >= - // size_in_bytes_ + // sizeof(Hit::TermFrequency) >= posting_list_used->size_in_bytes() libtextclassifier3::Status ConsumeTermFrequencyIfPresent( - Hit *hit, uint32_t *offset) const; - - // A byte array of size size_in_bytes_ containing encoded hits for this - // posting list. - uint8_t *posting_list_buffer_; // does not own! - uint32_t size_in_bytes_; + const PostingListUsed* posting_list_used, Hit* hit, + uint32_t* offset) const; }; // Inlined functions. Implementation details below. Avert eyes! -template <class T, Hit (*GetHit)(const T &)> -uint32_t PostingListUsed::PrependHitArray(const T *array, uint32_t num_hits, - bool keep_prepended) { - if (!IsPostingListValid()) { +template <class T, Hit (*GetHit)(const T&)> +uint32_t PostingListUsedHitSerializer::PrependHitArray( + PostingListUsed* posting_list_used, const T* array, uint32_t num_hits, + bool keep_prepended) const { + if (!IsPostingListValid(posting_list_used)) { return 0; } // Prepend hits working backwards from array[num_hits - 1]. uint32_t i; for (i = 0; i < num_hits; ++i) { - if (!PrependHit(GetHit(array[num_hits - i - 1])).ok()) { + if (!PrependHit(posting_list_used, GetHit(array[num_hits - i - 1])).ok()) { break; } } @@ -341,7 +340,7 @@ uint32_t PostingListUsed::PrependHitArray(const T *array, uint32_t num_hits, // before. 
PopFrontHits guarantees that it will remove all 'i' hits so long // as there are at least 'i' hits in the posting list, which we know there // are. - PopFrontHits(i); + PopFrontHits(posting_list_used, /*num_hits=*/i); } return i; } @@ -349,4 +348,4 @@ uint32_t PostingListUsed::PrependHitArray(const T *array, uint32_t num_hits, } // namespace lib } // namespace icing -#endif // ICING_INDEX_MAIN_POSTING_LIST_USED_H_ +#endif // ICING_INDEX_MAIN_POSTING_LIST_USED_HIT_SERIALIZER_H_ diff --git a/icing/index/main/posting-list-used_test.cc b/icing/index/main/posting-list-used-hit-serializer_test.cc index 044d0c1..b87adc9 100644 --- a/icing/index/main/posting-list-used_test.cc +++ b/icing/index/main/posting-list-used-hit-serializer_test.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019 Google LLC +// Copyright (C) 2022 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,35 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "icing/index/main/posting-list-used.h" +#include "icing/index/main/posting-list-used-hit-serializer.h" -#include <fcntl.h> -#include <sys/stat.h> -#include <sys/time.h> -#include <sys/types.h> -#include <unistd.h> - -#include <algorithm> #include <cstdint> #include <deque> -#include <iterator> #include <memory> -#include <random> -#include <string> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "icing/index/main/posting-list-utils.h" -#include "icing/legacy/index/icing-bit-util.h" -#include "icing/schema/section.h" -#include "icing/store/document-id.h" +#include "icing/file/posting_list/posting-list-used.h" #include "icing/testing/common-matchers.h" #include "icing/testing/hit-test-utils.h" -using std::reverse; -using std::vector; using testing::ElementsAre; using testing::ElementsAreArray; using testing::Eq; @@ -51,18 +36,20 @@ using testing::Lt; namespace icing { namespace lib { +namespace { + struct HitElt { HitElt() = default; explicit HitElt(const Hit &hit_in) : hit(hit_in) {} - static Hit get_hit(const HitElt &hit_elt) { - return hit_elt.hit; - } + static Hit get_hit(const HitElt &hit_elt) { return hit_elt.hit; } Hit hit; }; -TEST(PostingListTest, PostingListUsedPrependHitNotFull) { +TEST(PostingListUsedHitSerializerTest, PostingListUsedPrependHitNotFull) { + PostingListUsedHitSerializer serializer; + static const int kNumHits = 2551; static const size_t kHitsSize = kNumHits * sizeof(Hit); @@ -70,52 +57,56 @@ TEST(PostingListTest, PostingListUsedPrependHitNotFull) { ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used, PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf.get()), kHitsSize)); + &serializer, static_cast<void *>(hits_buf.get()), kHitsSize)); // Make used. 
Hit hit0(/*section_id=*/0, 0, /*term_frequency=*/56); - pl_used.PrependHit(hit0); + serializer.PrependHit(&pl_used, hit0); // Size = sizeof(uncompressed hit0) int expected_size = sizeof(Hit); - EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size)); - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit0))); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit0))); Hit hit1(/*section_id=*/0, 1, Hit::kDefaultTermFrequency); - pl_used.PrependHit(hit1); + serializer.PrependHit(&pl_used, hit1); // Size = sizeof(uncompressed hit1) // + sizeof(hit0-hit1) + sizeof(hit0::term_frequency) expected_size += 2 + sizeof(Hit::TermFrequency); - EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size)); - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit1, hit0))); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAre(hit1, hit0))); Hit hit2(/*section_id=*/0, 2, /*term_frequency=*/56); - pl_used.PrependHit(hit2); + serializer.PrependHit(&pl_used, hit2); // Size = sizeof(uncompressed hit2) // + sizeof(hit1-hit2) // + sizeof(hit0-hit1) + sizeof(hit0::term_frequency) expected_size += 2; - EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size)); - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit2, hit1, hit0))); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAre(hit2, hit1, hit0))); Hit hit3(/*section_id=*/0, 3, Hit::kDefaultTermFrequency); - pl_used.PrependHit(hit3); + serializer.PrependHit(&pl_used, hit3); // Size = sizeof(uncompressed hit3) // + sizeof(hit2-hit3) + sizeof(hit2::term_frequency) // + sizeof(hit1-hit2) // + sizeof(hit0-hit1) + sizeof(hit0::term_frequency) expected_size += 2 + sizeof(Hit::TermFrequency); - EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size)); - EXPECT_THAT(pl_used.GetHits(), + 
EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit3, hit2, hit1, hit0))); } -TEST(PostingListTest, PostingListUsedPrependHitAlmostFull) { - constexpr int kHitsSize = 2 * posting_list_utils::min_posting_list_size(); - std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize); +TEST(PostingListUsedHitSerializerTest, PostingListUsedPrependHitAlmostFull) { + PostingListUsedHitSerializer serializer; + + int size = 2 * serializer.GetMinPostingListSize(); + std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(size); ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used, PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf.get()), kHitsSize)); + &serializer, static_cast<void *>(hits_buf.get()), size)); // Fill up the compressed region. // Transitions: @@ -125,17 +116,18 @@ TEST(PostingListTest, PostingListUsedPrependHitAlmostFull) { Hit hit0(/*section_id=*/0, 0, Hit::kDefaultTermFrequency); Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/2); Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2); - ICING_EXPECT_OK(pl_used.PrependHit(hit0)); - ICING_EXPECT_OK(pl_used.PrependHit(hit1)); - ICING_EXPECT_OK(pl_used.PrependHit(hit2)); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit0)); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit1)); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit2)); // Size used will be 2+2+4=8 bytes int expected_size = sizeof(Hit::Value) + 2 + 2; - EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size)); - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit2, hit1, hit0))); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAre(hit2, hit1, hit0))); // Add one more hit to transition NOT_FULL -> ALMOST_FULL Hit hit3 = CreateHit(hit2, /*desired_byte_length=*/3); - ICING_EXPECT_OK(pl_used.PrependHit(hit3)); + 
ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit3)); // Compressed region would be 2+2+3+4=11 bytes, but the compressed region is // only 10 bytes. So instead, the posting list will transition to ALMOST_FULL. // The in-use compressed region will actually shrink from 8 bytes to 7 bytes @@ -143,91 +135,100 @@ TEST(PostingListTest, PostingListUsedPrependHitAlmostFull) { // compressed delta of hit2. hit3 will be written to one of the special hits. // Because we're in ALMOST_FULL, the expected size is the size of the pl minus // the one hit used to mark the posting list as ALMOST_FULL. - expected_size = kHitsSize - sizeof(Hit); - EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size)); - EXPECT_THAT(pl_used.GetHits(), + expected_size = size - sizeof(Hit); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit3, hit2, hit1, hit0))); // Add one more hit to transition ALMOST_FULL -> ALMOST_FULL Hit hit4 = CreateHit(hit3, /*desired_byte_length=*/2); - ICING_EXPECT_OK(pl_used.PrependHit(hit4)); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit4)); // There are currently 7 bytes in use in the compressed region. hit3 will have // a 2-byte delta. That delta will fit in the compressed region (which will // now have 9 bytes in use), hit4 will be placed in one of the special hits // and the posting list will remain in ALMOST_FULL. - EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size)); - EXPECT_THAT(pl_used.GetHits(), + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit4, hit3, hit2, hit1, hit0))); // Add one more hit to transition ALMOST_FULL -> FULL Hit hit5 = CreateHit(hit4, /*desired_byte_length=*/2); - ICING_EXPECT_OK(pl_used.PrependHit(hit5)); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit5)); // There are currently 9 bytes in use in the compressed region. 
hit4 will have // a 2-byte delta which will not fit in the compressed region. So hit4 will // remain in one of the special hits and hit5 will occupy the other, making // the posting list FULL. - EXPECT_THAT(pl_used.BytesUsed(), Le(kHitsSize)); - EXPECT_THAT(pl_used.GetHits(), + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(size)); + EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit5, hit4, hit3, hit2, hit1, hit0))); // The posting list is FULL. Adding another hit should fail. Hit hit6 = CreateHit(hit5, /*desired_byte_length=*/1); - EXPECT_THAT(pl_used.PrependHit(hit6), + EXPECT_THAT(serializer.PrependHit(&pl_used, hit6), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); } -TEST(PostingListTest, PostingListUsedMinSize) { +TEST(PostingListUsedHitSerializerTest, PostingListUsedMinSize) { + PostingListUsedHitSerializer serializer; + std::unique_ptr<char[]> hits_buf = - std::make_unique<char[]>(posting_list_utils::min_posting_list_size()); + std::make_unique<char[]>(serializer.GetMinPostingListSize()); - ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used, - PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf.get()), - posting_list_utils::min_posting_list_size())); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void *>(hits_buf.get()), + serializer.GetMinPostingListSize())); // PL State: EMPTY - EXPECT_THAT(pl_used.BytesUsed(), Eq(0)); - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(IsEmpty())); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0)); + EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty())); // Add a hit, PL should shift to ALMOST_FULL state Hit hit0(/*section_id=*/0, 0, /*term_frequency=*/0, /*is_in_prefix_section=*/false, /*is_prefix_hit=*/true); - ICING_EXPECT_OK(pl_used.PrependHit(hit0)); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit0)); // Size = sizeof(uncompressed hit0) int 
expected_size = sizeof(Hit); - EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size)); - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit0))); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit0))); // Add the smallest hit possible - no term_frequency and a delta of 1. PL // should shift to FULL state. Hit hit1(/*section_id=*/0, 0, /*term_frequency=*/0, /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); - ICING_EXPECT_OK(pl_used.PrependHit(hit1)); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit1)); // Size = sizeof(uncompressed hit1) + sizeof(uncompressed hit0) expected_size += sizeof(Hit); - EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size)); - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit1, hit0))); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAre(hit1, hit0))); // Try to add the smallest hit possible. 
Should fail Hit hit2(/*section_id=*/0, 0, /*term_frequency=*/0, /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); - EXPECT_THAT(pl_used.PrependHit(hit2), + EXPECT_THAT(serializer.PrependHit(&pl_used, hit2), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - EXPECT_THAT(pl_used.BytesUsed(), Le(expected_size)); - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAre(hit1, hit0))); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAre(hit1, hit0))); } -TEST(PostingListTest, PostingListPrependHitArrayMinSizePostingList) { +TEST(PostingListUsedHitSerializerTest, + PostingListPrependHitArrayMinSizePostingList) { + PostingListUsedHitSerializer serializer; + constexpr int kFinalSize = 1025; std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kFinalSize); // Min Size = 10 - int size = posting_list_utils::min_posting_list_size(); - ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used, - PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf.get()), size)); + int size = serializer.GetMinPostingListSize(); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void *>(hits_buf.get()), size)); std::vector<HitElt> hits_in; hits_in.emplace_back(Hit(1, 0, Hit::kDefaultTermFrequency)); @@ -243,32 +244,37 @@ TEST(PostingListTest, PostingListPrependHitArrayMinSizePostingList) { // Add five hits. The PL is in the empty state and an empty min size PL can // only fit two hits. So PrependHitArray should fail. - uint32_t num_can_prepend = pl_used.PrependHitArray<HitElt, HitElt::get_hit>( - &hits_in[0], hits_in.size(), false); + uint32_t num_can_prepend = + serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hits_in[0], hits_in.size(), false); EXPECT_THAT(num_can_prepend, Eq(2)); int can_fit_hits = num_can_prepend; // The PL has room for 2 hits. 
We should be able to add them without any // problem, transitioning the PL from EMPTY -> ALMOST_FULL -> FULL const HitElt *hits_in_ptr = hits_in.data() + (hits_in.size() - 2); - num_can_prepend = pl_used.PrependHitArray<HitElt, HitElt::get_hit>( - hits_in_ptr, can_fit_hits, false); + num_can_prepend = serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, hits_in_ptr, can_fit_hits, false); EXPECT_THAT(num_can_prepend, Eq(can_fit_hits)); - EXPECT_THAT(size, Eq(pl_used.BytesUsed())); + EXPECT_THAT(size, Eq(serializer.GetBytesUsed(&pl_used))); std::deque<Hit> hits_pushed; std::transform(hits_in.rbegin(), hits_in.rend() - hits_in.size() + can_fit_hits, std::front_inserter(hits_pushed), HitElt::get_hit); - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed))); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAreArray(hits_pushed))); } -TEST(PostingListTest, PostingListPrependHitArrayPostingList) { +TEST(PostingListUsedHitSerializerTest, PostingListPrependHitArrayPostingList) { + PostingListUsedHitSerializer serializer; + // Size = 30 - int size = 3 * posting_list_utils::min_posting_list_size(); + int size = 3 * serializer.GetMinPostingListSize(); std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(size); - ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used, - PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf.get()), size)); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void *>(hits_buf.get()), size)); std::vector<HitElt> hits_in; hits_in.emplace_back(Hit(1, 0, Hit::kDefaultTermFrequency)); @@ -297,14 +303,15 @@ TEST(PostingListTest, PostingListPrependHitArrayPostingList) { // Add five hits. The PL is in the empty state and should be able to fit all // five hits without issue, transitioning the PL from EMPTY -> NOT_FULL. 
- uint32_t num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>( - &hits_in[0], hits_in.size(), false); + uint32_t num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hits_in[0], hits_in.size(), false); EXPECT_THAT(num_could_fit, Eq(hits_in.size())); - EXPECT_THAT(byte_size, Eq(pl_used.BytesUsed())); + EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used))); std::deque<Hit> hits_pushed; std::transform(hits_in.rbegin(), hits_in.rend(), std::front_inserter(hits_pushed), HitElt::get_hit); - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed))); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAreArray(hits_pushed))); Hit first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/1); hits_in.clear(); @@ -341,14 +348,15 @@ TEST(PostingListTest, PostingListPrependHitArrayPostingList) { // Add these 6 hits. The PL is currently in the NOT_FULL state and should // remain in the NOT_FULL state. - num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>( - &hits_in[0], hits_in.size(), false); + num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hits_in[0], hits_in.size(), false); EXPECT_THAT(num_could_fit, Eq(hits_in.size())); - EXPECT_THAT(byte_size, Eq(pl_used.BytesUsed())); + EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used))); // All hits from hits_in were added. std::transform(hits_in.rbegin(), hits_in.rend(), std::front_inserter(hits_pushed), HitElt::get_hit); - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed))); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAreArray(hits_pushed))); first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/3); hits_in.clear(); @@ -374,14 +382,15 @@ TEST(PostingListTest, PostingListPrependHitArrayPostingList) { // Add this 1 hit. 
The PL is currently in the NOT_FULL state and should // transition to the ALMOST_FULL state - even though there is still some // unused space. - num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>( - &hits_in[0], hits_in.size(), false); + num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hits_in[0], hits_in.size(), false); EXPECT_THAT(num_could_fit, Eq(hits_in.size())); - EXPECT_THAT(byte_size, Eq(pl_used.BytesUsed())); + EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used))); // All hits from hits_in were added. std::transform(hits_in.rbegin(), hits_in.rend(), std::front_inserter(hits_pushed), HitElt::get_hit); - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed))); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAreArray(hits_pushed))); first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/1); hits_in.clear(); @@ -413,17 +422,20 @@ TEST(PostingListTest, PostingListPrependHitArrayPostingList) { // second hit should tranisition to the FULL state because the delta between // Hit #13 and Hit #14 (2 bytes) is larger than the remaining unused area // (1 byte). - num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>( - &hits_in[0], hits_in.size(), false); + num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hits_in[0], hits_in.size(), false); EXPECT_THAT(num_could_fit, Eq(hits_in.size())); - EXPECT_THAT(size, Eq(pl_used.BytesUsed())); + EXPECT_THAT(size, Eq(serializer.GetBytesUsed(&pl_used))); // All hits from hits_in were added. 
std::transform(hits_in.rbegin(), hits_in.rend(), std::front_inserter(hits_pushed), HitElt::get_hit); - EXPECT_THAT(pl_used.GetHits(), IsOkAndHolds(ElementsAreArray(hits_pushed))); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAreArray(hits_pushed))); } -TEST(PostingListTest, PostingListPrependHitArrayTooManyHits) { +TEST(PostingListUsedHitSerializerTest, PostingListPrependHitArrayTooManyHits) { + PostingListUsedHitSerializer serializer; + static constexpr int kNumHits = 128; static constexpr int kDeltaSize = 1; static constexpr int kTermFrequencySize = 1; @@ -433,150 +445,171 @@ TEST(PostingListTest, PostingListPrependHitArrayTooManyHits) { std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize); // Create an array with one too many hits - vector<Hit> hits_in_too_many = + std::vector<Hit> hits_in_too_many = CreateHits(kNumHits + 1, /*desired_byte_length=*/1); - vector<HitElt> hit_elts_in_too_many; + std::vector<HitElt> hit_elts_in_too_many; for (const Hit &hit : hits_in_too_many) { hit_elts_in_too_many.emplace_back(hit); } - ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used, - PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf.get()), - posting_list_utils::min_posting_list_size())); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void *>(hits_buf.get()), + serializer.GetMinPostingListSize())); // PrependHitArray should fail because hit_elts_in_too_many is far too large // for the minimum size pl. 
- uint32_t num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>( - &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), false); + uint32_t num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), false); ASSERT_THAT(num_could_fit, Lt(hit_elts_in_too_many.size())); - ASSERT_THAT(pl_used.BytesUsed(), Eq(0)); - ASSERT_THAT(pl_used.GetHits(), IsOkAndHolds(IsEmpty())); + ASSERT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0)); + ASSERT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty())); ICING_ASSERT_OK_AND_ASSIGN( - pl_used, PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf.get()), kHitsSize)); + pl_used, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void *>(hits_buf.get()), kHitsSize)); // PrependHitArray should fail because hit_elts_in_too_many is one hit too // large for this pl. - num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>( - &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), false); + num_could_fit = serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), false); ASSERT_THAT(num_could_fit, Lt(hit_elts_in_too_many.size())); - ASSERT_THAT(pl_used.BytesUsed(), Eq(0)); - ASSERT_THAT(pl_used.GetHits(), IsOkAndHolds(IsEmpty())); + ASSERT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0)); + ASSERT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty())); } -TEST(PostingListTest, PostingListStatusJumpFromNotFullToFullAndBack) { +TEST(PostingListUsedHitSerializerTest, + PostingListStatusJumpFromNotFullToFullAndBack) { + PostingListUsedHitSerializer serializer; + const uint32_t pl_size = 3 * sizeof(Hit); char hits_buf[pl_size]; - ICING_ASSERT_OK_AND_ASSIGN( - PostingListUsed pl, - PostingListUsed::CreateFromUnitializedRegion(hits_buf, pl_size)); - ICING_ASSERT_OK(pl.PrependHit(Hit(Hit::kInvalidValue - 1, 0))); - uint32_t bytes_used = 
pl.BytesUsed(); + ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, hits_buf, pl_size)); + ICING_ASSERT_OK(serializer.PrependHit(&pl, Hit(Hit::kInvalidValue - 1, 0))); + uint32_t bytes_used = serializer.GetBytesUsed(&pl); // Status not full. - ASSERT_THAT(bytes_used, Le(pl_size - posting_list_utils::kSpecialHitsSize)); - ICING_ASSERT_OK(pl.PrependHit(Hit(Hit::kInvalidValue >> 2, 0))); + ASSERT_THAT(bytes_used, + Le(pl_size - PostingListUsedHitSerializer::kSpecialHitsSize)); + ICING_ASSERT_OK(serializer.PrependHit(&pl, Hit(Hit::kInvalidValue >> 2, 0))); // Status should jump to full directly. - ASSERT_THAT(pl.BytesUsed(), Eq(pl_size)); - pl.PopFrontHits(1); + ASSERT_THAT(serializer.GetBytesUsed(&pl), Eq(pl_size)); + serializer.PopFrontHits(&pl, 1); // Status should return to not full as before. - ASSERT_THAT(pl.BytesUsed(), Eq(bytes_used)); + ASSERT_THAT(serializer.GetBytesUsed(&pl), Eq(bytes_used)); } -TEST(PostingListTest, DeltaOverflow) { +TEST(PostingListUsedHitSerializerTest, DeltaOverflow) { + PostingListUsedHitSerializer serializer; + char hits_buf[1000]; - ICING_ASSERT_OK_AND_ASSIGN( - PostingListUsed pl, - PostingListUsed::CreateFromUnitializedRegion(hits_buf, 4 * sizeof(Hit))); + ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, hits_buf, 4 * sizeof(Hit))); static const Hit::Value kOverflow[4] = { - Hit::kInvalidValue >> 2, - (Hit::kInvalidValue >> 2) * 2, - (Hit::kInvalidValue >> 2) * 3, - Hit::kInvalidValue - 1, + Hit::kInvalidValue >> 2, + (Hit::kInvalidValue >> 2) * 2, + (Hit::kInvalidValue >> 2) * 3, + Hit::kInvalidValue - 1, }; // Fit at least 4 ordinary values. for (Hit::Value v = 0; v < 4; v++) { - ICING_EXPECT_OK(pl.PrependHit(Hit(4 - v))); + ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(4 - v))); } // Cannot fit 4 overflow values. 
ICING_ASSERT_OK_AND_ASSIGN(pl, PostingListUsed::CreateFromUnitializedRegion( - hits_buf, 4 * sizeof(Hit))); - ICING_EXPECT_OK(pl.PrependHit(Hit(kOverflow[3]))); - ICING_EXPECT_OK(pl.PrependHit(Hit(kOverflow[2]))); + &serializer, hits_buf, 4 * sizeof(Hit))); + ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[3]))); + ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[2]))); // Can fit only one more. - ICING_EXPECT_OK(pl.PrependHit(Hit(kOverflow[1]))); - EXPECT_THAT(pl.PrependHit(Hit(kOverflow[0])), + ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[1]))); + EXPECT_THAT(serializer.PrependHit(&pl, Hit(kOverflow[0])), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); } -TEST(PostingListTest, MoveFrom) { - int size = 3 * posting_list_utils::min_posting_list_size(); +TEST(PostingListUsedHitSerializerTest, MoveFrom) { + PostingListUsedHitSerializer serializer; + + int size = 3 * serializer.GetMinPostingListSize(); std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size); - ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1, - PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf1.get()), size)); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used1, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void *>(hits_buf1.get()), size)); std::vector<Hit> hits1 = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1); for (const Hit &hit : hits1) { - ICING_ASSERT_OK(pl_used1.PrependHit(hit)); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit)); } std::unique_ptr<char[]> hits_buf2 = std::make_unique<char[]>(size); - ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used2, - PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf2.get()), size)); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used2, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void *>(hits_buf2.get()), size)); std::vector<Hit> hits2 = CreateHits(/*num_hits=*/5, 
/*desired_byte_length=*/2); for (const Hit &hit : hits2) { - ICING_ASSERT_OK(pl_used2.PrependHit(hit)); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used2, hit)); } - ICING_ASSERT_OK(pl_used2.MoveFrom(&pl_used1)); - EXPECT_THAT(pl_used2.GetHits(), + ICING_ASSERT_OK(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1)); + EXPECT_THAT(serializer.GetHits(&pl_used2), IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); - EXPECT_THAT(pl_used1.GetHits(), IsOkAndHolds(IsEmpty())); + EXPECT_THAT(serializer.GetHits(&pl_used1), IsOkAndHolds(IsEmpty())); } -TEST(PostingListTest, MoveFromNullArgumentReturnsInvalidArgument) { - int size = 3 * posting_list_utils::min_posting_list_size(); +TEST(PostingListUsedHitSerializerTest, + MoveFromNullArgumentReturnsInvalidArgument) { + PostingListUsedHitSerializer serializer; + + int size = 3 * serializer.GetMinPostingListSize(); std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size); - ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1, - PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf1.get()), size)); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used1, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void *>(hits_buf1.get()), size)); std::vector<Hit> hits = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1); for (const Hit &hit : hits) { - ICING_ASSERT_OK(pl_used1.PrependHit(hit)); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit)); } - EXPECT_THAT(pl_used1.MoveFrom(/*other=*/nullptr), + EXPECT_THAT(serializer.MoveFrom(&pl_used1, /*other=*/nullptr), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); - EXPECT_THAT(pl_used1.GetHits(), + EXPECT_THAT(serializer.GetHits(&pl_used1), IsOkAndHolds(ElementsAreArray(hits.rbegin(), hits.rend()))); } -TEST(PostingListTest, MoveFromInvalidPostingListReturnsInvalidArgument) { - int size = 3 * posting_list_utils::min_posting_list_size(); +TEST(PostingListUsedHitSerializerTest, + 
MoveFromInvalidPostingListReturnsInvalidArgument) { + PostingListUsedHitSerializer serializer; + + int size = 3 * serializer.GetMinPostingListSize(); std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size); - ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1, - PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf1.get()), size)); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used1, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void *>(hits_buf1.get()), size)); std::vector<Hit> hits1 = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1); for (const Hit &hit : hits1) { - ICING_ASSERT_OK(pl_used1.PrependHit(hit)); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit)); } std::unique_ptr<char[]> hits_buf2 = std::make_unique<char[]>(size); - ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used2, - PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf2.get()), size)); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used2, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void *>(hits_buf2.get()), size)); std::vector<Hit> hits2 = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2); for (const Hit &hit : hits2) { - ICING_ASSERT_OK(pl_used2.PrependHit(hit)); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used2, hit)); } // Write invalid hits to the beginning of pl_used1 to make it invalid. 
@@ -585,32 +618,37 @@ TEST(PostingListTest, MoveFromInvalidPostingListReturnsInvalidArgument) { *first_hit = invalid_hit; ++first_hit; *first_hit = invalid_hit; - EXPECT_THAT(pl_used2.MoveFrom(&pl_used1), + EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT(pl_used2.GetHits(), + EXPECT_THAT(serializer.GetHits(&pl_used2), IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend()))); } -TEST(PostingListTest, MoveToInvalidPostingListReturnsInvalidArgument) { - int size = 3 * posting_list_utils::min_posting_list_size(); +TEST(PostingListUsedHitSerializerTest, + MoveToInvalidPostingListReturnsInvalidArgument) { + PostingListUsedHitSerializer serializer; + + int size = 3 * serializer.GetMinPostingListSize(); std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size); - ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1, - PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf1.get()), size)); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used1, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void *>(hits_buf1.get()), size)); std::vector<Hit> hits1 = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1); for (const Hit &hit : hits1) { - ICING_ASSERT_OK(pl_used1.PrependHit(hit)); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit)); } std::unique_ptr<char[]> hits_buf2 = std::make_unique<char[]>(size); - ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used2, - PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf2.get()), size)); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used2, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void *>(hits_buf2.get()), size)); std::vector<Hit> hits2 = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2); for (const Hit &hit : hits2) { - ICING_ASSERT_OK(pl_used2.PrependHit(hit)); + 
ICING_ASSERT_OK(serializer.PrependHit(&pl_used2, hit)); } // Write invalid hits to the beginning of pl_used2 to make it invalid. @@ -619,50 +657,57 @@ TEST(PostingListTest, MoveToInvalidPostingListReturnsInvalidArgument) { *first_hit = invalid_hit; ++first_hit; *first_hit = invalid_hit; - EXPECT_THAT(pl_used2.MoveFrom(&pl_used1), + EXPECT_THAT(serializer.MoveFrom(&pl_used2, &pl_used1), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); - EXPECT_THAT(pl_used1.GetHits(), + EXPECT_THAT(serializer.GetHits(&pl_used1), IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); } -TEST(PostingListTest, MoveToPostingListTooSmall) { - int size = 3 * posting_list_utils::min_posting_list_size(); +TEST(PostingListUsedHitSerializerTest, MoveToPostingListTooSmall) { + PostingListUsedHitSerializer serializer; + + int size = 3 * serializer.GetMinPostingListSize(); std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size); - ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used1, - PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf1.get()), size)); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used1, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void *>(hits_buf1.get()), size)); std::vector<Hit> hits1 = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1); for (const Hit &hit : hits1) { - ICING_ASSERT_OK(pl_used1.PrependHit(hit)); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit)); } std::unique_ptr<char[]> hits_buf2 = - std::make_unique<char[]>(posting_list_utils::min_posting_list_size()); - ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used2, - PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf2.get()), - posting_list_utils::min_posting_list_size())); + std::make_unique<char[]>(serializer.GetMinPostingListSize()); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used2, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void 
*>(hits_buf2.get()), + serializer.GetMinPostingListSize())); std::vector<Hit> hits2 = CreateHits(/*num_hits=*/1, /*desired_byte_length=*/2); for (const Hit &hit : hits2) { - ICING_ASSERT_OK(pl_used2.PrependHit(hit)); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used2, hit)); } - EXPECT_THAT(pl_used2.MoveFrom(&pl_used1), + EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT(pl_used1.GetHits(), + EXPECT_THAT(serializer.GetHits(&pl_used1), IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); - EXPECT_THAT(pl_used2.GetHits(), + EXPECT_THAT(serializer.GetHits(&pl_used2), IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend()))); } -TEST(PostingListTest, PopHitsWithScores) { - int size = 2 * posting_list_utils::min_posting_list_size(); +TEST(PostingListUsedHitSerializerTest, PopHitsWithScores) { + PostingListUsedHitSerializer serializer; + + int size = 2 * serializer.GetMinPostingListSize(); std::unique_ptr<char[]> hits_buf1 = std::make_unique<char[]>(size); - ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used, - PostingListUsed::CreateFromUnitializedRegion( - static_cast<void *>(hits_buf1.get()), size)); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, static_cast<void *>(hits_buf1.get()), size)); // This posting list is 20-bytes. Create four hits that will have deltas of // two bytes each and all of whom will have a non-default score. 
This posting @@ -683,12 +728,13 @@ TEST(PostingListTest, PopHitsWithScores) { Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/2); Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2); Hit hit3 = CreateHit(hit2, /*desired_byte_length=*/2); - ICING_ASSERT_OK(pl_used.PrependHit(hit0)); - ICING_ASSERT_OK(pl_used.PrependHit(hit1)); - ICING_ASSERT_OK(pl_used.PrependHit(hit2)); - ICING_ASSERT_OK(pl_used.PrependHit(hit3)); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit0)); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit1)); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit2)); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit3)); - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> hits_out, pl_used.GetHits()); + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> hits_out, + serializer.GetHits(&pl_used)); EXPECT_THAT(hits_out, ElementsAre(hit3, hit2, hit1, hit0)); // Now, pop the last hit. The posting list should contain the first three @@ -703,10 +749,12 @@ TEST(PostingListTest, PopHitsWithScores) { // 9-5 Hit #2 // 4-0 kInvalidHitVal // ---------------------- - ICING_ASSERT_OK(pl_used.PopFrontHits(1)); - ICING_ASSERT_OK_AND_ASSIGN(hits_out, pl_used.GetHits()); + ICING_ASSERT_OK(serializer.PopFrontHits(&pl_used, 1)); + ICING_ASSERT_OK_AND_ASSIGN(hits_out, serializer.GetHits(&pl_used)); EXPECT_THAT(hits_out, ElementsAre(hit2, hit1, hit0)); } +} // namespace + } // namespace lib } // namespace icing diff --git a/icing/monkey_test/icing-monkey-test-runner.cc b/icing/monkey_test/icing-monkey-test-runner.cc new file mode 100644 index 0000000..2dd5a03 --- /dev/null +++ b/icing/monkey_test/icing-monkey-test-runner.cc @@ -0,0 +1,442 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/monkey_test/icing-monkey-test-runner.h" + +#include <algorithm> +#include <cstdint> +#include <functional> +#include <string> +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/monkey_test/in-memory-icing-search-engine.h" +#include "icing/monkey_test/monkey-test-generators.h" +#include "icing/portable/equals-proto.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" +#include "icing/util/logging.h" + +namespace icing { +namespace lib { + +namespace { + +using ::icing::lib::portable_equals_proto::EqualsProto; +using ::testing::Eq; +using ::testing::Le; +using ::testing::SizeIs; +using ::testing::UnorderedElementsAreArray; + +inline constexpr int kNumTypes = 30; +const std::vector<int> kPossibleNumProperties = {0, + 1, + 2, + 4, + 8, + 16, + kTotalNumSections / 2, + kTotalNumSections, + kTotalNumSections + 1, + kTotalNumSections * 2}; +inline constexpr int kNumNamespaces = 100; +inline constexpr int kNumURIs = 1000; + +// Merge per 131072 hits +const int kIndexMergeSize = 1024 * 1024; + +// An array of pairs of monkey test APIs with frequencies. +// If f_sum is the sum of all the frequencies, an operation with frequency f +// means for every f_sum iterations, the operation is expected to run f times. 
+const std::vector< + std::pair<std::function<void(IcingMonkeyTestRunner*)>, uint32_t>> + kMonkeyAPISchedules = {{&IcingMonkeyTestRunner::DoPut, 500}, + {&IcingMonkeyTestRunner::DoSearch, 200}, + {&IcingMonkeyTestRunner::DoGet, 70}, + {&IcingMonkeyTestRunner::DoGetAllNamespaces, 50}, + {&IcingMonkeyTestRunner::DoDelete, 50}, + {&IcingMonkeyTestRunner::DoDeleteByNamespace, 50}, + {&IcingMonkeyTestRunner::DoDeleteBySchemaType, 50}, + {&IcingMonkeyTestRunner::DoDeleteByQuery, 20}, + {&IcingMonkeyTestRunner::DoOptimize, 5}, + {&IcingMonkeyTestRunner::ReloadFromDisk, 5}}; + +SchemaProto GenerateRandomSchema(MonkeyTestRandomEngine* random) { + MonkeySchemaGenerator schema_generator(random); + return schema_generator.GenerateSchema(kNumTypes, kPossibleNumProperties); +} + +SearchSpecProto GenerateRandomSearchSpecProto( + MonkeyTestRandomEngine* random, + MonkeyDocumentGenerator* document_generator) { + // Get a random token from the language set as a single term query. + std::string query(document_generator->GetToken()); + std::uniform_int_distribution<> dist(0, 1); + TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY; + if (dist(*random) == 1) { + term_match_type = TermMatchType::PREFIX; + // Randomly drop a suffix of query to test prefix query. + std::uniform_int_distribution<> size_dist(1, query.size()); + query.resize(size_dist(*random)); + } + // 50% chance of getting a section restriction. 
+ if (dist(*random) == 1) { + const SchemaTypeConfigProto& type_config = document_generator->GetType(); + if (type_config.properties_size() > 0) { + std::uniform_int_distribution<> prop_dist( + 0, type_config.properties_size() - 1); + query = absl_ports::StrCat( + type_config.properties(prop_dist(*random)).property_name(), ":", + query); + } + } + SearchSpecProto search_spec; + search_spec.set_term_match_type(term_match_type); + search_spec.set_query(query); + return search_spec; +} + +ScoringSpecProto GenerateRandomScoringSpec(MonkeyTestRandomEngine* random) { + ScoringSpecProto scoring_spec; + + constexpr std::array<ScoringSpecProto::RankingStrategy::Code, 3> + ranking_strategies = { + ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, + ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP, + ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE}; + + std::uniform_int_distribution<> dist(0, ranking_strategies.size() - 1); + scoring_spec.set_rank_by(ranking_strategies[dist(*random)]); + return scoring_spec; +} + +ResultSpecProto::SnippetSpecProto GenerateRandomSnippetSpecProto( + MonkeyTestRandomEngine* random, const ResultSpecProto& result_spec) { + ResultSpecProto::SnippetSpecProto snippet_spec; + + std::uniform_int_distribution<> num_to_snippet_dist( + 0, result_spec.num_per_page() * 2); + snippet_spec.set_num_to_snippet(num_to_snippet_dist(*random)); + + std::uniform_int_distribution<> num_matches_per_property_dist(0, 10); + snippet_spec.set_num_matches_per_property( + num_matches_per_property_dist(*random)); + + std::uniform_int_distribution<> dist(0, 4); + int random_num = dist(*random); + // 1/5 chance of getting one of 0 (disabled), 8, 32, 128, 512 + int max_window_utf32_length = + random_num == 0 ? 
0 : (1 << (2 * random_num + 1)); + snippet_spec.set_max_window_utf32_length(max_window_utf32_length); + return snippet_spec; +} + +ResultSpecProto GenerateRandomResultSpecProto(MonkeyTestRandomEngine* random) { + std::uniform_int_distribution<> dist(0, 4); + ResultSpecProto result_spec; + // 1/5 chance of getting one of 1, 4, 16, 64, 256 + int num_per_page = 1 << (2 * dist(*random)); + result_spec.set_num_per_page(num_per_page); + *result_spec.mutable_snippet_spec() = + GenerateRandomSnippetSpecProto(random, result_spec); + return result_spec; +} + +void SortDocuments(std::vector<DocumentProto>& documents) { + std::sort(documents.begin(), documents.end(), + [](const DocumentProto& doc1, const DocumentProto& doc2) { + if (doc1.namespace_() != doc2.namespace_()) { + return doc1.namespace_() < doc2.namespace_(); + } + return doc1.uri() < doc2.uri(); + }); +} + +} // namespace + +IcingMonkeyTestRunner::IcingMonkeyTestRunner(uint32_t seed) + : random_(seed), in_memory_icing_() { + ICING_LOG(INFO) << "Monkey test runner started with seed: " << seed; + + SchemaProto schema = GenerateRandomSchema(&random_); + ICING_LOG(DBG) << "Schema Generated: " << schema.DebugString(); + + in_memory_icing_ = + std::make_unique<InMemoryIcingSearchEngine>(&random_, std::move(schema)); + + document_generator_ = std::make_unique<MonkeyDocumentGenerator>( + &random_, in_memory_icing_->GetSchema(), kNumNamespaces, kNumURIs); + + std::string dir = GetTestTempDir() + "/icing/monkey"; + filesystem_.DeleteDirectoryRecursively(dir.c_str()); + icing_dir_ = std::make_unique<DestructibleDirectory>(&filesystem_, dir); +} + +void IcingMonkeyTestRunner::Run(uint32_t num) { + ASSERT_TRUE(icing_ != nullptr) + << "Icing search engine has not yet been created. 
Please call " + "CreateIcingSearchEngineWithSchema() first"; + + uint32_t frequency_sum = 0; + for (const auto& schedule : kMonkeyAPISchedules) { + frequency_sum += schedule.second; + } + std::uniform_int_distribution<> dist(0, frequency_sum - 1); + for (; num; --num) { + int p = dist(random_); + for (const auto& schedule : kMonkeyAPISchedules) { + if (p < schedule.second) { + ASSERT_NO_FATAL_FAILURE(schedule.first(this)); + break; + } + p -= schedule.second; + } + ICING_LOG(INFO) << "Documents in the in-memory icing: " + << in_memory_icing_->GetNumAliveDocuments(); + } +} + +void IcingMonkeyTestRunner::CreateIcingSearchEngineWithSchema() { + ASSERT_NO_FATAL_FAILURE(CreateIcingSearchEngine()); + ASSERT_THAT(icing_->SetSchema(*in_memory_icing_->GetSchema()).status(), + ProtoIsOk()); +} + +void IcingMonkeyTestRunner::DoGet() { + InMemoryIcingSearchEngine::PickDocumentResult document = + in_memory_icing_->RandomPickDocument(/*p_alive=*/0.70, /*p_all=*/0.28, + /*p_other=*/0.02); + ICING_LOG(INFO) << "Monkey getting namespace: " << document.name_space + << ", uri: " << document.uri; + GetResultProto get_result = + icing_->Get(document.name_space, document.uri, + GetResultSpecProto::default_instance()); + if (document.document.has_value()) { + ASSERT_THAT(get_result.status(), ProtoIsOk()) + << "Cannot find the document that is supposed to exist."; + ASSERT_THAT(get_result.document(), EqualsProto(document.document.value())) + << "The document found does not match with the value in the in-memory " + "icing."; + } else { + // Should expect that no document has been found. 
+ if (get_result.status().code() != StatusProto::NOT_FOUND) { + if (get_result.status().code() == StatusProto::OK) { + FAIL() << "Found a document that is not supposed to be found."; + } + FAIL() << "Icing search engine failure (code " + << get_result.status().code() + << "): " << get_result.status().message(); + } + } +} + +void IcingMonkeyTestRunner::DoGetAllNamespaces() { + ICING_LOG(INFO) << "Monkey getting all namespaces"; + GetAllNamespacesResultProto get_result = icing_->GetAllNamespaces(); + ASSERT_THAT(get_result.status(), ProtoIsOk()); + ASSERT_THAT(get_result.namespaces(), + UnorderedElementsAreArray(in_memory_icing_->GetAllNamespaces())); +} + +void IcingMonkeyTestRunner::DoPut() { + MonkeyTokenizedDocument doc = document_generator_->GenerateDocument(); + ICING_LOG(INFO) << "Monkey document generated, namespace: " + << doc.document.namespace_() + << ", uri: " << doc.document.uri(); + ICING_LOG(DBG) << doc.document.DebugString(); + in_memory_icing_->Put(doc); + ASSERT_THAT(icing_->Put(doc.document).status(), ProtoIsOk()); +} + +void IcingMonkeyTestRunner::DoDelete() { + InMemoryIcingSearchEngine::PickDocumentResult document = + in_memory_icing_->RandomPickDocument(/*p_alive=*/0.70, /*p_all=*/0.2, + /*p_other=*/0.1); + ICING_LOG(INFO) << "Monkey deleting namespace: " << document.name_space + << ", uri: " << document.uri; + in_memory_icing_->Delete(document.name_space, document.uri); + DeleteResultProto delete_result = + icing_->Delete(document.name_space, document.uri); + if (document.document.has_value()) { + ASSERT_THAT(delete_result.status(), ProtoIsOk()) + << "Cannot delete an existing document."; + } else { + // Should expect that no document has been deleted. 
+ if (delete_result.status().code() != StatusProto::NOT_FOUND) { + if (delete_result.status().code() == StatusProto::OK) { + FAIL() << "Deleted a non-existing document without an error."; + } + FAIL() << "Icing search engine failure (code " + << delete_result.status().code() + << "): " << delete_result.status().message(); + } + } +} + +void IcingMonkeyTestRunner::DoDeleteByNamespace() { + std::string name_space = document_generator_->GetNamespace(); + ICING_LOG(INFO) << "Monkey deleting namespace: " << name_space; + DeleteByNamespaceResultProto delete_result = + icing_->DeleteByNamespace(name_space); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t num_docs_deleted, + in_memory_icing_->DeleteByNamespace(name_space)); + if (num_docs_deleted != 0) { + ASSERT_THAT(delete_result.status(), ProtoIsOk()) + << "Cannot delete an existing namespace."; + ASSERT_THAT(delete_result.delete_stats().num_documents_deleted(), + Eq(num_docs_deleted)); + } else { + // Should expect that no document has been deleted. + if (delete_result.status().code() != StatusProto::NOT_FOUND) { + if (delete_result.status().code() == StatusProto::OK) { + FAIL() << "Deleted a non-existing namespace without an error."; + } + FAIL() << "Icing search engine failure (code " + << delete_result.status().code() + << "): " << delete_result.status().message(); + } + } +} + +void IcingMonkeyTestRunner::DoDeleteBySchemaType() { + std::string schema_type = document_generator_->GetType().schema_type(); + ICING_LOG(INFO) << "Monkey deleting type: " << schema_type; + DeleteBySchemaTypeResultProto delete_result = + icing_->DeleteBySchemaType(schema_type); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t num_docs_deleted, + in_memory_icing_->DeleteBySchemaType(schema_type)); + if (num_docs_deleted != 0) { + ASSERT_THAT(delete_result.status(), ProtoIsOk()) + << "Cannot delete an existing schema type."; + ASSERT_THAT(delete_result.delete_stats().num_documents_deleted(), + Eq(num_docs_deleted)); + } else { + // Should expect that no document 
has been deleted. + if (delete_result.status().code() != StatusProto::NOT_FOUND) { + if (delete_result.status().code() == StatusProto::OK) { + FAIL() << "Deleted a non-existing schema type without an error."; + } + FAIL() << "Icing search engine failure (code " + << delete_result.status().code() + << "): " << delete_result.status().message(); + } + } +} + +void IcingMonkeyTestRunner::DoDeleteByQuery() { + SearchSpecProto search_spec = + GenerateRandomSearchSpecProto(&random_, document_generator_.get()); + ICING_LOG(INFO) << "Monkey deleting by query: " << search_spec.query(); + DeleteByQueryResultProto delete_result = icing_->DeleteByQuery(search_spec); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t num_docs_deleted, + in_memory_icing_->DeleteByQuery(search_spec)); + if (num_docs_deleted != 0) { + ASSERT_THAT(delete_result.status(), ProtoIsOk()) + << "Cannot delete documents that matches with the query."; + ASSERT_THAT(delete_result.delete_by_query_stats().num_documents_deleted(), + Eq(num_docs_deleted)); + } else { + // Should expect that no document has been deleted. 
+ if (delete_result.status().code() != StatusProto::NOT_FOUND) { + if (delete_result.status().code() == StatusProto::OK) { + FAIL() << "Deleted documents that should not match with the query " + "without an error."; + } + FAIL() << "Icing search engine failure (code " + << delete_result.status().code() + << "): " << delete_result.status().message(); + } + } + ICING_LOG(INFO) + << delete_result.delete_by_query_stats().num_documents_deleted() + << " documents deleted by query."; +} + +void IcingMonkeyTestRunner::DoSearch() { + SearchSpecProto search_spec = + GenerateRandomSearchSpecProto(&random_, document_generator_.get()); + ScoringSpecProto scoring_spec = GenerateRandomScoringSpec(&random_); + ResultSpecProto result_spec = GenerateRandomResultSpecProto(&random_); + const ResultSpecProto::SnippetSpecProto& snippet_spec = + result_spec.snippet_spec(); + + ICING_LOG(INFO) << "Monkey searching by query: " << search_spec.query() + << ", term_match_type: " << search_spec.term_match_type(); + ICING_VLOG(1) << "search_spec:\n" << search_spec.DebugString(); + ICING_VLOG(1) << "scoring_spec:\n" << scoring_spec.DebugString(); + ICING_VLOG(1) << "result_spec:\n" << result_spec.DebugString(); + + std::vector<DocumentProto> exp_documents = + in_memory_icing_->Search(search_spec); + + SearchResultProto search_result = + icing_->Search(search_spec, scoring_spec, result_spec); + ASSERT_THAT(search_result.status(), ProtoIsOk()); + + std::vector<DocumentProto> actual_documents; + int num_snippeted = 0; + while (true) { + for (const SearchResultProto::ResultProto& doc : search_result.results()) { + actual_documents.push_back(doc.document()); + if (!doc.snippet().entries().empty()) { + ++num_snippeted; + for (const SnippetProto::EntryProto& entry : doc.snippet().entries()) { + ASSERT_THAT(entry.snippet_matches(), + SizeIs(Le(snippet_spec.num_matches_per_property()))); + } + } + } + if (search_result.next_page_token() == kInvalidNextPageToken) { + break; + } + search_result = 
icing_->GetNextPage(search_result.next_page_token()); + ASSERT_THAT(search_result.status(), ProtoIsOk()); + } + if (snippet_spec.num_matches_per_property() > 0) { + ASSERT_THAT(num_snippeted, + Eq(std::min<uint32_t>(exp_documents.size(), + snippet_spec.num_to_snippet()))); + } + SortDocuments(exp_documents); + SortDocuments(actual_documents); + ASSERT_THAT(actual_documents, SizeIs(exp_documents.size())); + for (int i = 0; i < exp_documents.size(); ++i) { + ASSERT_THAT(actual_documents[i], EqualsProto(exp_documents[i])); + } + ICING_LOG(INFO) << exp_documents.size() << " documents found by query."; +} + +void IcingMonkeyTestRunner::ReloadFromDisk() { + ICING_LOG(INFO) << "Monkey reloading from disk"; + // Destruct the icing search engine by resetting the unique pointer. + icing_.reset(); + ASSERT_NO_FATAL_FAILURE(CreateIcingSearchEngine()); +} + +void IcingMonkeyTestRunner::DoOptimize() { + ICING_LOG(INFO) << "Monkey doing optimization"; + ASSERT_THAT(icing_->Optimize().status(), ProtoIsOk()); +} + +void IcingMonkeyTestRunner::CreateIcingSearchEngine() { + IcingSearchEngineOptions icing_options; + icing_options.set_index_merge_size(kIndexMergeSize); + icing_options.set_base_dir(icing_dir_->dir()); + icing_ = std::make_unique<IcingSearchEngine>(icing_options); + ASSERT_THAT(icing_->Initialize().status(), ProtoIsOk()); +} + +} // namespace lib +} // namespace icing diff --git a/icing/monkey_test/icing-monkey-test-runner.h b/icing/monkey_test/icing-monkey-test-runner.h new file mode 100644 index 0000000..5f5649c --- /dev/null +++ b/icing/monkey_test/icing-monkey-test-runner.h @@ -0,0 +1,71 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_MONKEY_TEST_ICING_MONKEY_TEST_RUNNER_H_ +#define ICING_MONKEY_TEST_ICING_MONKEY_TEST_RUNNER_H_ + +#include <cstdint> +#include <random> + +#include "icing/file/destructible-directory.h" +#include "icing/icing-search-engine.h" +#include "icing/monkey_test/in-memory-icing-search-engine.h" +#include "icing/monkey_test/monkey-test-generators.h" + +namespace icing { +namespace lib { + +class IcingMonkeyTestRunner { + public: + IcingMonkeyTestRunner(uint32_t seed = std::random_device()()); + IcingMonkeyTestRunner(const IcingMonkeyTestRunner&) = delete; + IcingMonkeyTestRunner& operator=(const IcingMonkeyTestRunner&) = delete; + + // This function must and should only be called before running the monkey + // test. + void CreateIcingSearchEngineWithSchema(); + + // Run the monkey test with num operations. + void Run(uint32_t num); + + // APIs supported in icing search engine. + void DoGet(); + void DoGetAllNamespaces(); + void DoPut(); + void DoDelete(); + void DoDeleteByNamespace(); + void DoDeleteBySchemaType(); + void DoDeleteByQuery(); + void DoSearch(); + + // Operations with no observable side-effects. 
+ void ReloadFromDisk(); + void DoOptimize(); + + private: + MonkeyTestRandomEngine random_; + Filesystem filesystem_; + std::unique_ptr<DestructibleDirectory> icing_dir_; + std::unique_ptr<InMemoryIcingSearchEngine> in_memory_icing_; + std::unique_ptr<IcingSearchEngine> icing_; + + std::unique_ptr<MonkeyDocumentGenerator> document_generator_; + + void CreateIcingSearchEngine(); +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_MONKEY_TEST_ICING_MONKEY_TEST_RUNNER_H_ diff --git a/icing/monkey_test/icing-search-engine_monkey_test.cc b/icing/monkey_test/icing-search-engine_monkey_test.cc new file mode 100644 index 0000000..ad887b8 --- /dev/null +++ b/icing/monkey_test/icing-search-engine_monkey_test.cc @@ -0,0 +1,30 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "icing/monkey_test/icing-monkey-test-runner.h" +#include "icing/portable/platform.h" + +namespace icing { +namespace lib { + +TEST(IcingSearchEngineMonkeyTest, MonkeyTest) { + uint32_t num_iterations = IsAndroidArm() ? 
1000 : 5000; + IcingMonkeyTestRunner runner; + ASSERT_NO_FATAL_FAILURE(runner.CreateIcingSearchEngineWithSchema()); + ASSERT_NO_FATAL_FAILURE(runner.Run(num_iterations)); +} + +} // namespace lib +} // namespace icing diff --git a/icing/monkey_test/in-memory-icing-search-engine.cc b/icing/monkey_test/in-memory-icing-search-engine.cc index df94c46..405a7b0 100644 --- a/icing/monkey_test/in-memory-icing-search-engine.cc +++ b/icing/monkey_test/in-memory-icing-search-engine.cc @@ -15,10 +15,12 @@ #include "icing/monkey_test/in-memory-icing-search-engine.h" #include <cstdint> +#include <string_view> #include <unordered_set> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" #include "icing/util/status-macros.h" @@ -26,6 +28,48 @@ namespace icing { namespace lib { +namespace { + +// Check if s1 is a prefix of s2. +bool IsPrefix(std::string_view s1, std::string_view s2) { + if (s1.length() > s2.length()) { + return false; + } + return s1 == s2.substr(0, s1.length()); +} + +bool DoesDocumentMatchQuery(const MonkeyTokenizedDocument &document, + const std::string &query, + TermMatchType::Code term_match_type) { + std::vector<std::string_view> strs = absl_ports::StrSplit(query, ":"); + std::string_view query_term; + std::string_view section_restrict; + if (strs.size() > 1) { + section_restrict = strs[0]; + query_term = strs[1]; + } else { + query_term = query; + } + for (const MonkeyTokenizedSection §ion : document.tokenized_sections) { + if (!section_restrict.empty() && section.path != section_restrict) { + continue; + } + for (const std::string &token : section.token_sequence) { + if (section.term_match_type == TermMatchType::EXACT_ONLY || + term_match_type == TermMatchType::EXACT_ONLY) { + if (token == query_term) { + return true; + } + } else if (IsPrefix(query_term, token)) { + return true; + } + } + } 
+ return false; +} + +} // namespace + InMemoryIcingSearchEngine::PickDocumentResult InMemoryIcingSearchEngine::RandomPickDocument(float p_alive, float p_all, float p_other) const { @@ -108,44 +152,67 @@ libtextclassifier3::Status InMemoryIcingSearchEngine::Delete( return doc_id_or.status(); } -libtextclassifier3::Status InMemoryIcingSearchEngine::DeleteByNamespace( - const std::string &name_space) { +libtextclassifier3::StatusOr<uint32_t> +InMemoryIcingSearchEngine::DeleteByNamespace(const std::string &name_space) { std::vector<DocumentId> doc_ids_to_delete; for (DocumentId doc_id : existing_doc_ids_) { if (documents_[doc_id].document.namespace_() == name_space) { doc_ids_to_delete.push_back(doc_id); } } - if (doc_ids_to_delete.empty()) { - return absl_ports::NotFoundError(absl_ports::StrCat( - "Namespace: ", name_space, - " is not found by InMemoryIcingSearchEngine::DeleteByNamespace.")); - } for (DocumentId doc_id : doc_ids_to_delete) { const DocumentProto &document = documents_[doc_id].document; - ICING_RETURN_IF_ERROR(Delete(document.namespace_(), document.uri())); + if (!Delete(document.namespace_(), document.uri()).ok()) { + return absl_ports::InternalError( + "Should never happen. 
There are inconsistencies in the in-memory " + "Icing."); + } } - return libtextclassifier3::Status::OK; + return doc_ids_to_delete.size(); } -libtextclassifier3::Status InMemoryIcingSearchEngine::DeleteBySchemaType( - const std::string &schema_type) { +libtextclassifier3::StatusOr<uint32_t> +InMemoryIcingSearchEngine::DeleteBySchemaType(const std::string &schema_type) { std::vector<DocumentId> doc_ids_to_delete; for (DocumentId doc_id : existing_doc_ids_) { if (documents_[doc_id].document.schema() == schema_type) { doc_ids_to_delete.push_back(doc_id); } } - if (doc_ids_to_delete.empty()) { - return absl_ports::NotFoundError(absl_ports::StrCat( - "Type: ", schema_type, - " is not found by InMemoryIcingSearchEngine::DeleteBySchemaType.")); + for (DocumentId doc_id : doc_ids_to_delete) { + const DocumentProto &document = documents_[doc_id].document; + if (!Delete(document.namespace_(), document.uri()).ok()) { + return absl_ports::InternalError( + "Should never happen. There are inconsistencies in the in-memory " + "Icing."); + } } + return doc_ids_to_delete.size(); +} + +libtextclassifier3::StatusOr<uint32_t> InMemoryIcingSearchEngine::DeleteByQuery( + const SearchSpecProto &search_spec) { + std::vector<DocumentId> doc_ids_to_delete = InternalSearch(search_spec); for (DocumentId doc_id : doc_ids_to_delete) { const DocumentProto &document = documents_[doc_id].document; - ICING_RETURN_IF_ERROR(Delete(document.namespace_(), document.uri())); + if (!Delete(document.namespace_(), document.uri()).ok()) { + return absl_ports::InternalError( + "Should never happen. 
There are inconsistencies in the in-memory " + "Icing."); + } + } + return doc_ids_to_delete.size(); +} + +std::vector<DocumentProto> InMemoryIcingSearchEngine::Search( + const SearchSpecProto &search_spec) const { + std::vector<DocumentId> matched_doc_ids = InternalSearch(search_spec); + std::vector<DocumentProto> result; + result.reserve(matched_doc_ids.size()); + for (DocumentId doc_id : matched_doc_ids) { + result.push_back(documents_[doc_id].document); } - return libtextclassifier3::Status::OK; + return result; } libtextclassifier3::StatusOr<DocumentId> InMemoryIcingSearchEngine::InternalGet( @@ -162,5 +229,17 @@ libtextclassifier3::StatusOr<DocumentId> InMemoryIcingSearchEngine::InternalGet( " is not found by InMemoryIcingSearchEngine::InternalGet.")); } +std::vector<DocumentId> InMemoryIcingSearchEngine::InternalSearch( + const SearchSpecProto &search_spec) const { + std::vector<DocumentId> matched_doc_ids; + for (DocumentId doc_id : existing_doc_ids_) { + if (DoesDocumentMatchQuery(documents_[doc_id], search_spec.query(), + search_spec.term_match_type())) { + matched_doc_ids.push_back(doc_id); + } + } + return matched_doc_ids; +} + } // namespace lib } // namespace icing diff --git a/icing/monkey_test/in-memory-icing-search-engine.h b/icing/monkey_test/in-memory-icing-search-engine.h index 0c6c03b..a5d8872 100644 --- a/icing/monkey_test/in-memory-icing-search-engine.h +++ b/icing/monkey_test/in-memory-icing-search-engine.h @@ -27,6 +27,7 @@ #include "icing/monkey_test/monkey-tokenized-document.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" +#include "icing/proto/search.pb.h" #include "icing/store/document-id.h" namespace icing { @@ -47,6 +48,8 @@ class InMemoryIcingSearchEngine { : random_(random), schema_(std::make_unique<SchemaProto>(std::move(schema))) {} + uint32_t GetNumAliveDocuments() const { return existing_doc_ids_.size(); } + const SchemaProto *GetSchema() const { return schema_.get(); } // Randomly pick a document from 
the in-memory Icing for monkey testing. @@ -81,16 +84,35 @@ class InMemoryIcingSearchEngine { // Deletes all Documents belonging to the specified namespace. // // Returns: - // OK on success - // NOT_FOUND if namespace doesn't exist - libtextclassifier3::Status DeleteByNamespace(const std::string &name_space); + // The number of deleted documents on success + // INTERNAL_ERROR if there are inconsistencies in the in-memory Icing + libtextclassifier3::StatusOr<uint32_t> DeleteByNamespace( + const std::string &name_space); // Deletes all Documents belonging to the specified type // // Returns: - // OK on success - // NOT_FOUND if schema type doesn't exist - libtextclassifier3::Status DeleteBySchemaType(const std::string &schema_type); + // The number of deleted documents on success + // INTERNAL_ERROR if there are inconsistencies in the in-memory Icing + libtextclassifier3::StatusOr<uint32_t> DeleteBySchemaType( + const std::string &schema_type); + + // Deletes all Documents that match the query specified in search_spec. + // Currently, only the "query" and "term_match_type" fields are recognized by + // the in-memory Icing, and only single term queries with possible section + // restrictions are supported. + // + // Returns: + // The number of deleted documents on success + // INTERNAL_ERROR if there are inconsistencies in the in-memory Icing + libtextclassifier3::StatusOr<uint32_t> DeleteByQuery( + const SearchSpecProto &search_spec); + + // Retrieves documents according to search_spec. + // Currently, only the "query" and "term_match_type" fields are recognized by + // the in-memory Icing, and only single term queries with possible section + // restrictions are supported. + std::vector<DocumentProto> Search(const SearchSpecProto &search_spec) const; private: // Does not own. 
@@ -113,6 +135,11 @@ class InMemoryIcingSearchEngine { // NOT_FOUND if the key doesn't exist or doc has been deleted libtextclassifier3::StatusOr<DocumentId> InternalGet( const std::string &name_space, const std::string &uri) const; + + // A helper method for DeleteByQuery and Search to get matched internal doc + // ids. + std::vector<DocumentId> InternalSearch( + const SearchSpecProto &search_spec) const; }; } // namespace lib diff --git a/icing/monkey_test/monkey-test-generators.cc b/icing/monkey_test/monkey-test-generators.cc index b0fdf10..88fc0b6 100644 --- a/icing/monkey_test/monkey-test-generators.cc +++ b/icing/monkey_test/monkey-test-generators.cc @@ -153,8 +153,9 @@ MonkeyTokenizedDocument MonkeyDocumentGenerator::GenerateDocument() { if (prop.data_type() == PropertyConfigProto::DataType::STRING && prop.string_indexing_config().term_match_type() != TermMatchType::UNKNOWN) { - MonkeyTokenizedSection section = {prop.property_name(), - std::move(prop_content)}; + MonkeyTokenizedSection section = { + prop.property_name(), prop.string_indexing_config().term_match_type(), + std::move(prop_content)}; document.tokenized_sections.push_back(std::move(section)); } } diff --git a/icing/monkey_test/monkey-test-generators.h b/icing/monkey_test/monkey-test-generators.h index cc4505f..68c5e92 100644 --- a/icing/monkey_test/monkey-test-generators.h +++ b/icing/monkey_test/monkey-test-generators.h @@ -77,7 +77,7 @@ class MonkeyDocumentGenerator { num_namespaces_(num_namespaces), num_uris_(num_uris) {} - SchemaTypeConfigProto GetType() const { + const SchemaTypeConfigProto& GetType() const { std::uniform_int_distribution<> dist(0, schema_->types_size() - 1); return schema_->types(dist(*random_)); } diff --git a/icing/monkey_test/monkey-tokenized-document.h b/icing/monkey_test/monkey-tokenized-document.h index 1d77fc8..a0b38c2 100644 --- a/icing/monkey_test/monkey-tokenized-document.h +++ b/icing/monkey_test/monkey-tokenized-document.h @@ -18,12 +18,14 @@ #include <string> 
#include "icing/proto/document.pb.h" +#include "icing/proto/term.pb.h" namespace icing { namespace lib { struct MonkeyTokenizedSection { std::string path; + TermMatchType::Code term_match_type; std::vector<std::string> token_sequence; }; diff --git a/icing/query/query-processor.cc b/icing/query/query-processor.cc index c0a5df1..90587aa 100644 --- a/icing/query/query-processor.cc +++ b/icing/query/query-processor.cc @@ -288,16 +288,15 @@ libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseRawQuery( // section restricts. Those are not currently supported. If they became // supported, this handling for query terms would need to be altered. if (!frames.top().saw_exclude) { - ICING_ASSIGN_OR_RETURN( - std::unique_ptr<DocHitInfoIterator> term_iterator, - index_.GetIterator( - normalized_text, kSectionIdMaskAll, - search_spec.term_match_type(), - /*need_hit_term_frequency=*/ranking_strategy == - ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE)); - if (ranking_strategy == ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE) { + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<DocHitInfoIterator> term_iterator, + index_.GetIterator( + normalized_text, kSectionIdMaskAll, + search_spec.term_match_type(), + /*need_hit_term_frequency=*/ranking_strategy == + ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE)); results.query_term_iterators[normalized_text] = std::make_unique<DocHitInfoIteratorFilter>( std::move(term_iterator), &document_store_, &schema_store_, diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc index 459e10e..da35df8 100644 --- a/icing/query/query-processor_test.cc +++ b/icing/query/query-processor_test.cc @@ -62,17 +62,6 @@ using ::testing::SizeIs; using ::testing::Test; using ::testing::UnorderedElementsAre; -constexpr PropertyConfigProto::DataType::Code TYPE_STRING = - PropertyConfigProto::DataType::STRING; - -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - 
PropertyConfigProto::Cardinality::OPTIONAL; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; - -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; - class QueryProcessorTest : public ::testing::TestWithParam<SearchSpecProto::SearchType::Code> { protected: @@ -2417,7 +2406,7 @@ TEST_P(QueryProcessorTest, PropertyFilterForOneDocument) { .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); // First and only indexed property, so it gets a section_id of 0 @@ -2486,20 +2475,20 @@ TEST_P(QueryProcessorTest, PropertyFilterAcrossSchemaTypes) { .AddType(SchemaTypeConfigBuilder() .SetType("email") // Section "a" would get sectionId 0 - .AddProperty( - PropertyConfigBuilder() - .SetName("a") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("foo") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("a") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty( PropertyConfigBuilder() .SetName("foo") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); @@ -2583,12 +2572,12 @@ TEST_P(QueryProcessorTest, PropertyFilterWithinSchemaType) { .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( PropertyConfigBuilder() 
.SetName("foo") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty( PropertyConfigBuilder() .SetName("foo") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); int email_foo_section_id = 0; @@ -2691,11 +2680,11 @@ TEST_P(QueryProcessorTest, NestedPropertyFilter) { .SetType("Bar") // Add an unindexed property so we generate section // metadata on it - .AddProperty( - PropertyConfigBuilder() - .SetName("baz") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("baz") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); ICING_ASSERT_OK_AND_ASSIGN( @@ -2764,12 +2753,12 @@ TEST_P(QueryProcessorTest, PropertyFilterRespectsDifferentSectionIds) { .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( PropertyConfigBuilder() .SetName("foo") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty( PropertyConfigBuilder() .SetName("bar") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); int email_foo_section_id = 0; @@ -2983,12 +2972,12 @@ TEST_P(QueryProcessorTest, PropertyFilterTermAndUnrestrictedTerm) { .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( PropertyConfigBuilder() .SetName("foo") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) 
.AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty( PropertyConfigBuilder() .SetName("foo") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); int email_foo_section_id = 0; diff --git a/icing/result/result-retriever-v2_projection_test.cc b/icing/result/result-retriever-v2_projection_test.cc index cb0de0b..ec67caa 100644 --- a/icing/result/result-retriever-v2_projection_test.cc +++ b/icing/result/result-retriever-v2_projection_test.cc @@ -51,15 +51,6 @@ namespace { using ::icing::lib::portable_equals_proto::EqualsProto; using ::testing::SizeIs; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; - -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; - class ResultRetrieverV2ProjectionTest : public testing::Test { protected: ResultRetrieverV2ProjectionTest() : test_dir_(GetTestTempDir() + "/icing") { @@ -90,12 +81,12 @@ class ResultRetrieverV2ProjectionTest : public testing::Test { .SetType("Email") .AddProperty(PropertyConfigBuilder() .SetName("name") - .SetDataTypeString(MATCH_PREFIX, + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_EXACT, + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty( @@ -107,16 +98,16 @@ class ResultRetrieverV2ProjectionTest : public testing::Test { .AddType( SchemaTypeConfigBuilder() .SetType("Person") - .AddProperty( - PropertyConfigBuilder() - .SetName("name") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - 
PropertyConfigBuilder() - .SetName("emailAddress") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("emailAddress") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); diff --git a/icing/result/result-retriever-v2_snippet_test.cc b/icing/result/result-retriever-v2_snippet_test.cc index 0643e9b..9384d6b 100644 --- a/icing/result/result-retriever-v2_snippet_test.cc +++ b/icing/result/result-retriever-v2_snippet_test.cc @@ -56,15 +56,6 @@ using ::testing::Eq; using ::testing::IsEmpty; using ::testing::SizeIs; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; - -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; - class ResultRetrieverV2SnippetTest : public testing::Test { protected: ResultRetrieverV2SnippetTest() : test_dir_(GetTestTempDir() + "/icing") { @@ -95,12 +86,12 @@ class ResultRetrieverV2SnippetTest : public testing::Test { .SetType("Email") .AddProperty(PropertyConfigBuilder() .SetName("name") - .SetDataTypeString(MATCH_PREFIX, + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_EXACT, + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty( @@ -112,16 +103,16 @@ class ResultRetrieverV2SnippetTest : public testing::Test { .AddType( 
SchemaTypeConfigBuilder() .SetType("Person") - .AddProperty( - PropertyConfigBuilder() - .SetName("name") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("emailAddress") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("emailAddress") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); diff --git a/icing/result/result-retriever-v2_test.cc b/icing/result/result-retriever-v2_test.cc index 5d66be2..0fb2ba0 100644 --- a/icing/result/result-retriever-v2_test.cc +++ b/icing/result/result-retriever-v2_test.cc @@ -64,15 +64,6 @@ using ::testing::Return; using ::testing::SizeIs; using NamespaceIdMap = std::unordered_map<NamespaceId, int>; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; - -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; - // Mock the behavior of GroupResultLimiter::ShouldBeRemoved. 
class MockGroupResultLimiter : public GroupResultLimiterV2 { public: @@ -116,12 +107,12 @@ class ResultRetrieverV2Test : public ::testing::Test { .SetType("Email") .AddProperty(PropertyConfigBuilder() .SetName("name") - .SetDataTypeString(MATCH_PREFIX, + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_EXACT, + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty( @@ -133,16 +124,16 @@ class ResultRetrieverV2Test : public ::testing::Test { .AddType( SchemaTypeConfigBuilder() .SetType("Person") - .AddProperty( - PropertyConfigBuilder() - .SetName("name") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("emailAddress") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("emailAddress") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc index 1b2b359..e0b4875 100644 --- a/icing/result/result-retriever_test.cc +++ b/icing/result/result-retriever_test.cc @@ -55,15 +55,6 @@ using ::testing::IsEmpty; using ::testing::Return; using ::testing::SizeIs; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; - -constexpr TermMatchType::Code MATCH_EXACT = 
TermMatchType::EXACT_ONLY; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; - class ResultRetrieverTest : public testing::Test { protected: ResultRetrieverTest() : test_dir_(GetTestTempDir() + "/icing") { @@ -94,12 +85,12 @@ class ResultRetrieverTest : public testing::Test { .SetType("Email") .AddProperty(PropertyConfigBuilder() .SetName("name") - .SetDataTypeString(MATCH_PREFIX, + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_EXACT, + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty( @@ -111,16 +102,16 @@ class ResultRetrieverTest : public testing::Test { .AddType( SchemaTypeConfigBuilder() .SetType("Person") - .AddProperty( - PropertyConfigBuilder() - .SetName("name") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("emailAddress") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("emailAddress") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); } diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc index 604ad3d..8044b8d 100644 --- a/icing/result/snippet-retriever.cc +++ b/icing/result/snippet-retriever.cc @@ -98,20 +98,6 @@ std::string NormalizeToken(const Normalizer& normalizer, const Token& token) { [[fallthrough]]; case Token::Type::RFC822_TOKEN: [[fallthrough]]; - case Token::Type::REGULAR: - return normalizer.NormalizeTerm(token.text); - case Token::Type::VERBATIM: - 
return std::string(token.text); - case Token::Type::QUERY_EXCLUSION: - [[fallthrough]]; - case Token::Type::QUERY_LEFT_PARENTHESES: - [[fallthrough]]; - case Token::Type::QUERY_RIGHT_PARENTHESES: - [[fallthrough]]; - case Token::Type::QUERY_OR: - [[fallthrough]]; - case Token::Type::QUERY_PROPERTY: - [[fallthrough]]; case Token::Type::URL_SCHEME: [[fallthrough]]; case Token::Type::URL_USERNAME: @@ -134,6 +120,20 @@ std::string NormalizeToken(const Normalizer& normalizer, const Token& token) { [[fallthrough]]; case Token::Type::URL_SUFFIX_INNERMOST: [[fallthrough]]; + case Token::Type::REGULAR: + return normalizer.NormalizeTerm(token.text); + case Token::Type::VERBATIM: + return std::string(token.text); + case Token::Type::QUERY_EXCLUSION: + [[fallthrough]]; + case Token::Type::QUERY_LEFT_PARENTHESES: + [[fallthrough]]; + case Token::Type::QUERY_RIGHT_PARENTHESES: + [[fallthrough]]; + case Token::Type::QUERY_OR: + [[fallthrough]]; + case Token::Type::QUERY_PROPERTY: + [[fallthrough]]; case Token::Type::INVALID: ICING_LOG(WARNING) << "Unable to normalize token of type: " << static_cast<int>(token.type); @@ -166,6 +166,11 @@ CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token, [[fallthrough]]; case Token::Type::QUERY_PROPERTY: [[fallthrough]]; + case Token::Type::INVALID: + ICING_LOG(WARNING) + << "Unexpected Token type " << static_cast<int>(token.type) + << " found when finding match end of query term and token."; + [[fallthrough]]; case Token::Type::RFC822_NAME: [[fallthrough]]; case Token::Type::RFC822_COMMENT: @@ -204,11 +209,6 @@ CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token, [[fallthrough]]; case Token::Type::URL_SUFFIX_INNERMOST: [[fallthrough]]; - case Token::Type::INVALID: - ICING_LOG(WARNING) - << "Unexpected Token type " << static_cast<int>(token.type) - << " found when finding match end of query term and token."; - [[fallthrough]]; case Token::Type::REGULAR: return 
normalizer.FindNormalizedMatchEndPosition(token.text, match_query_term); @@ -336,7 +336,9 @@ libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart( CharacterIterator IncludeTrailingPunctuation( std::string_view value, CharacterIterator window_end_exclusive, int window_end_max_exclusive_utf32) { - while (window_end_exclusive.utf32_index() < window_end_max_exclusive_utf32) { + size_t max_search_index = value.length() - 1; + while (window_end_exclusive.utf8_index() <= max_search_index && + window_end_exclusive.utf32_index() < window_end_max_exclusive_utf32) { int char_len = 0; if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive.utf8_index(), &char_len)) { diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc index 24f8a0a..0940b51 100644 --- a/icing/result/snippet-retriever_test.cc +++ b/icing/result/snippet-retriever_test.cc @@ -58,20 +58,12 @@ using ::testing::Eq; using ::testing::IsEmpty; using ::testing::SizeIs; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = - PropertyConfigProto::Cardinality::REPEATED; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM = - StringIndexingConfig::TokenizerType::VERBATIM; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_RFC822 = - StringIndexingConfig::TokenizerType::RFC822; - -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; +// TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export +// to Android. 
Also move it to schema-builder.h +#ifdef ENABLE_URL_TOKENIZER +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_URL = + StringIndexingConfig::TokenizerType::URL; +#endif // ENABLE_URL_TOKENIZER std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) { std::vector<std::string_view> paths; @@ -110,16 +102,16 @@ class SnippetRetrieverTest : public testing::Test { .AddType( SchemaTypeConfigBuilder() .SetType("email") - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); ICING_ASSERT_OK(schema_store_->SetSchema(schema)); @@ -184,7 +176,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) { // "three". len=4, orig_window= "thre" snippet_spec_.set_max_window_utf32_length(4); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -210,7 +202,7 @@ TEST_F(SnippetRetrieverTest, // "three". 
len=5, orig_window= "three" snippet_spec_.set_max_window_utf32_length(5); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -236,7 +228,7 @@ TEST_F(SnippetRetrieverTest, // "four". len=4, orig_window= "four" snippet_spec_.set_max_window_utf32_length(4); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -268,7 +260,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) { // 3. trimmed, shifted window [4,18) "two three four" snippet_spec_.set_max_window_utf32_length(14); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -301,7 +293,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) { // 3. trimmed, shifted window [4,20) "two three four.." snippet_spec_.set_max_window_utf32_length(16); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -327,7 +319,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) { // len=20, orig_window="one two three four.." 
snippet_spec_.set_max_window_utf32_length(20); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -355,7 +347,7 @@ TEST_F(SnippetRetrieverTest, // len=26, orig_window="pside down in Australia¿" snippet_spec_.set_max_window_utf32_length(24); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -383,7 +375,7 @@ TEST_F(SnippetRetrieverTest, // len=26, orig_window="upside down in Australia¿ " snippet_spec_.set_max_window_utf32_length(26); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -416,7 +408,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) { // 3. trimmed, shifted window [0,22) "one two three four...." snippet_spec_.set_max_window_utf32_length(22); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -442,7 +434,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) { // len=26, orig_window="one two three four.... 
" snippet_spec_.set_max_window_utf32_length(26); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -475,7 +467,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) { // 3. trimmed, shifted window [0,27) "one two three four.... five" snippet_spec_.set_max_window_utf32_length(32); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -501,7 +493,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) { // len=34, orig_window="one two three four.... five" snippet_spec_.set_max_window_utf32_length(34); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -527,7 +519,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) { // len=36, orig_window="one two three four.... five" snippet_spec_.set_max_window_utf32_length(36); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -561,7 +553,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) { // 3. 
trimmed, shifted window [0,27) "one two three four.... five" snippet_spec_.set_max_window_utf32_length(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -595,7 +587,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) { // 3. trimmed, shifted window [4,31) "two three four.... five six" snippet_spec_.set_max_window_utf32_length(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -629,7 +621,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) { // 3. trimmed, shifted window [0, 22) "one two three four...." snippet_spec_.set_max_window_utf32_length(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -663,7 +655,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) { // 3. trimmed, shifted window [0, 22) "one two three four...." 
snippet_spec_.set_max_window_utf32_length(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -684,7 +676,7 @@ TEST_F(SnippetRetrieverTest, PrefixSnippeting) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"f"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask); // Check the snippets. 'f' should match prefix-enabled property 'subject', but // not exact-only property 'body' @@ -710,7 +702,7 @@ TEST_F(SnippetRetrieverTest, ExactSnippeting) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"f"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); // Check the snippets EXPECT_THAT(snippet.entries(), IsEmpty()); @@ -730,7 +722,7 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); // Check the snippets EXPECT_THAT(snippet.entries(), SizeIs(1)); @@ -764,7 +756,7 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, 
document, section_mask); + query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask); // Check the snippets EXPECT_THAT(snippet.entries(), SizeIs(2)); @@ -822,7 +814,7 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) { SectionIdMask section_mask = 0b00000001; SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask); // Check the snippets EXPECT_THAT(snippet.entries(), SizeIs(1)); @@ -874,7 +866,7 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) { SectionRestrictQueryTermsMap query_terms{{"", {"subject"}}, {"body", {"foo"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask); // Check the snippets EXPECT_THAT(snippet.entries(), SizeIs(2)); @@ -933,7 +925,7 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask); // Check the snippets EXPECT_THAT(snippet.entries(), SizeIs(2)); @@ -970,7 +962,7 @@ TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"md"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); 
EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject")); @@ -993,7 +985,7 @@ TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"zurich"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); @@ -1012,21 +1004,21 @@ TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType("SingleLevelType") - .AddProperty( - PropertyConfigBuilder() - .SetName("X") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED)) - .AddProperty( - PropertyConfigBuilder() - .SetName("Y") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED)) - .AddProperty( - PropertyConfigBuilder() - .SetName("Z") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED))) + .AddProperty(PropertyConfigBuilder() + .SetName("X") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("Y") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("Z") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED))) .Build(); ICING_ASSERT_OK(schema_store_->SetSchema( schema, /*ignore_errors_and_delete_documents=*/true)); @@ -1057,7 +1049,7 @@ TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) { SectionIdMask section_mask = 0b00000111; SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, 
snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(6)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("X[1]")); @@ -1082,21 +1074,21 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType("SingleLevelType") - .AddProperty( - PropertyConfigBuilder() - .SetName("X") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED)) - .AddProperty( - PropertyConfigBuilder() - .SetName("Y") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED)) - .AddProperty( - PropertyConfigBuilder() - .SetName("Z") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED))) + .AddProperty(PropertyConfigBuilder() + .SetName("X") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("Y") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("Z") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED))) .AddType(SchemaTypeConfigBuilder() .SetType("MultiLevelType") .AddProperty(PropertyConfigBuilder() @@ -1160,7 +1152,7 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) { SectionIdMask section_mask = 0b111111111; SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(18)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("A.X[1]")); @@ -1188,21 +1180,21 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) { SchemaBuilder() 
.AddType(SchemaTypeConfigBuilder() .SetType("SingleLevelType") - .AddProperty( - PropertyConfigBuilder() - .SetName("X") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED)) - .AddProperty( - PropertyConfigBuilder() - .SetName("Y") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED)) - .AddProperty( - PropertyConfigBuilder() - .SetName("Z") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED))) + .AddProperty(PropertyConfigBuilder() + .SetName("X") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("Y") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("Z") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED))) .AddType(SchemaTypeConfigBuilder() .SetType("MultiLevelType") .AddProperty(PropertyConfigBuilder() @@ -1269,7 +1261,7 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) { SectionIdMask section_mask = 0b111111111; SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(36)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("A[0].X[1]")); @@ -1302,21 +1294,21 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType("SingleLevelType") - .AddProperty( - PropertyConfigBuilder() - .SetName("X") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("Y") - .SetDataTypeString(MATCH_PREFIX, 
TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("Z") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("X") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("Y") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("Z") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(SchemaTypeConfigBuilder() .SetType("MultiLevelType") .AddProperty(PropertyConfigBuilder() @@ -1376,7 +1368,7 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) { SectionIdMask section_mask = 0b111111111; SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(12)); EXPECT_THAT(snippet.entries(0).property_name(), Eq("A[0].X")); @@ -1419,7 +1411,7 @@ TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) { SectionRestrictQueryTermsMap query_terms{{"", {"走"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask); // Ensure that one and only one property was matched and it was "body" ASSERT_THAT(snippet.entries(), SizeIs(1)); @@ -1480,7 +1472,7 @@ TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) { snippet_spec_.set_max_window_utf32_length(6); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + query_terms, 
TERM_MATCH_PREFIX, snippet_spec_, document, section_mask); // Ensure that one and only one property was matched and it was "body" ASSERT_THAT(snippet.entries(), SizeIs(1)); @@ -1524,7 +1516,7 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) { SectionRestrictQueryTermsMap query_terms{{"", {"𐀂"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask); // Ensure that one and only one property was matched and it was "body" ASSERT_THAT(snippet.entries(), SizeIs(1)); @@ -1579,7 +1571,7 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) { snippet_spec_.set_max_window_utf32_length(6); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask); // Ensure that one and only one property was matched and it was "body" ASSERT_THAT(snippet.entries(), SizeIs(1)); @@ -1607,7 +1599,7 @@ TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) { .SetType("verbatimType") .AddProperty(PropertyConfigBuilder() .SetName("verbatim") - .SetDataTypeString(MATCH_EXACT, + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_VERBATIM) .SetCardinality(CARDINALITY_REPEATED))) .Build(); @@ -1629,7 +1621,7 @@ TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) { snippet_spec_.set_max_window_utf32_length(13); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask); // There should only be one snippet entry and match, the verbatim token in its // entirety. 
@@ -1660,7 +1652,7 @@ TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) { .SetType("verbatimType") .AddProperty(PropertyConfigBuilder() .SetName("verbatim") - .SetDataTypeString(MATCH_PREFIX, + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_VERBATIM) .SetCardinality(CARDINALITY_REPEATED))) .Build(); @@ -1689,7 +1681,7 @@ TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) { snippet_spec_.set_max_window_utf32_length(9); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask); // There should only be one snippet entry and match, the verbatim token in its // entirety. @@ -1718,7 +1710,7 @@ TEST_F(SnippetRetrieverTest, SnippettingRfc822Ascii) { .SetType("rfc822Type") .AddProperty(PropertyConfigBuilder() .SetName("rfc822") - .SetDataTypeString(MATCH_PREFIX, + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822) .SetCardinality(CARDINALITY_REPEATED))) .Build(); @@ -1747,7 +1739,7 @@ TEST_F(SnippetRetrieverTest, SnippettingRfc822Ascii) { snippet_spec_.set_max_window_utf32_length(35); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask); ASSERT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), "rfc822"); @@ -1768,7 +1760,7 @@ TEST_F(SnippetRetrieverTest, SnippettingRfc822Ascii) { snippet_spec_.set_max_window_utf32_length(36); snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask); ASSERT_THAT(snippet.entries(), SizeIs(1)); EXPECT_THAT(snippet.entries(0).property_name(), "rfc822"); @@ -1793,7 +1785,7 @@ TEST_F(SnippetRetrieverTest, SnippettingRfc822CJK) { .SetType("rfc822Type") 
.AddProperty(PropertyConfigBuilder() .SetName("rfc822") - .SetDataTypeString(MATCH_PREFIX, + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822) .SetCardinality(CARDINALITY_REPEATED))) .Build(); @@ -1819,7 +1811,7 @@ TEST_F(SnippetRetrieverTest, SnippettingRfc822CJK) { snippet_spec_.set_max_window_utf32_length(8); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask); // There should only be one snippet entry and match, the local component token ASSERT_THAT(snippet.entries(), SizeIs(1)); @@ -1839,6 +1831,174 @@ TEST_F(SnippetRetrieverTest, SnippettingRfc822CJK) { ElementsAre("走", "走")); } +#ifdef ENABLE_URL_TOKENIZER +TEST_F(SnippetRetrieverTest, SnippettingUrlAscii) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("urlType") + .AddProperty(PropertyConfigBuilder() + .SetName("url") + .SetDataTypeString(MATCH_PREFIX, + TOKENIZER_URL) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/true)); + + ICING_ASSERT_OK_AND_ASSIGN( + snippet_retriever_, + SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), + normalizer_.get())); + + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "url/1") + .SetSchema("urlType") + .AddStringProperty("url", "https://mail.google.com/calendar/google/") + .Build(); + + SectionIdMask section_mask = 0b00000001; + + // Query with single url split-token match + SectionRestrictQueryTermsMap query_terms{{"", {"com"}}}; + // 40 is the length of the url. + // Window that is the size of the url should return entire url. 
+ snippet_spec_.set_max_window_utf32_length(40); + + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + + ASSERT_THAT(snippet.entries(), SizeIs(1)); + EXPECT_THAT(snippet.entries(0).property_name(), "url"); + + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("https://mail.google.com/calendar/google/")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("com")); + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("com")); + + // Query with single url suffix-token match + query_terms = SectionRestrictQueryTermsMap{{"", {"mail.goo"}}}; + snippet_spec_.set_max_window_utf32_length(40); + + snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + + ASSERT_THAT(snippet.entries(), SizeIs(1)); + EXPECT_THAT(snippet.entries(0).property_name(), "url"); + + content = GetString(&document, snippet.entries(0).property_name()); + + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("https://mail.google.com/calendar/google/")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), + ElementsAre("mail.google.com/calendar/google/")); + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("mail.goo")); + + // Query with multiple url split-token matches + query_terms = SectionRestrictQueryTermsMap{{"", {"goog"}}}; + snippet_spec_.set_max_window_utf32_length(40); + + snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + + ASSERT_THAT(snippet.entries(), SizeIs(1)); + EXPECT_THAT(snippet.entries(0).property_name(), "url"); + + content = GetString(&document, snippet.entries(0).property_name()); + + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("https://mail.google.com/calendar/google/", + 
"https://mail.google.com/calendar/google/")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), + ElementsAre("google", "google")); + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("goog", "goog")); + + // Query with both url split-token and suffix-token matches + query_terms = SectionRestrictQueryTermsMap{{"", {"mail"}}}; + snippet_spec_.set_max_window_utf32_length(40); + + snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + + ASSERT_THAT(snippet.entries(), SizeIs(1)); + EXPECT_THAT(snippet.entries(0).property_name(), "url"); + + content = GetString(&document, snippet.entries(0).property_name()); + + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("https://mail.google.com/calendar/google/", + "https://mail.google.com/calendar/google/")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), + ElementsAre("mail", "mail.google.com/calendar/google/")); + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("mail", "mail")); + + // Prefix query with both url split-token and suffix-token matches + query_terms = SectionRestrictQueryTermsMap{{"", {"http"}}}; + snippet_spec_.set_max_window_utf32_length(40); + + snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + + ASSERT_THAT(snippet.entries(), SizeIs(1)); + EXPECT_THAT(snippet.entries(0).property_name(), "url"); + + content = GetString(&document, snippet.entries(0).property_name()); + + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("https://mail.google.com/calendar/google/", + "https://mail.google.com/calendar/google/")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), + ElementsAre("https", "https://mail.google.com/calendar/google/")); + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("http", "http")); + + // Window that's smaller than the input size should not return any 
matches. + query_terms = SectionRestrictQueryTermsMap{{"", {"google"}}}; + snippet_spec_.set_max_window_utf32_length(10); + + snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + + ASSERT_THAT(snippet.entries(), SizeIs(0)); + + // Test case with more than two matches + document = + DocumentBuilder() + .SetKey("icing", "url/1") + .SetSchema("urlType") + .AddStringProperty("url", "https://www.google.com/calendar/google/") + .Build(); + + // Prefix query with both url split-token and suffix-token matches + query_terms = SectionRestrictQueryTermsMap{{"", {"google"}}}; + snippet_spec_.set_max_window_utf32_length(39); + + snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + + ASSERT_THAT(snippet.entries(), SizeIs(1)); + EXPECT_THAT(snippet.entries(0).property_name(), "url"); + + content = GetString(&document, snippet.entries(0).property_name()); + + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("https://www.google.com/calendar/google/", + "https://www.google.com/calendar/google/", + "https://www.google.com/calendar/google/")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), + ElementsAre("google", "google", "google.com/calendar/google/")); + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("google", "google", "google")); +} +#endif // ENABLE_URL_TOKENIZER + } // namespace } // namespace lib diff --git a/icing/schema-builder.h b/icing/schema-builder.h index 3bc4527..ea0a774 100644 --- a/icing/schema-builder.h +++ b/icing/schema-builder.h @@ -27,6 +27,48 @@ namespace icing { namespace lib { +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_UNKNOWN = + PropertyConfigProto::Cardinality::UNKNOWN; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = + PropertyConfigProto::Cardinality::REPEATED; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + 
PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = + PropertyConfigProto::Cardinality::REQUIRED; + +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE = + StringIndexingConfig::TokenizerType::NONE; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM = + StringIndexingConfig::TokenizerType::VERBATIM; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_RFC822 = + StringIndexingConfig::TokenizerType::RFC822; + +constexpr TermMatchType::Code TERM_MATCH_UNKNOWN = TermMatchType::UNKNOWN; +constexpr TermMatchType::Code TERM_MATCH_EXACT = TermMatchType::EXACT_ONLY; +constexpr TermMatchType::Code TERM_MATCH_PREFIX = TermMatchType::PREFIX; + +constexpr IntegerIndexingConfig::NumericMatchType::Code NUMERIC_MATCH_UNKNOWN = + IntegerIndexingConfig::NumericMatchType::UNKNOWN; +constexpr IntegerIndexingConfig::NumericMatchType::Code NUMERIC_MATCH_RANGE = + IntegerIndexingConfig::NumericMatchType::RANGE; + +constexpr PropertyConfigProto::DataType::Code TYPE_UNKNOWN = + PropertyConfigProto::DataType::UNKNOWN; +constexpr PropertyConfigProto::DataType::Code TYPE_STRING = + PropertyConfigProto::DataType::STRING; +constexpr PropertyConfigProto::DataType::Code TYPE_INT64 = + PropertyConfigProto::DataType::INT64; +constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE = + PropertyConfigProto::DataType::DOUBLE; +constexpr PropertyConfigProto::DataType::Code TYPE_BOOLEAN = + PropertyConfigProto::DataType::BOOLEAN; +constexpr PropertyConfigProto::DataType::Code TYPE_BYTES = + PropertyConfigProto::DataType::BYTES; +constexpr PropertyConfigProto::DataType::Code TYPE_DOCUMENT = + PropertyConfigProto::DataType::DOCUMENT; + class PropertyConfigBuilder { public: PropertyConfigBuilder() = default; @@ -53,6 +95,14 @@ class PropertyConfigBuilder { return *this; } + 
PropertyConfigBuilder& SetDataTypeInt64( + IntegerIndexingConfig::NumericMatchType::Code numeric_match_type) { + property_.set_data_type(PropertyConfigProto::DataType::INT64); + property_.mutable_integer_indexing_config()->set_numeric_match_type( + numeric_match_type); + return *this; + } + PropertyConfigBuilder& SetDataTypeDocument(std::string_view schema_type, bool index_nested_properties) { property_.set_data_type(PropertyConfigProto::DataType::DOCUMENT); diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc index b1a2fe5..5f4baa8 100644 --- a/icing/schema/schema-store.cc +++ b/icing/schema/schema-store.cc @@ -516,14 +516,16 @@ libtextclassifier3::StatusOr<std::vector<std::string_view>> SchemaStore::GetStringSectionContent(const DocumentProto& document, std::string_view section_path) const { ICING_RETURN_IF_ERROR(CheckSchemaSet()); - return section_manager_->GetStringSectionContent(document, section_path); + return section_manager_->GetSectionContent<std::string_view>(document, + section_path); } libtextclassifier3::StatusOr<std::vector<std::string_view>> SchemaStore::GetStringSectionContent(const DocumentProto& document, SectionId section_id) const { ICING_RETURN_IF_ERROR(CheckSchemaSet()); - return section_manager_->GetStringSectionContent(document, section_id); + return section_manager_->GetSectionContent<std::string_view>(document, + section_id); } libtextclassifier3::StatusOr<const SectionMetadata*> @@ -533,7 +535,7 @@ SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id, return section_manager_->GetSectionMetadata(schema_type_id, section_id); } -libtextclassifier3::StatusOr<std::vector<Section>> SchemaStore::ExtractSections( +libtextclassifier3::StatusOr<SectionGroup> SchemaStore::ExtractSections( const DocumentProto& document) const { ICING_RETURN_IF_ERROR(CheckSchemaSet()); return section_manager_->ExtractSections(document); diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h index 82f4ffa..d5a7c6f 100644 --- 
a/icing/schema/schema-store.h +++ b/icing/schema/schema-store.h @@ -221,15 +221,21 @@ class SchemaStore { libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata( SchemaTypeId schema_type_id, SectionId section_id) const; - // Extracts all sections from the given document, sections are sorted by - // section id in increasing order. Section ids start from 0. Sections with - // empty content won't be returned. + // Extracts all sections of different types from the given document and group + // them by type. + // - Each Section vector is sorted by section Id in ascending order. The + // sorted section Ids may not be continuous, since not all section Ids are + // present in the document. + // - Sections with empty content won't be returned. + // - For example, we may extract: + // string_sections: [2, 7, 10] + // integer_sections: [3, 5, 8] // // Returns: - // A list of sections on success + // A SectionGroup instance on success // FAILED_PRECONDITION if schema hasn't been set yet // NOT_FOUND if type config name of document not found - libtextclassifier3::StatusOr<std::vector<Section>> ExtractSections( + libtextclassifier3::StatusOr<SectionGroup> ExtractSections( const DocumentProto& document) const; // Syncs all the data changes to disk. 
diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc index aa05151..da04931 100644 --- a/icing/schema/schema-store_test.cc +++ b/icing/schema/schema-store_test.cc @@ -57,21 +57,7 @@ using ::testing::Pointee; using ::testing::Return; using ::testing::SizeIs; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = - PropertyConfigProto::Cardinality::REPEATED; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; - -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; - -constexpr PropertyConfigProto::DataType::Code TYPE_STRING = - PropertyConfigProto::DataType::STRING; -constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE = - PropertyConfigProto::DataType::DOUBLE; +constexpr int64_t kDefaultTimestamp = 12345678; class SchemaStoreTest : public ::testing::Test { protected: @@ -80,15 +66,23 @@ class SchemaStoreTest : public ::testing::Test { schema_store_dir_ = test_dir_ + "/schema_store"; filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()); - schema_ = - SchemaBuilder() - .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( - // Add an indexed property so we generate section metadata on it - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) - .Build(); + schema_ = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty( + // Add an indexed property so we generate + // section metadata on it + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("timestamp") + 
.SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); } void TearDown() override { @@ -123,7 +117,7 @@ TEST_F(SchemaStoreTest, SchemaStoreMoveConstructible) { .AddType(SchemaTypeConfigBuilder().SetType("TypeA").AddProperty( PropertyConfigBuilder() .SetName("prop1") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); @@ -141,7 +135,8 @@ TEST_F(SchemaStoreTest, SchemaStoreMoveConstructible) { IsOkAndHolds(Pointee(EqualsProto(schema)))); EXPECT_THAT(move_constructed_schema_store.ComputeChecksum(), IsOkAndHolds(Eq(expected_checksum))); - SectionMetadata expected_metadata(/*id_in=*/0, MATCH_EXACT, TOKENIZER_PLAIN, + SectionMetadata expected_metadata(/*id_in=*/0, TYPE_STRING, TOKENIZER_PLAIN, + TERM_MATCH_EXACT, NUMERIC_MATCH_UNKNOWN, "prop1"); EXPECT_THAT(move_constructed_schema_store.GetSectionMetadata("TypeA"), IsOkAndHolds(Pointee(ElementsAre(expected_metadata)))); @@ -154,7 +149,7 @@ TEST_F(SchemaStoreTest, SchemaStoreMoveAssignment) { .AddType(SchemaTypeConfigBuilder().SetType("TypeA").AddProperty( PropertyConfigBuilder() .SetName("prop1") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); @@ -172,7 +167,7 @@ TEST_F(SchemaStoreTest, SchemaStoreMoveAssignment) { .AddType(SchemaTypeConfigBuilder().SetType("TypeB").AddProperty( PropertyConfigBuilder() .SetName("prop2") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); @@ -187,7 +182,8 @@ TEST_F(SchemaStoreTest, SchemaStoreMoveAssignment) { IsOkAndHolds(Pointee(EqualsProto(schema1)))); EXPECT_THAT(move_assigned_schema_store->ComputeChecksum(), IsOkAndHolds(Eq(expected_checksum))); - SectionMetadata expected_metadata(/*id_in=*/0, MATCH_EXACT, TOKENIZER_PLAIN, + 
SectionMetadata expected_metadata(/*id_in=*/0, TYPE_STRING, TOKENIZER_PLAIN, + TERM_MATCH_EXACT, NUMERIC_MATCH_UNKNOWN, "prop1"); EXPECT_THAT(move_assigned_schema_store->GetSectionMetadata("TypeA"), IsOkAndHolds(Pointee(ElementsAre(expected_metadata)))); @@ -363,9 +359,12 @@ TEST_F(SchemaStoreTest, CreateWithPreviousSchemaOk) { TEST_F(SchemaStoreTest, MultipleCreateOk) { DocumentProto document; document.set_schema("email"); - auto properties = document.add_properties(); - properties->set_name("subject"); - properties->add_string_values("subject_content"); + auto subject_property = document.add_properties(); + subject_property->set_name("subject"); + subject_property->add_string_values("subject_content"); + auto timestamp_property = document.add_properties(); + timestamp_property->set_name("timestamp"); + timestamp_property->add_int64_values(kDefaultTimestamp); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, @@ -380,9 +379,12 @@ TEST_F(SchemaStoreTest, MultipleCreateOk) { // Verify that our in-memory structures are ok EXPECT_THAT(schema_store->GetSchemaTypeConfig("email"), IsOkAndHolds(Pointee(EqualsProto(schema_.types(0))))); - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections, + ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group, schema_store->ExtractSections(document)); - EXPECT_THAT(sections[0].content, ElementsAre("subject_content")); + EXPECT_THAT(section_group.string_sections[0].content, + ElementsAre("subject_content")); + EXPECT_THAT(section_group.integer_sections[0].content, + ElementsAre(kDefaultTimestamp)); // Verify that our persisted data is ok EXPECT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0)); @@ -396,8 +398,12 @@ TEST_F(SchemaStoreTest, MultipleCreateOk) { EXPECT_THAT(schema_store->GetSchemaTypeConfig("email"), IsOkAndHolds(Pointee(EqualsProto(schema_.types(0))))); - ICING_ASSERT_OK_AND_ASSIGN(sections, schema_store->ExtractSections(document)); - EXPECT_THAT(sections[0].content, 
ElementsAre("subject_content")); + ICING_ASSERT_OK_AND_ASSIGN(section_group, + schema_store->ExtractSections(document)); + EXPECT_THAT(section_group.string_sections[0].content, + ElementsAre("subject_content")); + EXPECT_THAT(section_group.integer_sections[0].content, + ElementsAre(kDefaultTimestamp)); // Verify that our persisted data is ok EXPECT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0)); @@ -635,7 +641,7 @@ TEST_F(SchemaStoreTest, IndexedPropertyChangeRequiresReindexingOk) { .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); @@ -661,7 +667,7 @@ TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) { .SetType("email") .AddProperty(PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .Build(); SchemaProto no_nested_index_schema = @@ -789,10 +795,11 @@ TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleNestedTypesOk) { SchemaTypeConfigBuilder contact_point_repeated_label = SchemaTypeConfigBuilder() .SetType("ContactPoint") - .AddProperty(PropertyConfigBuilder() - .SetName("label") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED)); + .AddProperty( + PropertyConfigBuilder() + .SetName("label") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)); SchemaProto old_schema = SchemaBuilder().AddType(contact_point_repeated_label).Build(); ICING_EXPECT_OK(schema_store->SetSchema(old_schema)); @@ -804,10 +811,11 @@ TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleNestedTypesOk) { SchemaTypeConfigBuilder contact_point_optional_label = SchemaTypeConfigBuilder() .SetType("ContactPoint") - 
.AddProperty(PropertyConfigBuilder() - .SetName("label") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)); + .AddProperty( + PropertyConfigBuilder() + .SetName("label") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)); SchemaTypeConfigBuilder person = SchemaTypeConfigBuilder().SetType("Person").AddProperty( PropertyConfigBuilder() @@ -857,10 +865,11 @@ TEST_F(SchemaStoreTest, SetSchemaWithIndexIncompatibleNestedTypesOk) { SchemaTypeConfigBuilder contact_point_prefix_label = SchemaTypeConfigBuilder() .SetType("ContactPoint") - .AddProperty(PropertyConfigBuilder() - .SetName("label") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED)); + .AddProperty( + PropertyConfigBuilder() + .SetName("label") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)); SchemaProto old_schema = SchemaBuilder().AddType(contact_point_prefix_label).Build(); ICING_EXPECT_OK(schema_store->SetSchema(old_schema)); @@ -872,7 +881,7 @@ TEST_F(SchemaStoreTest, SetSchemaWithIndexIncompatibleNestedTypesOk) { .SetType("ContactPoint") .AddProperty(PropertyConfigBuilder() .SetName("label") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REPEATED)); SchemaTypeConfigBuilder person = SchemaTypeConfigBuilder().SetType("Person").AddProperty( @@ -911,10 +920,11 @@ TEST_F(SchemaStoreTest, SetSchemaWithCompatibleNestedTypesOk) { SchemaTypeConfigBuilder contact_point_optional_label = SchemaTypeConfigBuilder() .SetType("ContactPoint") - .AddProperty(PropertyConfigBuilder() - .SetName("label") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)); + .AddProperty( + PropertyConfigBuilder() + .SetName("label") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)); 
SchemaProto old_schema = SchemaBuilder().AddType(contact_point_optional_label).Build(); ICING_EXPECT_OK(schema_store->SetSchema(old_schema)); @@ -924,10 +934,11 @@ TEST_F(SchemaStoreTest, SetSchemaWithCompatibleNestedTypesOk) { SchemaTypeConfigBuilder contact_point_repeated_label = SchemaTypeConfigBuilder() .SetType("ContactPoint") - .AddProperty(PropertyConfigBuilder() - .SetName("label") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED)); + .AddProperty( + PropertyConfigBuilder() + .SetName("label") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)); SchemaTypeConfigBuilder person = SchemaTypeConfigBuilder().SetType("Person").AddProperty( PropertyConfigBuilder() @@ -1106,7 +1117,7 @@ TEST_F(SchemaStoreTest, SchemaStoreStorageInfoProto) { PropertyConfigProto prop = PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL) .Build(); SchemaTypeConfigBuilder full_sections_type_builder = @@ -1201,8 +1212,12 @@ TEST_F(SchemaStoreTest, SetSchemaRegenerateDerivedFilesFailure) { SchemaTypeConfigBuilder() .SetType("Type") .AddProperty(PropertyConfigBuilder() - .SetName("prop1") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetName("intProp1") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("stringProp1") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .Build(); { @@ -1230,17 +1245,30 @@ TEST_F(SchemaStoreTest, SetSchemaRegenerateDerivedFilesFailure) { .Build(); EXPECT_THAT(schema_store->SetSchema(std::move(schema)), StatusIs(libtextclassifier3::StatusCode::INTERNAL)); - DocumentProto document = DocumentBuilder() - .SetSchema("Type") - .AddStringProperty("prop1", "foo bar baz") - .Build(); - SectionMetadata 
expected_metadata(/*id_in=*/0, MATCH_EXACT, TOKENIZER_PLAIN, - "prop1"); - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections, + DocumentProto document = + DocumentBuilder() + .SetSchema("Type") + .AddInt64Property("intProp1", 1, 2, 3) + .AddStringProperty("stringProp1", "foo bar baz") + .Build(); + SectionMetadata expected_int_prop1_metadata( + /*id_in=*/0, TYPE_INT64, TOKENIZER_NONE, TERM_MATCH_UNKNOWN, + NUMERIC_MATCH_RANGE, "intProp1"); + SectionMetadata expected_string_prop1_metadata( + /*id_in=*/1, TYPE_STRING, TOKENIZER_PLAIN, TERM_MATCH_EXACT, + NUMERIC_MATCH_UNKNOWN, "stringProp1"); + ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group, schema_store->ExtractSections(document)); - ASSERT_THAT(sections, SizeIs(1)); - EXPECT_THAT(sections.at(0).metadata, Eq(expected_metadata)); - EXPECT_THAT(sections.at(0).content, ElementsAre("foo bar baz")); + ASSERT_THAT(section_group.string_sections, SizeIs(1)); + EXPECT_THAT(section_group.string_sections.at(0).metadata, + Eq(expected_string_prop1_metadata)); + EXPECT_THAT(section_group.string_sections.at(0).content, + ElementsAre("foo bar baz")); + ASSERT_THAT(section_group.integer_sections, SizeIs(1)); + EXPECT_THAT(section_group.integer_sections.at(0).metadata, + Eq(expected_int_prop1_metadata)); + EXPECT_THAT(section_group.integer_sections.at(0).content, + ElementsAre(1, 2, 3)); } } diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc index f28a2f8..ffe1036 100644 --- a/icing/schema/schema-util_test.cc +++ b/icing/schema/schema-util_test.cc @@ -38,33 +38,6 @@ constexpr char kEmailType[] = "EmailMessage"; constexpr char kMessageType[] = "Text"; constexpr char kPersonType[] = "Person"; -constexpr PropertyConfigProto::DataType::Code TYPE_DOCUMENT = - PropertyConfigProto::DataType::DOCUMENT; -constexpr PropertyConfigProto::DataType::Code TYPE_STRING = - PropertyConfigProto::DataType::STRING; -constexpr PropertyConfigProto::DataType::Code TYPE_INT = - 
PropertyConfigProto::DataType::INT64; -constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE = - PropertyConfigProto::DataType::DOUBLE; - -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_UNKNOWN = - PropertyConfigProto::Cardinality::UNKNOWN; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = - PropertyConfigProto::Cardinality::REQUIRED; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = - PropertyConfigProto::Cardinality::REPEATED; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE = - StringIndexingConfig::TokenizerType::NONE; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; - -constexpr TermMatchType::Code MATCH_UNKNOWN = TermMatchType::UNKNOWN; -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; - TEST(SchemaUtilTest, DependencyGraphAlphabeticalOrder) { // Create a schema with the following dependencies: // C @@ -125,10 +98,11 @@ TEST(SchemaUtilTest, DependencyGraphAlphabeticalOrder) { SchemaTypeConfigProto type_f = SchemaTypeConfigBuilder() .SetType("F") - .AddProperty(PropertyConfigBuilder() - .SetName("text") - .SetCardinality(CARDINALITY_OPTIONAL) - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("text") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) .Build(); // Provide these in alphabetical (also parent-child) order: A, B, C, D, E, F @@ -211,10 +185,11 @@ TEST(SchemaUtilTest, DependencyGraphReverseAlphabeticalOrder) { SchemaTypeConfigProto type_f = SchemaTypeConfigBuilder() .SetType("F") - .AddProperty(PropertyConfigBuilder() - .SetName("text") - .SetCardinality(CARDINALITY_OPTIONAL) - 
.SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("text") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) .Build(); // Provide these in reverse alphabetical (also child-parent) order: @@ -298,10 +273,11 @@ TEST(SchemaUtilTest, DependencyGraphMixedOrder) { SchemaTypeConfigProto type_f = SchemaTypeConfigBuilder() .SetType("F") - .AddProperty(PropertyConfigBuilder() - .SetName("text") - .SetCardinality(CARDINALITY_OPTIONAL) - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)) + .AddProperty( + PropertyConfigBuilder() + .SetName("text") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)) .Build(); // Provide these in a random order: C, E, F, A, B, D @@ -760,7 +736,7 @@ TEST(SchemaUtilTest, NewSchemaMissingPropertyIsIncompatible) { .SetCardinality(CARDINALITY_REQUIRED)) .AddProperty(PropertyConfigBuilder() .SetName("OldOptional") - .SetDataType(TYPE_INT) + .SetDataType(TYPE_INT64) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); @@ -792,7 +768,7 @@ TEST(SchemaUtilTest, CompatibilityOfDifferentCardinalityOk) { .SetType(kEmailType) .AddProperty(PropertyConfigBuilder() .SetName("Property") - .SetDataType(TYPE_INT) + .SetDataType(TYPE_INT64) .SetCardinality(CARDINALITY_REPEATED))) .Build(); @@ -803,7 +779,7 @@ TEST(SchemaUtilTest, CompatibilityOfDifferentCardinalityOk) { .SetType(kEmailType) .AddProperty(PropertyConfigBuilder() .SetName("Property") - .SetDataType(TYPE_INT) + .SetDataType(TYPE_INT64) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); @@ -834,7 +810,7 @@ TEST(SchemaUtilTest, DifferentDataTypeIsIncompatible) { .SetType(kEmailType) .AddProperty(PropertyConfigBuilder() .SetName("Property") - .SetDataType(TYPE_INT) + .SetDataType(TYPE_INT64) .SetCardinality(CARDINALITY_REPEATED))) .Build(); @@ -865,13 +841,13 @@ TEST(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) { .SetType(kPersonType) 
.AddProperty(PropertyConfigBuilder() .SetName("prop") - .SetDataType(TYPE_INT) + .SetDataType(TYPE_INT64) .SetCardinality(CARDINALITY_REPEATED))) .AddType(SchemaTypeConfigBuilder() .SetType(kMessageType) .AddProperty(PropertyConfigBuilder() .SetName("prop") - .SetDataType(TYPE_INT) + .SetDataType(TYPE_INT64) .SetCardinality(CARDINALITY_REPEATED))) .AddType(SchemaTypeConfigBuilder() .SetType(kEmailType) @@ -890,13 +866,13 @@ TEST(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) { .SetType(kPersonType) .AddProperty(PropertyConfigBuilder() .SetName("prop") - .SetDataType(TYPE_INT) + .SetDataType(TYPE_INT64) .SetCardinality(CARDINALITY_REPEATED))) .AddType(SchemaTypeConfigBuilder() .SetType(kMessageType) .AddProperty(PropertyConfigBuilder() .SetName("prop") - .SetDataType(TYPE_INT) + .SetDataType(TYPE_INT64) .SetCardinality(CARDINALITY_REPEATED))) .AddType(SchemaTypeConfigBuilder() .SetType(kEmailType) @@ -926,11 +902,11 @@ TEST(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType(kPersonType) - .AddProperty( - PropertyConfigBuilder() - .SetName("Property") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); // Configure new schema @@ -938,11 +914,11 @@ TEST(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType(kPersonType) - .AddProperty( - PropertyConfigBuilder() - .SetName("Property") - .SetDataTypeString(MATCH_UNKNOWN, TOKENIZER_NONE) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(TERM_MATCH_UNKNOWN, + TOKENIZER_NONE) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); SchemaUtil::SchemaDelta schema_delta; @@ -968,11 +944,11 @@ 
TEST(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType(kPersonType) - .AddProperty( - PropertyConfigBuilder() - .SetName("Property") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); // Configure new schema @@ -980,16 +956,16 @@ TEST(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType(kPersonType) - .AddProperty( - PropertyConfigBuilder() - .SetName("Property") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("NewIndexedProperty") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("NewIndexedProperty") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); SchemaUtil::SchemaDelta schema_delta; @@ -1007,29 +983,29 @@ TEST(SchemaUtilTest, AddingTypeIsCompatible) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType(kPersonType) - .AddProperty( - PropertyConfigBuilder() - .SetName("Property") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); SchemaProto new_schema = SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType(kPersonType) - .AddProperty( - PropertyConfigBuilder() - 
.SetName("Property") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(SchemaTypeConfigBuilder() .SetType(kEmailType) - .AddProperty( - PropertyConfigBuilder() - .SetName("Property") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); SchemaUtil::SchemaDelta schema_delta; @@ -1048,29 +1024,29 @@ TEST(SchemaUtilTest, DeletingTypeIsNoted) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType(kPersonType) - .AddProperty( - PropertyConfigBuilder() - .SetName("Property") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(SchemaTypeConfigBuilder() .SetType(kEmailType) - .AddProperty( - PropertyConfigBuilder() - .SetName("Property") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); SchemaProto new_schema = SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType(kEmailType) - .AddProperty( - PropertyConfigBuilder() - .SetName("Property") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); SchemaUtil::SchemaDelta 
schema_delta; @@ -1090,11 +1066,11 @@ TEST(SchemaUtilTest, DeletingPropertyAndChangingProperty) { .SetName("Property1") .SetDataType(TYPE_STRING) .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("Property2") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REQUIRED))) + .AddProperty(PropertyConfigBuilder() + .SetName("Property2") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) .Build(); // Remove Property2 and make Property1 indexed now. Removing Property2 should @@ -1103,11 +1079,11 @@ TEST(SchemaUtilTest, DeletingPropertyAndChangingProperty) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType(kEmailType) - .AddProperty( - PropertyConfigBuilder() - .SetName("Property1") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("Property1") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); SchemaUtil::SchemaDelta schema_delta; @@ -1127,7 +1103,7 @@ TEST(SchemaUtilTest, IndexNestedDocumentsIndexIncompatible) { .SetType(kEmailType) .AddProperty(PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .Build(); SchemaProto no_nested_index_schema = @@ -1180,7 +1156,7 @@ TEST(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTermMatchType) { .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( PropertyConfigBuilder() .SetName("Foo") - .SetDataTypeString(MATCH_UNKNOWN, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REQUIRED))) .Build(); @@ -1193,7 +1169,7 @@ TEST(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTermMatchType) { 
.AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( PropertyConfigBuilder() .SetName("Foo") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REQUIRED))) .Build(); EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); @@ -1205,7 +1181,7 @@ TEST(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTokenizer) { .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( PropertyConfigBuilder() .SetName("Foo") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_NONE) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_NONE) .SetCardinality(CARDINALITY_REQUIRED))) .Build(); @@ -1218,7 +1194,7 @@ TEST(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTokenizer) { .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( PropertyConfigBuilder() .SetName("Foo") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REQUIRED))) .Build(); EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); @@ -1278,11 +1254,11 @@ TEST(SchemaUtilTest, InvalidSelfReferenceEvenWithOtherProperties) { "OwnSchema", /*index_nested_properties=*/true) .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("SomeString") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("SomeString") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); EXPECT_THAT(SchemaUtil::Validate(schema), diff --git a/icing/schema/section-manager.cc b/icing/schema/section-manager.cc index a0893e6..2ca534e 100644 --- a/icing/schema/section-manager.cc +++ b/icing/schema/section-manager.cc @@ -57,6 +57,32 @@ std::string ConcatenatePath(const std::string& path, return absl_ports::StrCat(path, kPropertySeparator, next_property_name); } +// Helper function to append a 
new section metadata +libtextclassifier3::Status AppendNewSectionMetadata( + std::vector<SectionMetadata>* metadata_list, + std::string&& concatenated_path, + PropertyConfigProto::DataType::Code data_type, + StringIndexingConfig::TokenizerType::Code string_tokenizer_type, + TermMatchType::Code term_match_type, + IntegerIndexingConfig::NumericMatchType::Code numeric_match_type) { + // Validates next section id, makes sure that section id is the same as the + // list index so that we could find any section metadata by id in O(1) later. + SectionId new_section_id = static_cast<SectionId>(metadata_list->size()); + if (!IsSectionIdValid(new_section_id)) { + // Max number of sections reached + return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "Too many properties to be indexed, max number of properties " + "allowed: %d", + kMaxSectionId - kMinSectionId + 1)); + } + + // Creates section metadata + metadata_list->push_back(SectionMetadata( + new_section_id, data_type, string_tokenizer_type, term_match_type, + numeric_match_type, std::move(concatenated_path))); + return libtextclassifier3::Status::OK; +} + libtextclassifier3::Status AssignSections( const SchemaTypeConfigProto& current_type_config, const std::string& current_section_path, @@ -70,58 +96,63 @@ libtextclassifier3::Status AssignSections( return p1->property_name() < p2->property_name(); }); for (const auto& property_config : sorted_properties) { - if (property_config.data_type() == - PropertyConfigProto::DataType::DOCUMENT) { - auto nested_type_config_iter = - type_config_map.find(property_config.schema_type()); - if (nested_type_config_iter == type_config_map.end()) { - // This should never happen because our schema should already be - // validated by this point. 
- return absl_ports::NotFoundError(absl_ports::StrCat( - "Type config not found: ", property_config.schema_type())); - } + // Creates section metadata according to data type + switch (property_config.data_type()) { + case PropertyConfigProto::DataType::DOCUMENT: { + auto nested_type_config_iter = + type_config_map.find(property_config.schema_type()); + if (nested_type_config_iter == type_config_map.end()) { + // This should never happen because our schema should already be + // validated by this point. + return absl_ports::NotFoundError(absl_ports::StrCat( + "Type config not found: ", property_config.schema_type())); + } - if (property_config.document_indexing_config() - .index_nested_properties()) { - // Assign any indexed sections recursively - const SchemaTypeConfigProto& nested_type_config = - nested_type_config_iter->second; - ICING_RETURN_IF_ERROR( - AssignSections(nested_type_config, - ConcatenatePath(current_section_path, - property_config.property_name()), - type_config_map, metadata_list)); + if (property_config.document_indexing_config() + .index_nested_properties()) { + // Assign any indexed sections recursively + const SchemaTypeConfigProto& nested_type_config = + nested_type_config_iter->second; + ICING_RETURN_IF_ERROR( + AssignSections(nested_type_config, + ConcatenatePath(current_section_path, + property_config.property_name()), + type_config_map, metadata_list)); + } + break; } - } - - // Only index strings currently. 
- if (property_config.has_data_type() != - PropertyConfigProto::DataType::STRING || - property_config.string_indexing_config().term_match_type() == + case PropertyConfigProto::DataType::STRING: { + if (property_config.string_indexing_config().term_match_type() != TermMatchType::UNKNOWN) { - // No need to create section for current property - continue; - } - - // Creates section metadata according to data type - // Validates next section id, makes sure that section id is the same as - // the list index so that we could find any section metadata by id in O(1) - // later. - auto new_section_id = static_cast<SectionId>(metadata_list->size()); - if (!IsSectionIdValid(new_section_id)) { - // Max number of sections reached - return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( - "Too many properties to be indexed, max number of properties " - "allowed: %d", - kMaxSectionId - kMinSectionId + 1)); + ICING_RETURN_IF_ERROR(AppendNewSectionMetadata( + metadata_list, + ConcatenatePath(current_section_path, + property_config.property_name()), + PropertyConfigProto::DataType::STRING, + property_config.string_indexing_config().tokenizer_type(), + property_config.string_indexing_config().term_match_type(), + IntegerIndexingConfig::NumericMatchType::UNKNOWN)); + } + break; + } + case PropertyConfigProto::DataType::INT64: { + if (property_config.integer_indexing_config().numeric_match_type() != + IntegerIndexingConfig::NumericMatchType::UNKNOWN) { + ICING_RETURN_IF_ERROR(AppendNewSectionMetadata( + metadata_list, + ConcatenatePath(current_section_path, + property_config.property_name()), + PropertyConfigProto::DataType::INT64, + StringIndexingConfig::TokenizerType::NONE, TermMatchType::UNKNOWN, + property_config.integer_indexing_config().numeric_match_type())); + } + break; + } + default: { + // Skip other data types. 
+ break; + } } - - // Creates section metadata from property config - metadata_list->emplace_back( - new_section_id, - property_config.string_indexing_config().term_match_type(), - property_config.string_indexing_config().tokenizer_type(), - ConcatenatePath(current_section_path, property_config.property_name())); } return libtextclassifier3::Status::OK; } @@ -153,16 +184,40 @@ BuildSectionMetadataCache(const SchemaUtil::TypeConfigMap& type_config_map, return section_metadata_cache; } -// Helper function to get string content from a property. Repeated values are -// joined into one string. We only care about the STRING data type. -std::vector<std::string_view> GetStringPropertyContent( +// Helper function to get content from a property according to the template type +// T. We only care about STRING and INT64, which are the only 2 indexable data +// types. +template <typename T> +libtextclassifier3::StatusOr<std::vector<T>> GetPropertyContent( const PropertyProto& property) { - std::vector<std::string_view> values; - if (!property.string_values().empty()) { - std::copy(property.string_values().begin(), property.string_values().end(), - std::back_inserter(values)); + return absl_ports::UnimplementedError( + "Unimplemented template type for GetPropertyContent"); +} + +template <> +libtextclassifier3::StatusOr<std::vector<std::string_view>> +GetPropertyContent<std::string_view>(const PropertyProto& property) { + return std::vector<std::string_view>(property.string_values().begin(), + property.string_values().end()); +} + +template <> +libtextclassifier3::StatusOr<std::vector<int64_t>> GetPropertyContent<int64_t>( + const PropertyProto& property) { + return std::vector<int64_t>(property.int64_values().begin(), + property.int64_values().end()); +} + +template <typename T> +void AppendSection( + SectionMetadata section_metadata, + libtextclassifier3::StatusOr<std::vector<T>>&& section_content_or, + std::vector<Section<T>>& sections_out) { + if (section_content_or.ok()) { + 
// Adds to result vector if section is found in document + sections_out.emplace_back(std::move(section_metadata), + std::move(section_content_or).ValueOrDie()); } - return values; } } // namespace @@ -185,9 +240,9 @@ SectionManager::Create(const SchemaUtil::TypeConfigMap& type_config_map, schema_type_mapper, std::move(section_metadata_cache))); } -libtextclassifier3::StatusOr<std::vector<std::string_view>> -SectionManager::GetStringSectionContent(const DocumentProto& document, - std::string_view section_path) const { +template <typename T> +libtextclassifier3::StatusOr<std::vector<T>> SectionManager::GetSectionContent( + const DocumentProto& document, std::string_view section_path) const { // Finds the first property name in section_path size_t separator_position = section_path.find(kPropertySeparator); std::string_view current_property_name = @@ -212,11 +267,11 @@ SectionManager::GetStringSectionContent(const DocumentProto& document, if (separator_position == std::string::npos) { // Current property name is the last one in section path - std::vector<std::string_view> content = - GetStringPropertyContent(*property_iterator); + ICING_ASSIGN_OR_RETURN(std::vector<T> content, + GetPropertyContent<T>(*property_iterator)); if (content.empty()) { // The content of property is explicitly set to empty, we'll treat it as - // NOT_FOUND because the index doesn't care about empty strings. + // NOT_FOUND because the index doesn't care about empty contents. 
return absl_ports::NotFoundError(absl_ports::StrCat( "Section path '", section_path, "' content was empty")); } @@ -226,13 +281,11 @@ SectionManager::GetStringSectionContent(const DocumentProto& document, // Gets section content recursively std::string_view sub_section_path = section_path.substr(separator_position + 1); - std::vector<std::string_view> nested_document_content; + std::vector<T> nested_document_content; for (const auto& nested_document : property_iterator->document_values()) { - auto content_or = - GetStringSectionContent(nested_document, sub_section_path); + auto content_or = GetSectionContent<T>(nested_document, sub_section_path); if (content_or.ok()) { - std::vector<std::string_view> content = - std::move(content_or).ValueOrDie(); + std::vector<T> content = std::move(content_or).ValueOrDie(); std::move(content.begin(), content.end(), std::back_inserter(nested_document_content)); } @@ -245,9 +298,17 @@ SectionManager::GetStringSectionContent(const DocumentProto& document, return nested_document_content; } -libtextclassifier3::StatusOr<std::vector<std::string_view>> -SectionManager::GetStringSectionContent(const DocumentProto& document, - SectionId section_id) const { +// Explicit template instantiation +template libtextclassifier3::StatusOr<std::vector<std::string_view>> +SectionManager::GetSectionContent<std::string_view>( + const DocumentProto& document, std::string_view section_path) const; +template libtextclassifier3::StatusOr<std::vector<int64_t>> +SectionManager::GetSectionContent<int64_t>(const DocumentProto& document, + std::string_view section_path) const; + +template <typename T> +libtextclassifier3::StatusOr<std::vector<T>> SectionManager::GetSectionContent( + const DocumentProto& document, SectionId section_id) const { if (!IsSectionIdValid(section_id)) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Section id %d is greater than the max value %d", section_id, @@ -262,9 +323,17 @@ 
SectionManager::GetStringSectionContent(const DocumentProto& document, } // The index of metadata list is the same as the section id, so we can use // section id as the index. - return GetStringSectionContent(document, metadata_list->at(section_id).path); + return GetSectionContent<T>(document, metadata_list->at(section_id).path); } +// Explicit template instantiation +template libtextclassifier3::StatusOr<std::vector<std::string_view>> +SectionManager::GetSectionContent<std::string_view>( + const DocumentProto& document, SectionId section_id) const; +template libtextclassifier3::StatusOr<std::vector<int64_t>> +SectionManager::GetSectionContent<int64_t>(const DocumentProto& document, + SectionId section_id) const; + libtextclassifier3::StatusOr<const SectionMetadata*> SectionManager::GetSectionMetadata(SchemaTypeId schema_type_id, SectionId section_id) const { @@ -286,21 +355,34 @@ SectionManager::GetSectionMetadata(SchemaTypeId schema_type_id, return §ion_metadatas[section_id]; } -libtextclassifier3::StatusOr<std::vector<Section>> -SectionManager::ExtractSections(const DocumentProto& document) const { +libtextclassifier3::StatusOr<SectionGroup> SectionManager::ExtractSections( + const DocumentProto& document) const { ICING_ASSIGN_OR_RETURN(const std::vector<SectionMetadata>* metadata_list, GetMetadataList(document.schema())); - std::vector<Section> sections; - for (const auto& section_metadata : *metadata_list) { - auto section_content_or = - GetStringSectionContent(document, section_metadata.path); - // Adds to result vector if section is found in document - if (section_content_or.ok()) { - sections.emplace_back(SectionMetadata(section_metadata), - std::move(section_content_or).ValueOrDie()); + SectionGroup section_group; + for (const SectionMetadata& section_metadata : *metadata_list) { + switch (section_metadata.data_type) { + case PropertyConfigProto::DataType::STRING: { + AppendSection(section_metadata, + GetSectionContent<std::string_view>( + document, 
section_metadata.path), + section_group.string_sections); + break; + } + case PropertyConfigProto::DataType::INT64: { + AppendSection( + section_metadata, + GetSectionContent<int64_t>(document, section_metadata.path), + section_group.integer_sections); + break; + } + default: { + // Skip other data types. + break; + } } } - return sections; + return section_group; } libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*> diff --git a/icing/schema/section-manager.h b/icing/schema/section-manager.h index 51eb133..78a5acb 100644 --- a/icing/schema/section-manager.h +++ b/icing/schema/section-manager.h @@ -55,27 +55,36 @@ class SectionManager { const SchemaUtil::TypeConfigMap& type_config_map, const KeyMapper<SchemaTypeId>* schema_type_mapper); - // Finds content of a section by section path (e.g. property1.property2) + // Finds contents of a section by section path (e.g. property1.property2) + // according to the template type T. + // + // Types of supported T: + // - std::string, std::string_view: return property.string_values() + // - int64_t : return property.int64_values() // // Returns: - // A string of content on success + // A vector of contents with the specified type on success // NOT_FOUND if: // 1. Property is optional and not found in the document // 2. section_path is invalid - // 3. Content is empty - libtextclassifier3::StatusOr<std::vector<std::string_view>> - GetStringSectionContent(const DocumentProto& document, - std::string_view section_path) const; + // 3. Content is empty (could be caused by incorrect type T) + template <typename T> + libtextclassifier3::StatusOr<std::vector<T>> GetSectionContent( + const DocumentProto& document, std::string_view section_path) const; - // Finds content of a section by id + // Finds contents of a section by id according to the template type T. 
+ // + // Types of supported T: + // - std::string, std::string_view: return property.string_values() + // - int64_t : return property.int64_values() // // Returns: - // A string of content on success + // A vector of contents on success // INVALID_ARGUMENT if section id is invalid // NOT_FOUND if type config name of document not found - libtextclassifier3::StatusOr<std::vector<std::string_view>> - GetStringSectionContent(const DocumentProto& document, - SectionId section_id) const; + template <typename T> + libtextclassifier3::StatusOr<std::vector<T>> GetSectionContent( + const DocumentProto& document, SectionId section_id) const; // Returns the SectionMetadata associated with the SectionId that's in the // SchemaTypeId. @@ -86,14 +95,16 @@ class SectionManager { libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata( SchemaTypeId schema_type_id, SectionId section_id) const; - // Extracts all sections from the given document, sections are sorted by - // section id in increasing order. Section ids start from 0. Sections with - // empty content won't be returned. + // Extracts all sections of different types from the given document and group + // them by type. + // - Sections are sorted by section id in ascending order. + // - Section ids start from 0. + // - Sections with empty content won't be returned. 
// // Returns: - // A list of sections on success + // A SectionGroup instance on success // NOT_FOUND if type config name of document not found - libtextclassifier3::StatusOr<std::vector<Section>> ExtractSections( + libtextclassifier3::StatusOr<SectionGroup> ExtractSections( const DocumentProto& document) const; // Returns: diff --git a/icing/schema/section-manager_test.cc b/icing/schema/section-manager_test.cc index 39c02d1..4e8fbbd 100644 --- a/icing/schema/section-manager_test.cc +++ b/icing/schema/section-manager_test.cc @@ -23,6 +23,7 @@ #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/term.pb.h" +#include "icing/schema-builder.h" #include "icing/schema/schema-util.h" #include "icing/store/dynamic-trie-key-mapper.h" #include "icing/store/key-mapper.h" @@ -32,6 +33,8 @@ namespace icing { namespace lib { +namespace { + using ::testing::ElementsAre; using ::testing::Eq; using ::testing::HasSubstr; @@ -44,11 +47,16 @@ constexpr char kPropertySubject[] = "subject"; constexpr char kPropertyText[] = "text"; constexpr char kPropertyAttachment[] = "attachment"; constexpr char kPropertyRecipients[] = "recipients"; +constexpr char kPropertyRecipientIds[] = "recipientIds"; +constexpr char kPropertyTimestamp[] = "timestamp"; +constexpr char kPropertyNonIndexableInteger[] = "non_indexable_integer"; // type and property names of Conversation constexpr char kTypeConversation[] = "Conversation"; constexpr char kPropertyName[] = "name"; constexpr char kPropertyEmails[] = "emails"; +constexpr int64_t kDefaultTimestamp = 1663274901; + class SectionManagerTest : public ::testing::Test { protected: SectionManagerTest() : test_dir_(GetTestTempDir() + "/icing") { @@ -67,6 +75,9 @@ class SectionManagerTest : public ::testing::Test { .AddBytesProperty(kPropertyAttachment, "attachment bytes") .AddStringProperty(kPropertyRecipients, "recipient1", "recipient2", "recipient3") + .AddInt64Property(kPropertyRecipientIds, 1, 2, 3) + 
.AddInt64Property(kPropertyTimestamp, kDefaultTimestamp) + .AddInt64Property(kPropertyNonIndexableInteger, 100) .Build(); conversation_document_ = @@ -91,39 +102,41 @@ class SectionManagerTest : public ::testing::Test { } static SchemaTypeConfigProto CreateEmailTypeConfig() { - SchemaTypeConfigProto type; - type.set_schema_type(kTypeEmail); - - auto subject = type.add_properties(); - subject->set_property_name(kPropertySubject); - subject->set_data_type(PropertyConfigProto::DataType::STRING); - subject->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - subject->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - subject->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - - auto text = type.add_properties(); - text->set_property_name(kPropertyText); - text->set_data_type(PropertyConfigProto::DataType::STRING); - text->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - text->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::UNKNOWN); - - auto attachment = type.add_properties(); - attachment->set_property_name(kPropertyAttachment); - attachment->set_data_type(PropertyConfigProto::DataType::BYTES); - attachment->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - auto recipients = type.add_properties(); - recipients->set_property_name(kPropertyRecipients); - recipients->set_data_type(PropertyConfigProto::DataType::STRING); - recipients->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); - recipients->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - recipients->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - + SchemaTypeConfigProto type = + SchemaTypeConfigBuilder() + .SetType(kTypeEmail) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPropertySubject) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + 
.SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPropertyText) + .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyAttachment) + .SetDataType(TYPE_BYTES) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPropertyRecipients) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyRecipientIds) + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyTimestamp) + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyNonIndexableInteger) + .SetDataType(TYPE_INT64) + .SetCardinality(CARDINALITY_REQUIRED)) + .Build(); return type; } @@ -133,15 +146,15 @@ class SectionManagerTest : public ::testing::Test { auto name = type.add_properties(); name->set_property_name(kPropertyName); - name->set_data_type(PropertyConfigProto::DataType::STRING); - name->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + name->set_data_type(TYPE_STRING); + name->set_cardinality(CARDINALITY_OPTIONAL); name->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); + TERM_MATCH_EXACT); auto emails = type.add_properties(); emails->set_property_name(kPropertyEmails); - emails->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - emails->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); + emails->set_data_type(TYPE_DOCUMENT); + emails->set_cardinality(CARDINALITY_REPEATED); emails->set_schema_type(kTypeEmail); emails->mutable_document_indexing_config()->set_index_nested_properties( true); @@ -172,10 +185,10 @@ TEST_F(SectionManagerTest, CreationWithTooManyPropertiesShouldFail) { for (int i 
= 0; i < max_num_sections_allowed + 1; i++) { auto property = type_config.add_properties(); property->set_property_name("property" + std::to_string(i)); - property->set_data_type(PropertyConfigProto::DataType::STRING); - property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + property->set_data_type(TYPE_STRING); + property->set_cardinality(CARDINALITY_REQUIRED); property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); + TERM_MATCH_EXACT); } SchemaUtil::TypeConfigMap type_config_map; @@ -187,109 +200,186 @@ TEST_F(SectionManagerTest, CreationWithTooManyPropertiesShouldFail) { HasSubstr("Too many properties"))); } -TEST_F(SectionManagerTest, GetStringSectionContent) { +TEST_F(SectionManagerTest, GetSectionContent) { ICING_ASSERT_OK_AND_ASSIGN( auto section_manager, SectionManager::Create(type_config_map_, schema_type_mapper_.get())); // Test simple section paths - EXPECT_THAT( - section_manager->GetStringSectionContent(email_document_, - /*section_path*/ "subject"), - IsOkAndHolds(ElementsAre("the subject"))); - EXPECT_THAT(section_manager->GetStringSectionContent(email_document_, - /*section_path*/ "text"), + EXPECT_THAT(section_manager->GetSectionContent<std::string_view>( + email_document_, + /*section_path=*/"subject"), + IsOkAndHolds(ElementsAre("the subject"))); + EXPECT_THAT(section_manager->GetSectionContent<std::string_view>( + email_document_, + /*section_path=*/"text"), IsOkAndHolds(ElementsAre("the text"))); + EXPECT_THAT( + section_manager->GetSectionContent<int64_t>(email_document_, + /*section_path=*/"timestamp"), + IsOkAndHolds(ElementsAre(kDefaultTimestamp))); +} - // Test repeated values, they are joined into one string +TEST_F(SectionManagerTest, GetSectionContentRepeatedValues) { ICING_ASSERT_OK_AND_ASSIGN( - auto content, - section_manager->GetStringSectionContent(email_document_, - /*section_path*/ "recipients")); - EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", 
"recipient3")); + auto section_manager, + SectionManager::Create(type_config_map_, schema_type_mapper_.get())); + + // Test repeated values + EXPECT_THAT( + section_manager->GetSectionContent<std::string_view>( + email_document_, + /*section_path=*/"recipients"), + IsOkAndHolds(ElementsAre("recipient1", "recipient2", "recipient3"))); + EXPECT_THAT(section_manager->GetSectionContent<int64_t>( + email_document_, + /*section_path=*/"recipientIds"), + IsOkAndHolds(ElementsAre(1, 2, 3))); +} + +TEST_F(SectionManagerTest, GetSectionContentConcatenatedSectionPaths) { + ICING_ASSERT_OK_AND_ASSIGN( + auto section_manager, + SectionManager::Create(type_config_map_, schema_type_mapper_.get())); // Test concatenated section paths: "property1.property2" - ICING_ASSERT_OK_AND_ASSIGN(content, section_manager->GetStringSectionContent( - conversation_document_, - /*section_path*/ "emails.subject")); - EXPECT_THAT(content, ElementsAre("the subject", "the subject")); - - ICING_ASSERT_OK_AND_ASSIGN(content, section_manager->GetStringSectionContent( - conversation_document_, - /*section_path*/ "emails.text")); - EXPECT_THAT(content, ElementsAre("the text", "the text")); - - ICING_ASSERT_OK_AND_ASSIGN(content, - section_manager->GetStringSectionContent( - conversation_document_, - /*section_path*/ "emails.recipients")); - EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3", - "recipient1", "recipient2", "recipient3")); + EXPECT_THAT(section_manager->GetSectionContent<std::string_view>( + conversation_document_, + /*section_path=*/"emails.subject"), + IsOkAndHolds(ElementsAre("the subject", "the subject"))); + EXPECT_THAT(section_manager->GetSectionContent<std::string_view>( + conversation_document_, + /*section_path=*/"emails.text"), + IsOkAndHolds(ElementsAre("the text", "the text"))); + EXPECT_THAT(section_manager->GetSectionContent<int64_t>( + conversation_document_, + /*section_path=*/"emails.timestamp"), + IsOkAndHolds(ElementsAre(kDefaultTimestamp, 
kDefaultTimestamp))); + EXPECT_THAT( + section_manager->GetSectionContent<std::string_view>( + conversation_document_, + /*section_path=*/"emails.recipients"), + IsOkAndHolds(ElementsAre("recipient1", "recipient2", "recipient3", + "recipient1", "recipient2", "recipient3"))); + EXPECT_THAT(section_manager->GetSectionContent<int64_t>( + conversation_document_, + /*section_path=*/"emails.recipientIds"), + IsOkAndHolds(ElementsAre(1, 2, 3, 1, 2, 3))); +} + +TEST_F(SectionManagerTest, GetSectionContentNonExistingPaths) { + ICING_ASSERT_OK_AND_ASSIGN( + auto section_manager, + SectionManager::Create(type_config_map_, schema_type_mapper_.get())); // Test non-existing paths - EXPECT_THAT(section_manager->GetStringSectionContent(email_document_, - /*section_path*/ "name"), + EXPECT_THAT(section_manager->GetSectionContent<std::string_view>( + email_document_, + /*section_path=*/"name"), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - EXPECT_THAT( - section_manager->GetStringSectionContent(email_document_, - /*section_path*/ "invalid"), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - EXPECT_THAT(section_manager->GetStringSectionContent( + EXPECT_THAT(section_manager->GetSectionContent<std::string_view>( + email_document_, + /*section_path=*/"invalid"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(section_manager->GetSectionContent<std::string_view>( conversation_document_, - /*section_path*/ "emails.invalid"), + /*section_path=*/"emails.invalid"), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(SectionManagerTest, GetSectionContentNonIndexableTypes) { + ICING_ASSERT_OK_AND_ASSIGN( + auto section_manager, + SectionManager::Create(type_config_map_, schema_type_mapper_.get())); // Test other data types // BYTES type can't be indexed, so content won't be returned - EXPECT_THAT( - section_manager->GetStringSectionContent(email_document_, - /*section_path*/ "attachment"), - 
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(section_manager->GetSectionContent<std::string_view>( + email_document_, + /*section_path=*/"attachment"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} - // The following tests are similar to the ones above but use section ids - // instead of section paths +TEST_F(SectionManagerTest, GetSectionContentMismatchedType) { + ICING_ASSERT_OK_AND_ASSIGN( + auto section_manager, + SectionManager::Create(type_config_map_, schema_type_mapper_.get())); - // EmailMessage (section id -> section path): - SectionId recipients_section_id = 0; - SectionId subject_section_id = 1; - SectionId invalid_email_section_id = 2; + // Use the wrong template type to get the indexable content. GetSectionContent + // should get empty content from the corresponding proto (repeated) field and + // return NOT_FOUND error. + EXPECT_THAT(section_manager->GetSectionContent<std::string_view>( + email_document_, + /*section_path=*/"recipientIds"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(section_manager->GetSectionContent<int64_t>( + email_document_, + /*section_path=*/"recipients"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +// The following tests are similar to the ones above but use section ids +// instead of section paths +TEST_F(SectionManagerTest, GetSectionContentBySectionId) { ICING_ASSERT_OK_AND_ASSIGN( - content, section_manager->GetStringSectionContent(email_document_, - recipients_section_id)); - EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3")); + auto section_manager, + SectionManager::Create(type_config_map_, schema_type_mapper_.get())); - EXPECT_THAT(section_manager->GetStringSectionContent(email_document_, - subject_section_id), + // EmailMessage (section id -> section path): + SectionId recipient_ids_section_id = 0; + SectionId recipients_section_id = 1; + SectionId subject_section_id = 2; + SectionId timestamp_section_id = 3; + 
SectionId invalid_email_section_id = 4; + EXPECT_THAT(section_manager->GetSectionContent<int64_t>( + email_document_, recipient_ids_section_id), + IsOkAndHolds(ElementsAre(1, 2, 3))); + EXPECT_THAT( + section_manager->GetSectionContent<std::string_view>( + email_document_, recipients_section_id), + IsOkAndHolds(ElementsAre("recipient1", "recipient2", "recipient3"))); + EXPECT_THAT(section_manager->GetSectionContent<std::string_view>( + email_document_, subject_section_id), IsOkAndHolds(ElementsAre("the subject"))); + EXPECT_THAT(section_manager->GetSectionContent<int64_t>(email_document_, + timestamp_section_id), + IsOkAndHolds(ElementsAre(kDefaultTimestamp))); - EXPECT_THAT(section_manager->GetStringSectionContent( + EXPECT_THAT(section_manager->GetSectionContent<std::string_view>( email_document_, invalid_email_section_id), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); // Conversation (section id -> section path): - // 0 -> emails.recipients - // 1 -> emails.subject - // 2 -> name - SectionId emails_recipients_section_id = 0; - SectionId emails_subject_section_id = 1; - SectionId name_section_id = 2; - SectionId invalid_conversation_section_id = 3; - ICING_ASSERT_OK_AND_ASSIGN( - content, section_manager->GetStringSectionContent( - conversation_document_, emails_recipients_section_id)); - EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3", - "recipient1", "recipient2", "recipient3")); - - ICING_ASSERT_OK_AND_ASSIGN( - content, section_manager->GetStringSectionContent( - conversation_document_, emails_subject_section_id)); - EXPECT_THAT(content, ElementsAre("the subject", "the subject")); - - EXPECT_THAT(section_manager->GetStringSectionContent(conversation_document_, - name_section_id), + // 0 -> emails.recipientIds + // 1 -> emails.recipients + // 2 -> emails.subject + // 3 -> emails.timestamp + // 4 -> name + SectionId emails_recipient_ids_section_id = 0; + SectionId emails_recipients_section_id = 1; + SectionId 
emails_subject_section_id = 2; + SectionId emails_timestamp_section_id = 3; + SectionId name_section_id = 4; + SectionId invalid_conversation_section_id = 5; + EXPECT_THAT(section_manager->GetSectionContent<int64_t>( + conversation_document_, emails_recipient_ids_section_id), + IsOkAndHolds(ElementsAre(1, 2, 3, 1, 2, 3))); + EXPECT_THAT( + section_manager->GetSectionContent<std::string_view>( + conversation_document_, emails_recipients_section_id), + IsOkAndHolds(ElementsAre("recipient1", "recipient2", "recipient3", + "recipient1", "recipient2", "recipient3"))); + EXPECT_THAT(section_manager->GetSectionContent<std::string_view>( + conversation_document_, emails_subject_section_id), + IsOkAndHolds(ElementsAre("the subject", "the subject"))); + EXPECT_THAT(section_manager->GetSectionContent<int64_t>( + conversation_document_, emails_timestamp_section_id), + IsOkAndHolds(ElementsAre(kDefaultTimestamp, kDefaultTimestamp))); + + EXPECT_THAT(section_manager->GetSectionContent<std::string_view>( + conversation_document_, name_section_id), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - - EXPECT_THAT(section_manager->GetStringSectionContent( + EXPECT_THAT(section_manager->GetSectionContent<std::string_view>( conversation_document_, invalid_conversation_section_id), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } @@ -300,35 +390,91 @@ TEST_F(SectionManagerTest, ExtractSections) { SectionManager::Create(type_config_map_, schema_type_mapper_.get())); // Extracts all sections from 'EmailMessage' document - ICING_ASSERT_OK_AND_ASSIGN(auto sections, + ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group, section_manager->ExtractSections(email_document_)); - EXPECT_THAT(sections.size(), Eq(2)); - EXPECT_THAT(sections[0].metadata.id, Eq(0)); - EXPECT_THAT(sections[0].metadata.path, Eq("recipients")); - EXPECT_THAT(sections[0].content, + // String sections + EXPECT_THAT(section_group.string_sections, SizeIs(2)); + + 
EXPECT_THAT(section_group.string_sections[0].metadata, + Eq(SectionMetadata( + /*id_in=*/1, TYPE_STRING, TOKENIZER_PLAIN, TERM_MATCH_EXACT, + NUMERIC_MATCH_UNKNOWN, + /*path_in=*/"recipients"))); + EXPECT_THAT(section_group.string_sections[0].content, ElementsAre("recipient1", "recipient2", "recipient3")); - EXPECT_THAT(sections[1].metadata.id, Eq(1)); - EXPECT_THAT(sections[1].metadata.path, Eq("subject")); - EXPECT_THAT(sections[1].content, ElementsAre("the subject")); + EXPECT_THAT(section_group.string_sections[1].metadata, + Eq(SectionMetadata( + /*id_in=*/2, TYPE_STRING, TOKENIZER_PLAIN, TERM_MATCH_EXACT, + NUMERIC_MATCH_UNKNOWN, + /*path_in=*/"subject"))); + EXPECT_THAT(section_group.string_sections[1].content, + ElementsAre("the subject")); + + // Integer sections + EXPECT_THAT(section_group.integer_sections, SizeIs(2)); + + EXPECT_THAT(section_group.integer_sections[0].metadata, + Eq(SectionMetadata(/*id_in=*/0, TYPE_INT64, TOKENIZER_NONE, + TERM_MATCH_UNKNOWN, NUMERIC_MATCH_RANGE, + /*path_in=*/"recipientIds"))); + EXPECT_THAT(section_group.integer_sections[0].content, ElementsAre(1, 2, 3)); + + EXPECT_THAT(section_group.integer_sections[1].metadata, + Eq(SectionMetadata(/*id_in=*/3, TYPE_INT64, TOKENIZER_NONE, + TERM_MATCH_UNKNOWN, NUMERIC_MATCH_RANGE, + /*path_in=*/"timestamp"))); + EXPECT_THAT(section_group.integer_sections[1].content, + ElementsAre(kDefaultTimestamp)); +} + +TEST_F(SectionManagerTest, ExtractSectionsNested) { + ICING_ASSERT_OK_AND_ASSIGN( + auto section_manager, + SectionManager::Create(type_config_map_, schema_type_mapper_.get())); // Extracts all sections from 'Conversation' document ICING_ASSERT_OK_AND_ASSIGN( - sections, section_manager->ExtractSections(conversation_document_)); - EXPECT_THAT(sections.size(), Eq(2)); - - // Section id 3 (name) not found in document, so the first section id found - // is 1 below. 
- EXPECT_THAT(sections[0].metadata.id, Eq(0)); - EXPECT_THAT(sections[0].metadata.path, Eq("emails.recipients")); - EXPECT_THAT(sections[0].content, + SectionGroup section_group, + section_manager->ExtractSections(conversation_document_)); + + // String sections + EXPECT_THAT(section_group.string_sections, SizeIs(2)); + + EXPECT_THAT(section_group.string_sections[0].metadata, + Eq(SectionMetadata( + /*id_in=*/1, TYPE_STRING, TOKENIZER_PLAIN, TERM_MATCH_EXACT, + NUMERIC_MATCH_UNKNOWN, + /*path_in=*/"emails.recipients"))); + EXPECT_THAT(section_group.string_sections[0].content, ElementsAre("recipient1", "recipient2", "recipient3", "recipient1", "recipient2", "recipient3")); - EXPECT_THAT(sections[1].metadata.id, Eq(1)); - EXPECT_THAT(sections[1].metadata.path, Eq("emails.subject")); - EXPECT_THAT(sections[1].content, ElementsAre("the subject", "the subject")); + EXPECT_THAT(section_group.string_sections[1].metadata, + Eq(SectionMetadata( + /*id_in=*/2, TYPE_STRING, TOKENIZER_PLAIN, TERM_MATCH_EXACT, + NUMERIC_MATCH_UNKNOWN, + /*path_in=*/"emails.subject"))); + EXPECT_THAT(section_group.string_sections[1].content, + ElementsAre("the subject", "the subject")); + + // Integer sections + EXPECT_THAT(section_group.integer_sections, SizeIs(2)); + + EXPECT_THAT(section_group.integer_sections[0].metadata, + Eq(SectionMetadata(/*id_in=*/0, TYPE_INT64, TOKENIZER_NONE, + TERM_MATCH_UNKNOWN, NUMERIC_MATCH_RANGE, + /*path_in=*/"emails.recipientIds"))); + EXPECT_THAT(section_group.integer_sections[0].content, + ElementsAre(1, 2, 3, 1, 2, 3)); + + EXPECT_THAT(section_group.integer_sections[1].metadata, + Eq(SectionMetadata(/*id_in=*/3, TYPE_INT64, TOKENIZER_NONE, + TERM_MATCH_UNKNOWN, NUMERIC_MATCH_RANGE, + /*path_in=*/"emails.timestamp"))); + EXPECT_THAT(section_group.integer_sections[1].content, + ElementsAre(kDefaultTimestamp, kDefaultTimestamp)); } TEST_F(SectionManagerTest, @@ -344,54 +490,53 @@ TEST_F(SectionManagerTest, // Create an int property with a string_indexing_config 
auto int_property = type_with_non_string_properties.add_properties(); int_property->set_property_name("int"); - int_property->set_data_type(PropertyConfigProto::DataType::INT64); - int_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + int_property->set_data_type(TYPE_INT64); + int_property->set_cardinality(CARDINALITY_REQUIRED); int_property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); + TERM_MATCH_EXACT); int_property->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + TOKENIZER_PLAIN); // Create a double property with a string_indexing_config auto double_property = type_with_non_string_properties.add_properties(); double_property->set_property_name("double"); - double_property->set_data_type(PropertyConfigProto::DataType::DOUBLE); - double_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + double_property->set_data_type(TYPE_DOUBLE); + double_property->set_cardinality(CARDINALITY_REQUIRED); double_property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); + TERM_MATCH_EXACT); double_property->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + TOKENIZER_PLAIN); // Create a boolean property with a string_indexing_config auto boolean_property = type_with_non_string_properties.add_properties(); boolean_property->set_property_name("boolean"); - boolean_property->set_data_type(PropertyConfigProto::DataType::BOOLEAN); - boolean_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + boolean_property->set_data_type(TYPE_BOOLEAN); + boolean_property->set_cardinality(CARDINALITY_REQUIRED); boolean_property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); + TERM_MATCH_EXACT); boolean_property->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + 
TOKENIZER_PLAIN); // Create a bytes property with a string_indexing_config auto bytes_property = type_with_non_string_properties.add_properties(); bytes_property->set_property_name("bytes"); - bytes_property->set_data_type(PropertyConfigProto::DataType::BYTES); - bytes_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + bytes_property->set_data_type(TYPE_BYTES); + bytes_property->set_cardinality(CARDINALITY_REQUIRED); bytes_property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); + TERM_MATCH_EXACT); bytes_property->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + TOKENIZER_PLAIN); // Create a document property with a string_indexing_config auto document_property = type_with_non_string_properties.add_properties(); document_property->set_property_name("document"); - document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + document_property->set_data_type(TYPE_DOCUMENT); document_property->set_schema_type(empty_type.schema_type()); - document_property->set_cardinality( - PropertyConfigProto::Cardinality::REQUIRED); + document_property->set_cardinality(CARDINALITY_REQUIRED); document_property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); + TERM_MATCH_EXACT); document_property->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + TOKENIZER_PLAIN); // Setup classes to create the section manager SchemaUtil::TypeConfigMap type_config_map; @@ -435,9 +580,109 @@ TEST_F(SectionManagerTest, .Build(); // Extracts sections from 'Schema' document - ICING_ASSERT_OK_AND_ASSIGN(auto sections, + ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group, + section_manager->ExtractSections(document)); + EXPECT_THAT(section_group.string_sections, IsEmpty()); + EXPECT_THAT(section_group.integer_sections, IsEmpty()); +} + +TEST_F(SectionManagerTest, + 
NonIntegerFieldsWithIntegerIndexingConfigDontCreateSections) { + // Create a schema for an empty document. + SchemaTypeConfigProto empty_type; + empty_type.set_schema_type("EmptySchema"); + + // Create a schema with all the non-integer fields + SchemaTypeConfigProto type_with_non_integer_properties; + type_with_non_integer_properties.set_schema_type("Schema"); + + // Create a string property with an integer_indexing_config + auto string_property = type_with_non_integer_properties.add_properties(); + string_property->set_property_name("string"); + string_property->set_data_type(TYPE_STRING); + string_property->set_cardinality(CARDINALITY_REQUIRED); + string_property->mutable_integer_indexing_config()->set_numeric_match_type( + NUMERIC_MATCH_RANGE); + + // Create a double property with an integer_indexing_config + auto double_property = type_with_non_integer_properties.add_properties(); + double_property->set_property_name("double"); + double_property->set_data_type(TYPE_DOUBLE); + double_property->set_cardinality(CARDINALITY_REQUIRED); + double_property->mutable_integer_indexing_config()->set_numeric_match_type( + NUMERIC_MATCH_RANGE); + + // Create a boolean property with an integer_indexing_config + auto boolean_property = type_with_non_integer_properties.add_properties(); + boolean_property->set_property_name("boolean"); + boolean_property->set_data_type(TYPE_BOOLEAN); + boolean_property->set_cardinality(CARDINALITY_REQUIRED); + boolean_property->mutable_integer_indexing_config()->set_numeric_match_type( + NUMERIC_MATCH_RANGE); + + // Create a bytes property with an integer_indexing_config + auto bytes_property = type_with_non_integer_properties.add_properties(); + bytes_property->set_property_name("bytes"); + bytes_property->set_data_type(TYPE_BYTES); + bytes_property->set_cardinality(CARDINALITY_REQUIRED); + bytes_property->mutable_integer_indexing_config()->set_numeric_match_type( + NUMERIC_MATCH_RANGE); + + // Create a document property with an 
integer_indexing_config + auto document_property = type_with_non_integer_properties.add_properties(); + document_property->set_property_name("document"); + document_property->set_data_type(TYPE_DOCUMENT); + document_property->set_schema_type(empty_type.schema_type()); + document_property->set_cardinality(CARDINALITY_REQUIRED); + document_property->mutable_integer_indexing_config()->set_numeric_match_type( + NUMERIC_MATCH_RANGE); + + // Setup classes to create the section manager + SchemaUtil::TypeConfigMap type_config_map; + type_config_map.emplace(type_with_non_integer_properties.schema_type(), + type_with_non_integer_properties); + type_config_map.emplace(empty_type.schema_type(), empty_type); + + // DynamicTrieKeyMapper uses 3 internal arrays for bookkeeping. Give each one + // 128KiB so the total DynamicTrieKeyMapper should get 384KiB + int key_mapper_size = 3 * 128 * 1024; + std::string dir = GetTestTempDir() + "/non_integer_fields"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper, + DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, dir, + key_mapper_size)); + ICING_ASSERT_OK(schema_type_mapper->Put( + type_with_non_integer_properties.schema_type(), /*schema_type_id=*/0)); + ICING_ASSERT_OK(schema_type_mapper->Put(empty_type.schema_type(), + /*schema_type_id=*/1)); + + ICING_ASSERT_OK_AND_ASSIGN( + auto section_manager, + SectionManager::Create(type_config_map, schema_type_mapper.get())); + + // Create an empty document to be nested + DocumentProto empty_document = DocumentBuilder() + .SetKey("icing", "uri1") + .SetSchema(empty_type.schema_type()) + .Build(); + + // Create a document that follows "Schema" + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "uri2") + .SetSchema(type_with_non_integer_properties.schema_type()) + .AddStringProperty("string", "abc") + .AddDoubleProperty("double", 0.2) + .AddBooleanProperty("boolean", true) + .AddBytesProperty("bytes", "attachment bytes") + 
.AddDocumentProperty("document", empty_document) + .Build(); + + // Extracts sections from 'Schema' document + ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group, section_manager->ExtractSections(document)); - EXPECT_THAT(sections.size(), Eq(0)); + EXPECT_THAT(section_group.string_sections, IsEmpty()); + EXPECT_THAT(section_group.integer_sections, IsEmpty()); } TEST_F(SectionManagerTest, AssignSectionsRecursivelyForDocumentFields) { @@ -447,12 +692,19 @@ TEST_F(SectionManagerTest, AssignSectionsRecursivelyForDocumentFields) { auto string_property = document_type.add_properties(); string_property->set_property_name("string"); - string_property->set_data_type(PropertyConfigProto::DataType::STRING); - string_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + string_property->set_data_type(TYPE_STRING); + string_property->set_cardinality(CARDINALITY_REQUIRED); string_property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); + TERM_MATCH_EXACT); string_property->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + TOKENIZER_PLAIN); + + auto integer_property = document_type.add_properties(); + integer_property->set_property_name("integer"); + integer_property->set_data_type(TYPE_INT64); + integer_property->set_cardinality(CARDINALITY_REQUIRED); + integer_property->mutable_integer_indexing_config()->set_numeric_match_type( + NUMERIC_MATCH_RANGE); // Create the outer schema which has the document property. 
SchemaTypeConfigProto type; @@ -460,10 +712,9 @@ TEST_F(SectionManagerTest, AssignSectionsRecursivelyForDocumentFields) { auto document_property = type.add_properties(); document_property->set_property_name("document"); - document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + document_property->set_data_type(TYPE_DOCUMENT); document_property->set_schema_type(document_type.schema_type()); - document_property->set_cardinality( - PropertyConfigProto::Cardinality::REQUIRED); + document_property->set_cardinality(CARDINALITY_REQUIRED); // Opt into recursing into the document fields. document_property->mutable_document_indexing_config() @@ -474,6 +725,7 @@ TEST_F(SectionManagerTest, AssignSectionsRecursivelyForDocumentFields) { .SetKey("icing", "uri1") .SetSchema(document_type.schema_type()) .AddStringProperty("string", "foo") + .AddInt64Property("integer", 123) .Build(); // Create the outer document that holds the inner document @@ -509,10 +761,11 @@ TEST_F(SectionManagerTest, AssignSectionsRecursivelyForDocumentFields) { SectionManager::Create(type_config_map, schema_type_mapper.get())); // Extracts sections from 'Schema' document; there should be the 1 string - // property inside the document. - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections, + // property and 1 integer property inside the document. 
+ ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group, section_manager->ExtractSections(outer_document)); - EXPECT_THAT(sections, SizeIs(1)); + EXPECT_THAT(section_group.string_sections, SizeIs(1)); + EXPECT_THAT(section_group.integer_sections, SizeIs(1)); } TEST_F(SectionManagerTest, DontAssignSectionsRecursivelyForDocumentFields) { @@ -522,12 +775,19 @@ TEST_F(SectionManagerTest, DontAssignSectionsRecursivelyForDocumentFields) { auto string_property = document_type.add_properties(); string_property->set_property_name("string"); - string_property->set_data_type(PropertyConfigProto::DataType::STRING); - string_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + string_property->set_data_type(TYPE_STRING); + string_property->set_cardinality(CARDINALITY_REQUIRED); string_property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); + TERM_MATCH_EXACT); string_property->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + TOKENIZER_PLAIN); + + auto integer_property = document_type.add_properties(); + integer_property->set_property_name("integer"); + integer_property->set_data_type(TYPE_INT64); + integer_property->set_cardinality(CARDINALITY_REQUIRED); + integer_property->mutable_integer_indexing_config()->set_numeric_match_type( + NUMERIC_MATCH_RANGE); // Create the outer schema which has the document property. 
SchemaTypeConfigProto type; @@ -535,10 +795,9 @@ TEST_F(SectionManagerTest, DontAssignSectionsRecursivelyForDocumentFields) { auto document_property = type.add_properties(); document_property->set_property_name("document"); - document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + document_property->set_data_type(TYPE_DOCUMENT); document_property->set_schema_type(document_type.schema_type()); - document_property->set_cardinality( - PropertyConfigProto::Cardinality::REQUIRED); + document_property->set_cardinality(CARDINALITY_REQUIRED); // Opt into recursing into the document fields. document_property->mutable_document_indexing_config() @@ -549,6 +808,7 @@ TEST_F(SectionManagerTest, DontAssignSectionsRecursivelyForDocumentFields) { .SetKey("icing", "uri1") .SetSchema(document_type.schema_type()) .AddStringProperty("string", "foo") + .AddInt64Property("integer", 123) .Build(); // Create the outer document that holds the inner document @@ -585,10 +845,13 @@ TEST_F(SectionManagerTest, DontAssignSectionsRecursivelyForDocumentFields) { // Extracts sections from 'Schema' document; there won't be any since we // didn't recurse into the document to see the inner string property - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections, + ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group, section_manager->ExtractSections(outer_document)); - EXPECT_THAT(sections, IsEmpty()); + EXPECT_THAT(section_group.string_sections, IsEmpty()); + EXPECT_THAT(section_group.integer_sections, IsEmpty()); } +} // namespace + } // namespace lib } // namespace icing diff --git a/icing/schema/section.h b/icing/schema/section.h index 34c8c58..241095b 100644 --- a/icing/schema/section.h +++ b/icing/schema/section.h @@ -54,11 +54,14 @@ struct SectionMetadata { // A unique id of property within a type config SectionId id; - // How strings should be tokenized. It is invalid for a section to have - // tokenizer == 'NONE'. + // Indexable data type of this section. E.g. 
STRING, INT64. + PropertyConfigProto::DataType::Code data_type; + + // How strings should be tokenized. It is invalid for a string section + // (data_type == 'STRING') to have tokenizer == 'NONE'. StringIndexingConfig::TokenizerType::Code tokenizer; - // How tokens in this section should be matched. + // How tokens in a string section should be matched. // // TermMatchType::UNKNOWN: // Terms will not match anything @@ -70,30 +73,68 @@ struct SectionMetadata { // Terms will be only stored as an exact match, "fool" only matches "fool" TermMatchType::Code term_match_type = TermMatchType::UNKNOWN; - SectionMetadata(SectionId id_in, TermMatchType::Code term_match_type_in, - StringIndexingConfig::TokenizerType::Code tokenizer, - std::string&& path_in) + // How tokens in a numeric section should be matched. + // + // NumericMatchType::UNKNOWN: + // Contents will not match anything. It is invalid for a numeric section + // (data_type == 'INT64') to have numeric_match_type == 'UNKNOWN'. + // + // NumericMatchType::RANGE: + // Contents will be matched by a range query. 
+ IntegerIndexingConfig::NumericMatchType::Code numeric_match_type; + + explicit SectionMetadata( + SectionId id_in, PropertyConfigProto::DataType::Code data_type_in, + StringIndexingConfig::TokenizerType::Code tokenizer, + TermMatchType::Code term_match_type_in, + IntegerIndexingConfig::NumericMatchType::Code numeric_match_type_in, + std::string&& path_in) : path(std::move(path_in)), id(id_in), + data_type(data_type_in), tokenizer(tokenizer), - term_match_type(term_match_type_in) {} + term_match_type(term_match_type_in), + numeric_match_type(numeric_match_type_in) {} + + SectionMetadata(const SectionMetadata& other) = default; + SectionMetadata& operator=(const SectionMetadata& other) = default; + + SectionMetadata(SectionMetadata&& other) = default; + SectionMetadata& operator=(SectionMetadata&& other) = default; bool operator==(const SectionMetadata& rhs) const { - return path == rhs.path && id == rhs.id && tokenizer == rhs.tokenizer && - term_match_type == rhs.term_match_type; + return path == rhs.path && id == rhs.id && data_type == rhs.data_type && + tokenizer == rhs.tokenizer && + term_match_type == rhs.term_match_type && + numeric_match_type == rhs.numeric_match_type; } }; // Section is an icing internal concept similar to document property but with // extra metadata. The content can be a value or the combination of repeated -// values of a property. +// values of a property, and the type of content is specified by template. 
+// +// Current supported types: +// - std::string_view (PropertyConfigProto::DataType::STRING) +// - int64_t (PropertyConfigProto::DataType::INT64) +template <typename T> struct Section { SectionMetadata metadata; - std::vector<std::string_view> content; + std::vector<T> content; - Section(SectionMetadata&& metadata_in, - std::vector<std::string_view>&& content_in) + explicit Section(SectionMetadata&& metadata_in, std::vector<T>&& content_in) : metadata(std::move(metadata_in)), content(std::move(content_in)) {} + + PropertyConfigProto::DataType::Code data_type() const { + return metadata.data_type; + } +}; + +// Groups of different type sections. Callers can access sections with types +// they want and avoid going through non-desired ones. +struct SectionGroup { + std::vector<Section<std::string_view>> string_sections; + std::vector<Section<int64_t>> integer_sections; }; } // namespace lib diff --git a/icing/scoring/priority-queue-scored-document-hits-ranker.h b/icing/scoring/priority-queue-scored-document-hits-ranker.h index e0ae4b0..3ef2ae5 100644 --- a/icing/scoring/priority-queue-scored-document-hits-ranker.h +++ b/icing/scoring/priority-queue-scored-document-hits-ranker.h @@ -51,7 +51,14 @@ class PriorityQueueScoredDocumentHitsRanker : public ScoredDocumentHitsRanker { bool operator()(const ScoredDocumentHit& lhs, const ScoredDocumentHit& rhs) const { - return is_ascending_ == !(lhs < rhs); + // STL comparator requirement: equal MUST return false. + // If writing `return is_ascending_ == !(lhs < rhs)`: + // - When lhs == rhs, !(lhs < rhs) is true + // - If is_ascending_ is true, then we return true for equal case! 
+ if (is_ascending_) { + return rhs < lhs; + } + return lhs < rhs; } private: diff --git a/icing/scoring/scored-document-hit.h b/icing/scoring/scored-document-hit.h index 079ba7e..96ca6aa 100644 --- a/icing/scoring/scored-document-hit.h +++ b/icing/scoring/scored-document-hit.h @@ -71,7 +71,14 @@ class ScoredDocumentHitComparator { bool operator()(const ScoredDocumentHit& lhs, const ScoredDocumentHit& rhs) const { - return is_descending_ == !(lhs < rhs); + // STL comparator requirement: equal MUST return false. + // If writing `return is_descending_ == !(lhs < rhs)`: + // - When lhs == rhs, !(lhs < rhs) is true + // - If is_descending_ is true, then we return true for equal case! + if (is_descending_) { + return rhs < lhs; + } + return lhs < rhs; } private: diff --git a/icing/scoring/scorer.cc b/icing/scoring/scorer.cc index 5f33e66..14a004e 100644 --- a/icing/scoring/scorer.cc +++ b/icing/scoring/scorer.cc @@ -192,6 +192,10 @@ libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Scorer::Create( case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP: return std::make_unique<UsageScorer>( document_store, scoring_spec.rank_by(), default_score); + case ScoringSpecProto::RankingStrategy::JOIN_AGGREGATE_SCORE: + ICING_LOG(WARNING) + << "JOIN_AGGREGATE_SCORE not implemented, falling back to NoScorer"; + [[fallthrough]]; case ScoringSpecProto::RankingStrategy::NONE: return std::make_unique<NoScorer>(default_score); } diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc index 1062f50..5432cde 100644 --- a/icing/scoring/scorer_test.cc +++ b/icing/scoring/scorer_test.cc @@ -41,12 +41,6 @@ namespace lib { namespace { using ::testing::Eq; -constexpr PropertyConfigProto::DataType::Code TYPE_STRING = - PropertyConfigProto::DataType::STRING; - -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = - PropertyConfigProto::Cardinality::REQUIRED; - class ScorerTest : public testing::Test { protected: ScorerTest() diff --git 
a/icing/scoring/scoring-processor_test.cc b/icing/scoring/scoring-processor_test.cc index ad63a2b..921fc7f 100644 --- a/icing/scoring/scoring-processor_test.cc +++ b/icing/scoring/scoring-processor_test.cc @@ -41,12 +41,6 @@ using ::testing::Gt; using ::testing::IsEmpty; using ::testing::SizeIs; -constexpr PropertyConfigProto::DataType::Code TYPE_STRING = - PropertyConfigProto::DataType::STRING; - -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; - class ScoringProcessorTest : public testing::Test { protected: ScoringProcessorTest() diff --git a/icing/store/document-id.h b/icing/store/document-id.h index 3230819..7ea33b8 100644 --- a/icing/store/document-id.h +++ b/icing/store/document-id.h @@ -26,7 +26,8 @@ using DocumentId = int32_t; // We use 22 bits to encode document_ids and use the largest value (2^22 - 1) to // represent an invalid document_id. inline constexpr int kDocumentIdBits = 22; -inline constexpr DocumentId kInvalidDocumentId = (1u << kDocumentIdBits) - 1; +inline constexpr DocumentId kInvalidDocumentId = + (INT32_C(1) << kDocumentIdBits) - 1; inline constexpr DocumentId kMinDocumentId = 0; inline constexpr DocumentId kMaxDocumentId = kInvalidDocumentId - 1; diff --git a/icing/store/document-store_benchmark.cc b/icing/store/document-store_benchmark.cc index ab9bff1..a4b3a17 100644 --- a/icing/store/document-store_benchmark.cc +++ b/icing/store/document-store_benchmark.cc @@ -65,14 +65,6 @@ namespace lib { namespace { -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; - -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; - class DestructibleDirectory { public: explicit DestructibleDirectory(const Filesystem& filesystem, @@ -101,17 +93,18 @@ DocumentProto CreateDocument(const 
std::string namespace_, SchemaProto CreateSchema() { return SchemaBuilder() - .AddType( - SchemaTypeConfigBuilder() - .SetType("email") - .AddProperty(PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty(PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); } diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index e158fdc..7cf951a 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -91,17 +91,6 @@ const NamespaceStorageInfoProto& GetNamespaceStorageInfo( return std::move(NamespaceStorageInfoProto()); } -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; - -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; - -constexpr PropertyConfigProto::DataType::Code TYPE_INT = - PropertyConfigProto::DataType::INT64; - UsageReport CreateUsageReport(std::string name_space, std::string uri, int64 timestamp_ms, UsageReport::UsageType usage_type) { @@ -183,16 +172,16 @@ class DocumentStoreTest : public ::testing::Test { .AddType( SchemaTypeConfigBuilder() .SetType("email") - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - 
PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -2363,7 +2352,7 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) { .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); @@ -2597,7 +2586,7 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) { .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); @@ -3427,11 +3416,11 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { .SetType("email") .AddProperty(PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .Build(); SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build(); @@ -3476,14 +3465,14 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType("alarm") - 
.AddProperty( - PropertyConfigBuilder() - .SetName("name") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName("time") - .SetDataType(TYPE_INT) + .SetDataType(TYPE_INT64) .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(email_type_config) .Build(); @@ -3527,11 +3516,11 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryDoesntUpdateTypeIds) { .SetType("email") .AddProperty(PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .Build(); SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build(); @@ -3576,14 +3565,14 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryDoesntUpdateTypeIds) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType("alarm") - .AddProperty( - PropertyConfigBuilder() - .SetName("name") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName("time") - .SetDataType(TYPE_INT) + .SetDataType(TYPE_INT64) .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(email_type_config) .Build(); @@ -3623,11 +3612,11 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) { .SetType("email") .AddProperty(PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + 
.SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .Build(); SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build(); @@ -3689,7 +3678,7 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) { .SetType("email") .AddProperty(PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .Build(); schema = SchemaBuilder().AddType(email_type_config).Build(); @@ -3728,11 +3717,11 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) { .SetType("email") .AddProperty(PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName("body") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .Build(); SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build(); @@ -3794,7 +3783,7 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) { .SetType("email") .AddProperty(PropertyConfigBuilder() .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .Build(); schema = SchemaBuilder().AddType(email_type_config).Build(); @@ -3828,16 +3817,16 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType("email") - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - 
.SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); std::string schema_store_dir = schema_store_dir_ + "_migrate"; @@ -3948,20 +3937,20 @@ TEST_F(DocumentStoreTest, GetDebugInfo) { SchemaBuilder() .AddType(SchemaTypeConfigBuilder() .SetType("email") - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty( PropertyConfigBuilder() .SetName("name") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); std::string schema_store_dir = schema_store_dir_ + "_custom"; diff --git a/icing/tokenization/token.h b/icing/tokenization/token.h index 41fbee0..05d6fe4 100644 --- a/icing/tokenization/token.h +++ b/icing/tokenization/token.h @@ -47,7 +47,7 @@ struct Token { QUERY_RIGHT_PARENTHESES, // Right parentheses // Types used in URL 
tokenization - URL_SCHEME, // "http", "https" + URL_SCHEME, // "http", "https", "ftp", "content" URL_USERNAME, URL_PASSWORD, URL_HOST_COMMON_PART, // Hosts are split into two types, common and diff --git a/icing/tokenization/tokenizer-factory.cc b/icing/tokenization/tokenizer-factory.cc index dc5cfdf..d120ac8 100644 --- a/icing/tokenization/tokenizer-factory.cc +++ b/icing/tokenization/tokenizer-factory.cc @@ -24,6 +24,11 @@ #include "icing/tokenization/raw-query-tokenizer.h" #include "icing/tokenization/rfc822-tokenizer.h" #include "icing/tokenization/tokenizer.h" + +#ifdef ENABLE_URL_TOKENIZER +#include "icing/tokenization/url-tokenizer.h" +#endif // ENABLE_URL_TOKENIZER + #include "icing/tokenization/verbatim-tokenizer.h" #include "icing/util/status-macros.h" @@ -44,6 +49,12 @@ CreateIndexingTokenizer(StringIndexingConfig::TokenizerType::Code type, return std::make_unique<VerbatimTokenizer>(); case StringIndexingConfig::TokenizerType::RFC822: return std::make_unique<Rfc822Tokenizer>(); +// TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export +// to Android. 
+#ifdef ENABLE_URL_TOKENIZER + case StringIndexingConfig::TokenizerType::URL: + return std::make_unique<UrlTokenizer>(); +#endif // ENABLE_URL_TOKENIZER case StringIndexingConfig::TokenizerType::NONE: [[fallthrough]]; default: diff --git a/icing/util/document-validator_test.cc b/icing/util/document-validator_test.cc index 46a2679..310494a 100644 --- a/icing/util/document-validator_test.cc +++ b/icing/util/document-validator_test.cc @@ -32,6 +32,7 @@ namespace icing { namespace lib { namespace { + using ::testing::HasSubstr; // type and property names of EmailMessage @@ -47,16 +48,6 @@ constexpr char kPropertyEmails[] = "emails"; constexpr char kDefaultNamespace[] = "icing"; constexpr char kDefaultString[] = "This is a string."; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = - PropertyConfigProto::Cardinality::REQUIRED; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = - PropertyConfigProto::Cardinality::REPEATED; - -constexpr PropertyConfigProto::DataType::Code TYPE_STRING = - PropertyConfigProto::DataType::STRING; - class DocumentValidatorTest : public ::testing::Test { protected: DocumentValidatorTest() {} diff --git a/icing/util/tokenized-document.cc b/icing/util/tokenized-document.cc index 885e489..e741987 100644 --- a/icing/util/tokenized-document.cc +++ b/icing/util/tokenized-document.cc @@ -49,9 +49,11 @@ libtextclassifier3::Status TokenizedDocument::Tokenize( DocumentValidator validator(schema_store); ICING_RETURN_IF_ERROR(validator.Validate(document_)); - ICING_ASSIGN_OR_RETURN(std::vector<Section> sections, + ICING_ASSIGN_OR_RETURN(SectionGroup section_group, schema_store->ExtractSections(document_)); - for (const Section& section : sections) { + // string sections + for (const Section<std::string_view>& section : + section_group.string_sections) { 
ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer, tokenizer_factory::CreateIndexingTokenizer( section.metadata.tokenizer, language_segmenter)); diff --git a/proto/icing/proto/schema.proto b/proto/icing/proto/schema.proto index f960708..d9c43e2 100644 --- a/proto/icing/proto/schema.proto +++ b/proto/icing/proto/schema.proto @@ -111,6 +111,21 @@ message StringIndexingConfig { // original string as an rfc822 token. // See more here: https://datatracker.ietf.org/doc/html/rfc822 RFC822 = 3; + + // Tokenizes text as an url address. This tokenizes a url string into a + // token for each component in the url, as well as any significant + // url suffixes. For example, + // https://www.google.com/path/subpath?query#ref would be tokenizes into a + // scheme token "https“; 3 host tokens "www", "google", "com"; 2 path + // tokens "path", "subpath"; a query token "query"; a reference token + // "ref"; and 3 suffix tokens + // "https://www.google.com/path/subpath?query#ref", + // "www.google.com/path/subpath?query#ref", + // "google.com/path/subpath?query#ref". + // Currently only supports tokenization of one url string at a time + // i.e. the input string cannot have spaces in the middle, but can have + // leading or trailing spaces. + URL = 4; } } optional TokenizerType.Code tokenizer_type = 2; @@ -128,10 +143,31 @@ message DocumentIndexingConfig { optional bool index_nested_properties = 1; } +// Describes how a int64 property should be indexed. +// Next tag: 3 +message IntegerIndexingConfig { + // OPTIONAL: Indicates how the int64 contents of this property should be + // matched. + // + // The default value is UNKNOWN. + message NumericMatchType { + enum Code { + // Contents in this property will not be indexed. Useful if the int64 + // property type is not indexable. + UNKNOWN = 0; + + // Contents in this property should only be returned for queries matching + // the range. 
+ RANGE = 1; + } + } + optional NumericMatchType.Code numeric_match_type = 1; +} + // Describes the schema of a single property of Documents that belong to a // specific SchemaTypeConfigProto. These can be considered as a rich, structured // type for each property of Documents accepted by IcingSearchEngine. -// Next tag: 7 +// Next tag: 8 message PropertyConfigProto { // REQUIRED: Name that uniquely identifies a property within an Document of // a specific SchemaTypeConfigProto. @@ -208,6 +244,10 @@ message PropertyConfigProto { // OPTIONAL: Describes how document properties should be indexed. optional DocumentIndexingConfig document_indexing_config = 6; + + // OPTIONAL: Describes how int64 properties should be indexed. Int64 + // properties that do not set the indexing config will not be indexed. + optional IntegerIndexingConfig integer_indexing_config = 7; } // List of all supported types constitutes the schema used by Icing. diff --git a/proto/icing/proto/scoring.proto b/proto/icing/proto/scoring.proto index 375e9bd..13861c9 100644 --- a/proto/icing/proto/scoring.proto +++ b/proto/icing/proto/scoring.proto @@ -68,6 +68,9 @@ message ScoringSpecProto { // Ranked by relevance score, currently computed as BM25F score. RELEVANCE_SCORE = 9; + + // Ranked by the aggregated score of the joined documents. + JOIN_AGGREGATE_SCORE = 10; } } optional RankingStrategy.Code rank_by = 1; diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto index e3324a3..181c63c 100644 --- a/proto/icing/proto/search.proto +++ b/proto/icing/proto/search.proto @@ -27,7 +27,7 @@ option java_multiple_files = true; option objc_class_prefix = "ICNG"; // Client-supplied specifications on what documents to retrieve. -// Next tag: 7 +// Next tag: 8 message SearchSpecProto { // REQUIRED: The "raw" query string that users may type. For example, "cat" // will search for documents with the term cat in it. 
@@ -86,6 +86,10 @@ message SearchSpecProto { // TODO(b/208654892) Remove this field once EXPERIMENTAL_ICING_ADVANCED_QUERY // is fully supported. optional SearchType.Code search_type = 6 [default = ICING_RAW_QUERY]; + + // OPTIONAL: If this field is present, join documents based on a nested + // SearchSpec. + optional JoinSpecProto join_spec = 7; } // Client-supplied specifications on what to include/how to format the search @@ -282,7 +286,7 @@ message SearchResultProto { optional StatusProto status = 1; // The Results that matched the query. Empty if there was an error. - // Next tag: 4 + // Next tag: 5 message ResultProto { // Document that matches the SearchSpecProto. optional DocumentProto document = 1; @@ -294,6 +298,9 @@ message SearchResultProto { // The score that the document was ranked by. The meaning of this score is // determined by ScoringSpecProto.rank_by. optional double score = 3; + + // The documents that were joined to a parent document. + repeated ResultProto joined_results = 4; } repeated ResultProto results = 2; @@ -418,3 +425,50 @@ message SuggestionResponse { repeated Suggestion suggestions = 2; } + +// Specification for a left outer join. +// +// Next tag: 7 +message JoinSpecProto { + // A nested SearchSpec that will be used to retrieve joined documents. If you + // are only looking to join on Action type documents, you could set a schema + // filter in this SearchSpec. This includes the nested search query. See + // SearchSpecProto. + optional SearchSpecProto nested_search_spec = 1; + + // The equivalent of a primary key in SQL. This is an expression that will be + // used to match child documents from the nested search to this document. One + // such expression is qualifiedId(). When used, it means the + // child_property_expression in the joined documents must be equal to the + // qualified id. + // TODO(b/256022027) allow for parent_property_expression to be any property + // of the parent document. 
+ optional string parent_property_expression = 2; + + // The equivalent of a foreign key in SQL. This defines an equality constraint + // between a property in a child document and a property in the parent + // document. For example, if you want to join Action documents which an + // entityId property containing a fully qualified document id, + // child_property_expression can be set to "entityId". + // TODO(b/256022027) figure out how to allow this to refer to documents + // outside of same pkg+db+ns. + optional string child_property_expression = 3; + + // The max amount of joined documents to join to a parent document. + optional int32 max_joined_result_count = 4; + + // The strategy by which to score the aggregation of joined documents. For + // example, you might want to know which entity document has the most actions + // taken on it. If JOIN_AGGREGATE_SCORE is used in the base SearchSpecProto, + // the COUNT value will rank entity documents based on the number of joined + // documents. + enum AggregationScore { + UNDEFINED = 0; + COUNT = 1; + MIN = 2; + AVG = 3; + MAX = 4; + SUM = 5; + } + optional AggregationScore aggregation_score_strategy = 5 [default = COUNT]; +} diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index cf3c8f0..55403b4 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=484090353) +set(synced_AOSP_CL_number=487674301) |