diff options
author | Tim Barron <tjbarron@google.com> | 2024-03-07 00:42:57 +0000 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2024-03-12 15:41:59 +0000 |
commit | 29d4712b67d1ade154739e5fa9a9a7970afe6c0a (patch) | |
tree | 0a26da402c204c50e001539cffc89abbe51b08b6 | |
parent | 030b99d2648aea03e5c2e3809a13fdfacdccde11 (diff) | |
download | icing-29d4712b67d1ade154739e5fa9a9a7970afe6c0a.tar.gz |
Update Icing from upstream.
Descriptions:
========================================================================
Allow adding duplicate hits with the same value into the posting list.
========================================================================
Support an extension to the encoded Hit in Icing's posting list.
========================================================================
Adds description fields to SchemaTypeConfigProto and PropertyConfigProto
========================================================================
Support polymorphism in type property filters
========================================================================
Fix posting list GetMinPostingListSizeToFit size calculation bug
========================================================================
Add instructions to error message for advanced query backward compat.
========================================================================
BUG: 294274922
BUG: 321107391
BUG: 324908653
BUG: 326987971
Change-Id: I9e8e0589a9682ee14ac34ded32b959c1ef64d84d
30 files changed, 1902 insertions, 712 deletions
diff --git a/icing/file/posting_list/flash-index-storage-header.h b/icing/file/posting_list/flash-index-storage-header.h index 6bbf1ba..f7b331c 100644 --- a/icing/file/posting_list/flash-index-storage-header.h +++ b/icing/file/posting_list/flash-index-storage-header.h @@ -33,7 +33,7 @@ class HeaderBlock { // The class used to access the actual header. struct Header { // A magic used to mark the beginning of a valid header. - static constexpr int kMagic = 0xb0780cf4; + static constexpr int kMagic = 0xd1b7b293; int magic; int block_size; int last_indexed_docid; diff --git a/icing/file/posting_list/flash-index-storage_test.cc b/icing/file/posting_list/flash-index-storage_test.cc index ef60037..203041e 100644 --- a/icing/file/posting_list/flash-index-storage_test.cc +++ b/icing/file/posting_list/flash-index-storage_test.cc @@ -16,9 +16,9 @@ #include <unistd.h> -#include <algorithm> -#include <cstdlib> -#include <limits> +#include <cstdint> +#include <memory> +#include <string> #include <utility> #include <vector> @@ -27,6 +27,7 @@ #include "gtest/gtest.h" #include "icing/file/filesystem.h" #include "icing/file/posting_list/flash-index-storage-header.h" +#include "icing/file/posting_list/posting-list-identifier.h" #include "icing/index/hit/hit.h" #include "icing/index/main/posting-list-hit-serializer.h" #include "icing/store/document-id.h" @@ -213,10 +214,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) { EXPECT_THAT(flash_index_storage.empty(), IsFalse()); std::vector<Hit> hits1 = { - Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12), - Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19), - Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100), - Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)}; + Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)}; for (const Hit& hit : hits1) { ICING_ASSERT_OK( serializer_->PrependHit(&posting_list_holder1.posting_list, hit)); @@ -237,10 +242,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) { EXPECT_THAT(flash_index_storage.empty(), IsFalse()); std::vector<Hit> hits2 = { - Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12), - Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19), - Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100), - Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)}; + Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)}; for (const Hit& hit : hits2) { ICING_ASSERT_OK( serializer_->PrependHit(&posting_list_holder2.posting_list, hit)); @@ -273,10 +282,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) { EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list), IsOkAndHolds(IsEmpty())); std::vector<Hit> hits3 = { - Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62), - Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45), - Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12), - Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)}; + Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)}; for (const Hit& hit : hits3) { ICING_ASSERT_OK( serializer_->PrependHit(&posting_list_holder3.posting_list, hit)); @@ -314,10 +327,14 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) { EXPECT_THAT(flash_index_storage.empty(), IsFalse()); std::vector<Hit> hits1 = { - Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12), - Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19), - Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100), - Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)}; + Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)}; for (const Hit& hit : hits1) { ICING_ASSERT_OK( serializer_->PrependHit(&posting_list_holder1.posting_list, hit)); @@ -338,10 +355,14 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) { EXPECT_THAT(flash_index_storage.empty(), IsFalse()); std::vector<Hit> hits2 = { - Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12), - Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19), - Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100), - Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)}; + Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)}; for (const Hit& hit : hits2) { ICING_ASSERT_OK( serializer_->PrependHit(&posting_list_holder2.posting_list, hit)); @@ -374,10 +395,14 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) { EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list), IsOkAndHolds(IsEmpty())); std::vector<Hit> hits3 = { - Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62), - Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45), - Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12), - Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)}; + Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)}; for (const Hit& hit : hits3) { ICING_ASSERT_OK( serializer_->PrependHit(&posting_list_holder3.posting_list, hit)); @@ -417,10 +442,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) { EXPECT_THAT(flash_index_storage.empty(), IsFalse()); std::vector<Hit> hits1 = { - Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12), - Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19), - Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100), - Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)}; + Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)}; for (const Hit& hit : hits1) { ICING_ASSERT_OK( serializer_->PrependHit(&posting_list_holder1.posting_list, hit)); @@ -441,10 +470,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) { EXPECT_THAT(flash_index_storage.empty(), IsFalse()); std::vector<Hit> hits2 = { - Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12), - Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19), - Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100), - Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)}; + Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)}; for (const Hit& hit : hits2) { ICING_ASSERT_OK( serializer_->PrependHit(&posting_list_holder2.posting_list, hit)); @@ -492,10 +525,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) { EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list), IsOkAndHolds(IsEmpty())); std::vector<Hit> hits3 = { - Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62), - Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45), - Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12), - Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)}; + Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)}; for (const Hit& hit : hits3) { ICING_ASSERT_OK( serializer_->PrependHit(&posting_list_holder3.posting_list, hit)); @@ -534,10 +571,14 @@ TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) { EXPECT_THAT(flash_index_storage.empty(), IsFalse()); std::vector<Hit> hits1 = { - Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12), - Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19), - Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100), - Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)}; + Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)}; for (const Hit& hit : hits1) { ICING_ASSERT_OK( serializer_->PrependHit(&posting_list_holder1.posting_list, hit)); @@ -561,10 +602,14 @@ TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) { EXPECT_THAT(flash_index_storage.empty(), IsFalse()); std::vector<Hit> hits2 = { - Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12), - Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19), - Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100), - Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)}; + Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)}; for (const Hit& hit : hits2) { ICING_ASSERT_OK( serializer_->PrependHit(&posting_list_holder2.posting_list, hit)); diff --git a/icing/file/posting_list/index-block_test.cc b/icing/file/posting_list/index-block_test.cc index ebc9ba4..d841e79 100644 --- a/icing/file/posting_list/index-block_test.cc +++ b/icing/file/posting_list/index-block_test.cc @@ -14,11 +14,16 @@ #include "icing/file/posting_list/index-block.h" +#include <memory> +#include <string> +#include <vector> + #include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/file/filesystem.h" -#include "icing/file/posting_list/posting-list-used.h" +#include "icing/file/posting_list/posting-list-common.h" +#include "icing/index/hit/hit.h" #include "icing/index/main/posting-list-hit-serializer.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" @@ -67,7 +72,7 @@ class IndexBlockTest : public ::testing::Test { }; TEST_F(IndexBlockTest, CreateFromUninitializedRegionProducesEmptyBlock) { - constexpr int kPostingListBytes = 20; + constexpr int kPostingListBytes = 24; { // Create an IndexBlock from this newly allocated file block. @@ -80,7 +85,7 @@ TEST_F(IndexBlockTest, CreateFromUninitializedRegionProducesEmptyBlock) { } TEST_F(IndexBlockTest, SizeAccessorsWorkCorrectly) { - constexpr int kPostingListBytes1 = 20; + constexpr int kPostingListBytes1 = 24; // Create an IndexBlock from this newly allocated file block. ICING_ASSERT_OK_AND_ASSIGN(IndexBlock block, @@ -88,13 +93,13 @@ TEST_F(IndexBlockTest, SizeAccessorsWorkCorrectly) { &filesystem_, serializer_.get(), sfd_->get(), /*offset=*/0, kBlockSize, kPostingListBytes1)); EXPECT_THAT(block.posting_list_bytes(), Eq(kPostingListBytes1)); - // There should be (4096 - 12) / 20 = 204 posting lists - // (sizeof(BlockHeader)==12). We can store a PostingListIndex of 203 in only 8 + // There should be (4096 - 12) / 24 = 170 posting lists + // (sizeof(BlockHeader)==12). We can store a PostingListIndex of 170 in only 8 // bits. - EXPECT_THAT(block.max_num_posting_lists(), Eq(204)); + EXPECT_THAT(block.max_num_posting_lists(), Eq(170)); EXPECT_THAT(block.posting_list_index_bits(), Eq(8)); - constexpr int kPostingListBytes2 = 200; + constexpr int kPostingListBytes2 = 240; // Create an IndexBlock from this newly allocated file block. ICING_ASSERT_OK_AND_ASSIGN( @@ -102,22 +107,27 @@ TEST_F(IndexBlockTest, SizeAccessorsWorkCorrectly) { &filesystem_, serializer_.get(), sfd_->get(), /*offset=*/0, kBlockSize, kPostingListBytes2)); EXPECT_THAT(block.posting_list_bytes(), Eq(kPostingListBytes2)); - // There should be (4096 - 12) / 200 = 20 posting lists + // There should be (4096 - 12) / 240 = 17 posting lists // (sizeof(BlockHeader)==12). We can store a PostingListIndex of 19 in only 5 // bits. - EXPECT_THAT(block.max_num_posting_lists(), Eq(20)); + EXPECT_THAT(block.max_num_posting_lists(), Eq(17)); EXPECT_THAT(block.posting_list_index_bits(), Eq(5)); } TEST_F(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) { - constexpr int kPostingListBytes = 2000; + constexpr int kPostingListBytes = 2004; std::vector<Hit> test_hits{ - Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency), - Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency), - Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99), - Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17), - Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency), + Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), }; PostingListIndex allocated_index; { @@ -158,21 +168,31 @@ TEST_F(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) { } TEST_F(IndexBlockTest, IndexBlockMultiplePostingLists) { - constexpr int kPostingListBytes = 2000; + constexpr int kPostingListBytes = 2004; std::vector<Hit> hits_in_posting_list1{ - Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency), - Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency), - Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99), - Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17), - Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency), + Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), }; std::vector<Hit> hits_in_posting_list2{ - Hit(/*section_id=*/12, /*document_id=*/220, /*term_frequency=*/88), - Hit(/*section_id=*/17, /*document_id=*/265, Hit::kDefaultTermFrequency), - Hit(/*section_id=*/0, /*document_id=*/287, /*term_frequency=*/2), - Hit(/*section_id=*/11, /*document_id=*/306, /*term_frequency=*/12), - Hit(/*section_id=*/10, /*document_id=*/306, Hit::kDefaultTermFrequency), + Hit(/*section_id=*/12, /*document_id=*/220, /*term_frequency=*/88, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/17, /*document_id=*/265, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/0, /*document_id=*/287, /*term_frequency=*/2, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/11, /*document_id=*/306, /*term_frequency=*/12, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/10, /*document_id=*/306, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), }; PostingListIndex allocated_index_1; PostingListIndex allocated_index_2; @@ -242,7 +262,7 @@ TEST_F(IndexBlockTest, IndexBlockMultiplePostingLists) { } TEST_F(IndexBlockTest, IndexBlockReallocatingPostingLists) { - constexpr int kPostingListBytes = 2000; + constexpr int kPostingListBytes = 2004; // Create an IndexBlock from this newly allocated file block. ICING_ASSERT_OK_AND_ASSIGN(IndexBlock block, @@ -252,11 +272,16 @@ TEST_F(IndexBlockTest, IndexBlockReallocatingPostingLists) { // Add hits to the first posting list. std::vector<Hit> hits_in_posting_list1{ - Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency), - Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency), - Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99), - Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17), - Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency), + Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), }; ICING_ASSERT_OK_AND_ASSIGN(IndexBlock::PostingListAndBlockInfo alloc_info_1, block.AllocatePostingList()); @@ -270,11 +295,16 @@ TEST_F(IndexBlockTest, IndexBlockReallocatingPostingLists) { // Add hits to the second posting list. std::vector<Hit> hits_in_posting_list2{ - Hit(/*section_id=*/12, /*document_id=*/220, /*term_frequency=*/88), - Hit(/*section_id=*/17, /*document_id=*/265, Hit::kDefaultTermFrequency), - Hit(/*section_id=*/0, /*document_id=*/287, /*term_frequency=*/2), - Hit(/*section_id=*/11, /*document_id=*/306, /*term_frequency=*/12), - Hit(/*section_id=*/10, /*document_id=*/306, Hit::kDefaultTermFrequency), + Hit(/*section_id=*/12, /*document_id=*/220, /*term_frequency=*/88, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/17, /*document_id=*/265, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/0, /*document_id=*/287, /*term_frequency=*/2, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/11, /*document_id=*/306, /*term_frequency=*/12, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/10, /*document_id=*/306, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), }; ICING_ASSERT_OK_AND_ASSIGN(IndexBlock::PostingListAndBlockInfo alloc_info_2, block.AllocatePostingList()); @@ -296,9 +326,12 @@ TEST_F(IndexBlockTest, IndexBlockReallocatingPostingLists) { EXPECT_THAT(block.HasFreePostingLists(), IsOkAndHolds(IsTrue())); std::vector<Hit> hits_in_posting_list3{ - Hit(/*section_id=*/12, /*document_id=*/0, /*term_frequency=*/88), - Hit(/*section_id=*/17, /*document_id=*/1, Hit::kDefaultTermFrequency), - Hit(/*section_id=*/0, /*document_id=*/2, /*term_frequency=*/2), + Hit(/*section_id=*/12, /*document_id=*/0, /*term_frequency=*/88, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/17, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), + Hit(/*section_id=*/0, /*document_id=*/2, /*term_frequency=*/2, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false), }; ICING_ASSERT_OK_AND_ASSIGN(IndexBlock::PostingListAndBlockInfo alloc_info_3, block.AllocatePostingList()); @@ -317,7 +350,7 @@ TEST_F(IndexBlockTest, IndexBlockReallocatingPostingLists) { } TEST_F(IndexBlockTest, IndexBlockNextBlockIndex) { - constexpr int kPostingListBytes = 2000; + constexpr int kPostingListBytes = 2004; constexpr int kSomeBlockIndex = 22; { diff --git a/icing/icing-search-engine_search_test.cc b/icing/icing-search-engine_search_test.cc index 22f42e7..d815f61 100644 --- a/icing/icing-search-engine_search_test.cc +++ b/icing/icing-search-engine_search_test.cc @@ -3716,6 +3716,109 @@ TEST_P(IcingSearchEngineSearchTest, SearchWithPropertyFilters) { EXPECT_THAT(results.results(0).document(), EqualsProto(document_one)); } +TEST_P(IcingSearchEngineSearchTest, SearchWithPropertyFiltersPolymorphism) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("emailAddress") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Artist") + .AddParentType("Person") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("emailAddress") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("company") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + + // Add a person document and an artist document + DocumentProto document_person = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "shopgirl@aol.com") + .Build(); + DocumentProto document_artist = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Artist") + .AddStringProperty("name", "Meg Artist") + .AddStringProperty("emailAddress", "artist@aol.com") + .AddStringProperty("company", "company") + .Build(); + ASSERT_THAT(icing.Put(document_person).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document_artist).status(), ProtoIsOk()); + + // Set a query with property filters of "name" in Person and "emailAddress" + // in Artist. By polymorphism, "name" should also apply to Artist. + auto search_spec = std::make_unique<SearchSpecProto>(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_search_type(GetParam()); + TypePropertyMask* person_type_property_mask = + search_spec->add_type_property_filters(); + person_type_property_mask->set_schema_type("Person"); + person_type_property_mask->add_paths("name"); + TypePropertyMask* artist_type_property_mask = + search_spec->add_type_property_filters(); + artist_type_property_mask->set_schema_type("Artist"); + artist_type_property_mask->add_paths("emailAddress"); + + auto result_spec = std::make_unique<ResultSpecProto>(); + auto scoring_spec = std::make_unique<ScoringSpecProto>(); + *scoring_spec = GetDefaultScoringSpec(); + + // Verify that the property filter for "name" in Person is also applied to + // Artist. + search_spec->set_query("Meg"); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(2)); + EXPECT_THAT(results.results(1).document(), EqualsProto(document_person)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document_artist)); + + // Verify that the property filter for "emailAddress" in Artist is only + // applied to Artist. + search_spec->set_query("aol"); + results = icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document_artist)); + + // Verify that the "company" property is filtered out, since it is not + // specified in the property filter. + search_spec->set_query("company"); + results = icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), IsEmpty()); +} + TEST_P(IcingSearchEngineSearchTest, EmptySearchWithPropertyFilter) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); diff --git a/icing/index/hit/hit.cc b/icing/index/hit/hit.cc index 493e62b..7b7f2f6 100644 --- a/icing/index/hit/hit.cc +++ b/icing/index/hit/hit.cc @@ -14,42 +14,19 @@ #include "icing/index/hit/hit.h" +#include <cstring> +#include <limits> + +#include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/util/bit-util.h" +#include "icing/util/logging.h" namespace icing { namespace lib { namespace { -enum FlagOffset { - // This hit, whether exact or not, came from a prefixed section and will - // need to be backfilled into branching posting lists if/when those are - // created. - kInPrefixSection = 0, - // This hit represents a prefix of a longer term. If exact matches are - // required, then this hit should be ignored. - kPrefixHit = 1, - // Whether or not the hit has a term_frequency other than - // kDefaultTermFrequency. - kHasTermFrequency = 2, - kNumFlags = 3, -}; - -static_assert(kDocumentIdBits + kSectionIdBits + kNumFlags < - sizeof(Hit::Value) * 8, - "Hit::kInvalidValue contains risky value and we should have at " - "least one unused bit to avoid potential bugs. Please follow the " - "process mentioned in hit.h to correct the value of " - "Hit::kInvalidValue and remove this static_assert afterwards."); - -static_assert(kDocumentIdBits + kSectionIdBits + kNumFlags <= - sizeof(Hit::Value) * 8, - "HitOverflow"); -static_assert(kDocumentIdBits == 22, ""); -static_assert(kSectionIdBits == 6, ""); -static_assert(kNumFlags == 3, ""); - inline DocumentId InvertDocumentId(DocumentId document_id) { static_assert(kMaxDocumentId <= (std::numeric_limits<DocumentId>::max() - 1), "(kMaxDocumentId + 1) must not overflow."); @@ -88,10 +65,28 @@ SectionId BasicHit::section_id() const { /*len=*/kSectionIdBits); } +Hit::Hit(Value value, Flags flags, TermFrequency term_frequency) + : flags_(flags), term_frequency_(term_frequency) { + memcpy(value_.data(), &value, sizeof(value)); + if (!CheckFlagsAreConsistent()) { + ICING_VLOG(1) + << "Creating Hit that has inconsistent flag values across its fields: " + << "Hit(value=" << value << ", flags=" << flags + << "term_frequency=" << term_frequency << ")"; + } +} + Hit::Hit(SectionId section_id, DocumentId document_id, Hit::TermFrequency term_frequency, bool is_in_prefix_section, bool is_prefix_hit) : term_frequency_(term_frequency) { + // We compute flags first as the value's has_flags bit depends on the flags_ + // field. + Flags temp_flags = 0; + bit_util::BitfieldSet(term_frequency != kDefaultTermFrequency, + kHasTermFrequency, /*len=*/1, &temp_flags); + flags_ = temp_flags; + // Values are stored so that when sorted, they appear in document_id // descending, section_id ascending, order. Also, all else being // equal, non-prefix hits sort before prefix hits. So inverted @@ -99,30 +94,29 @@ Hit::Hit(SectionId section_id, DocumentId document_id, // (uninverted) section_id. Value temp_value = 0; bit_util::BitfieldSet(InvertDocumentId(document_id), - kSectionIdBits + kNumFlags, kDocumentIdBits, + kSectionIdBits + kNumFlagsInValueField, kDocumentIdBits, + &temp_value); + bit_util::BitfieldSet(section_id, kNumFlagsInValueField, kSectionIdBits, &temp_value); - bit_util::BitfieldSet(section_id, kNumFlags, kSectionIdBits, &temp_value); - bit_util::BitfieldSet(term_frequency != kDefaultTermFrequency, - kHasTermFrequency, /*len=*/1, &temp_value); bit_util::BitfieldSet(is_prefix_hit, kPrefixHit, /*len=*/1, &temp_value); bit_util::BitfieldSet(is_in_prefix_section, kInPrefixSection, /*len=*/1, &temp_value); - value_ = temp_value; + + bool has_flags = flags_ != kNoEnabledFlags; + bit_util::BitfieldSet(has_flags, kHasFlags, /*len=*/1, &temp_value); + + memcpy(value_.data(), &temp_value, sizeof(temp_value)); } DocumentId Hit::document_id() const { DocumentId inverted_document_id = bit_util::BitfieldGet( - value(), kSectionIdBits + kNumFlags, kDocumentIdBits); + value(), kSectionIdBits + kNumFlagsInValueField, kDocumentIdBits); // Undo the document_id inversion. return InvertDocumentId(inverted_document_id); } SectionId Hit::section_id() const { - return bit_util::BitfieldGet(value(), kNumFlags, kSectionIdBits); -} - -bool Hit::has_term_frequency() const { - return bit_util::BitfieldGet(value(), kHasTermFrequency, 1); + return bit_util::BitfieldGet(value(), kNumFlagsInValueField, kSectionIdBits); } bool Hit::is_prefix_hit() const { @@ -133,6 +127,27 @@ bool Hit::is_in_prefix_section() const { return bit_util::BitfieldGet(value(), kInPrefixSection, 1); } +bool Hit::has_flags() const { + return bit_util::BitfieldGet(value(), kHasFlags, 1); +} + +bool Hit::has_term_frequency() const { + return bit_util::BitfieldGet(flags(), kHasTermFrequency, 1); +} + +bool Hit::CheckFlagsAreConsistent() const { + bool has_flags = flags_ != kNoEnabledFlags; + bool has_flags_enabled_in_value = + bit_util::BitfieldGet(value(), kHasFlags, /*len=*/1); + + bool has_term_frequency = term_frequency_ != kDefaultTermFrequency; + bool has_term_frequency_enabled_in_flags = + bit_util::BitfieldGet(flags(), kHasTermFrequency, /*len=*/1); + + return has_flags == has_flags_enabled_in_value && + has_term_frequency == has_term_frequency_enabled_in_flags; +} + Hit Hit::TranslateHit(Hit old_hit, DocumentId new_document_id) { return Hit(old_hit.section_id(), new_document_id, old_hit.term_frequency(), old_hit.is_in_prefix_section(), old_hit.is_prefix_hit()); @@ -140,7 +155,8 @@ Hit Hit::TranslateHit(Hit old_hit, DocumentId new_document_id) { bool Hit::EqualsDocumentIdAndSectionId::operator()(const Hit& hit1, const Hit& hit2) const { - return (hit1.value() >> kNumFlags) == (hit2.value() >> kNumFlags); + return (hit1.value() >> kNumFlagsInValueField) == + (hit2.value() >> kNumFlagsInValueField); } } // namespace lib diff --git a/icing/index/hit/hit.h b/icing/index/hit/hit.h index 111b320..83fff12 100644 --- a/icing/index/hit/hit.h +++ b/icing/index/hit/hit.h @@ -17,6 +17,7 @@ #include <array> #include <cstdint> +#include <cstring> #include <limits> #include "icing/legacy/core/icing-packed-pod.h" @@ -70,6 +71,9 @@ class BasicHit { private: // Value bits layout: 4 unused + 22 document_id + 6 section id. + // + // The Value is guaranteed to be an unsigned integer, but its size and the + // information it stores may change if the Hit's encoding format is changed. Value value_; } __attribute__((packed)); static_assert(sizeof(BasicHit) == 4, ""); @@ -78,51 +82,83 @@ static_assert(sizeof(BasicHit) == 4, ""); // consists of: // - a DocumentId // - a SectionId -// referring to the document and section that the hit corresponds to, as well as -// metadata about the hit: -// - whether the Hit has a TermFrequency other than the default value -// - whether the Hit does not appear exactly in the document, but instead -// represents a term that is a prefix of a term in the document -// - whether the Hit came from a section that has prefix expansion enabled -// and a term frequency for the hit. +// - referring to the document and section that the hit corresponds to +// - Metadata about the hit: +// - whether the Hit does not appear exactly in the document, but instead +// represents a term that is a prefix of a term in the document +// (is_prefix_hit) +// - whether the Hit came from a section that has prefix expansion enabled +// (is_in_prefix_section) +// - whether the Hit has set any bitmask flags (has_flags) +// - bitmasks in flags fields: +// - whether the Hit has a TermFrequency other than the default value +// (has_term_frequency) +// - a term frequency for the hit // // The hit is the most basic unit of the index and, when grouped together by // term, can be used to encode what terms appear in what documents. class Hit { public: - // The datatype used to encode Hit information: the document_id, section_id - // and the has_term_frequency, prefix hit and in prefix section flags. + // The datatype used to encode Hit information: the document_id, section_id, + // and 3 flags: is_prefix_hit, is_hit_in_prefix_section and has_flags flag. + // + // The Value is guaranteed to be an unsigned integer, but its size and the + // information it stores may change if the Hit's encoding format is changed. using Value = uint32_t; // WARNING: Changing this value will invalidate any pre-existing posting lists // on user devices. // - // WARNING: - // - Hit::kInvalidValue should contain inverted kInvalidDocumentId, which is - // b'00...0. However, currently we set it as UINT32_MAX and actually it - // contains b'11...1, which is the inverted document_id 0. - // - It means Hit::kInvalidValue contains valid (document_id, section_id, - // flags), so we potentially cannot distinguish if a Hit is invalid or not. - // The invalidity is an essential feature for posting list since we use it - // to determine the state of the posting list. - // - The reason why it won't break the current posting list is because the - // unused bit(s) are set as 1 for Hit::kInvalidValue and 0 for all valid - // Hits. In other words, the unused bit(s) are actually serving as "invalid - // flag". - // - If we want to exhaust all unused bits in the future, then we have to - // change Hit::kInvalidValue to set the inverted document_id section - // correctly (b'00...0, refer to BasicHit::kInvalidValue as an example). - // - Also this problem is guarded by static_assert in hit.cc. If exhausting - // all unused bits, then the static_assert will detect and fail. We can - // safely remove the static_assert check after following the above process - // to resolve the incorrect Hit::kInvalidValue issue. - static constexpr Value kInvalidValue = std::numeric_limits<Value>::max(); + // kInvalidValue contains: + // - 0 for unused bits. Note that unused bits are always 0 for both valid and + // invalid Hit values. + // - Inverted kInvalidDocumentId + // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has + // already invalidated the value. In fact, we currently use all 2^6 section + // ids and there is no "invalid section id", so it doesn't matter what + // SectionId we set for kInvalidValue. + static constexpr Value kInvalidValue = 0; // Docs are sorted in reverse, and 0 is never used as the inverted // DocumentId (because it is the inverse of kInvalidValue), so it is always // the max in a descending sort. static constexpr Value kMaxDocumentIdSortValue = 0; + enum FlagOffsetsInFlagsField { + // Whether or not the hit has a term_frequency other than + // kDefaultTermFrequency. + kHasTermFrequency = 0, + kNumFlagsInFlagsField = 1, + }; + + enum FlagOffsetsInValueField { + // Whether or not the hit has a flags value other than kNoEnabledFlags (i.e. + // it has flags enabled in the flags field) + kHasFlags = 0, + // This hit, whether exact or not, came from a prefixed section and will + // need to be backfilled into branching posting lists if/when those are + // created. + kInPrefixSection = 1, + // This hit represents a prefix of a longer term. If exact matches are + // required, then this hit should be ignored. + kPrefixHit = 2, + kNumFlagsInValueField = 3, + }; + static_assert(kDocumentIdBits + kSectionIdBits + kNumFlagsInValueField <= + sizeof(Hit::Value) * 8, + "HitOverflow"); + static_assert(kDocumentIdBits == 22, ""); + static_assert(kSectionIdBits == 6, ""); + static_assert(kNumFlagsInValueField == 3, ""); + + // The datatype used to encode additional bit-flags in the Hit. + // This is guaranteed to be an unsigned integer, but its size may change if + // more flags are introduced in the future and require more bits to encode. + using Flags = uint8_t; + static constexpr Flags kNoEnabledFlags = 0; + // The Term Frequency of a Hit. + // This is guaranteed to be an unsigned integer, but its size may change if we + // need to expand the max term-frequency. using TermFrequency = uint8_t; using TermFrequencyArray = std::array<Hit::TermFrequency, kTotalNumSections>; // Max TermFrequency is 255. @@ -131,40 +167,67 @@ class Hit { static constexpr TermFrequency kDefaultTermFrequency = 1; static constexpr TermFrequency kNoTermFrequency = 0; - explicit Hit(Value value = kInvalidValue, - TermFrequency term_frequency = kDefaultTermFrequency) - : value_(value), term_frequency_(term_frequency) {} - Hit(SectionId section_id, DocumentId document_id, - TermFrequency term_frequency, bool is_in_prefix_section = false, - bool is_prefix_hit = false); + explicit Hit(Value value) + : Hit(value, kNoEnabledFlags, kDefaultTermFrequency) {} + explicit Hit(Value value, Flags flags, TermFrequency term_frequency); + explicit Hit(SectionId section_id, DocumentId document_id, + TermFrequency term_frequency, bool is_in_prefix_section, + bool is_prefix_hit); bool is_valid() const { return value() != kInvalidValue; } - Value value() const { return value_; } + + Value value() const { + Value value; + memcpy(&value, value_.data(), sizeof(value)); + return value; + } + DocumentId document_id() const; SectionId section_id() const; + bool is_prefix_hit() const; + bool is_in_prefix_section() const; + // Whether or not the hit has any flags set to true. + bool has_flags() const; + + Flags flags() const { return flags_; } // Whether or not the hit contains a valid term frequency. bool has_term_frequency() const; + TermFrequency term_frequency() const { return term_frequency_; } - bool is_prefix_hit() const; - bool is_in_prefix_section() const; + + // Returns true if the flags values across the Hit's value_, term_frequency_ + // and flags_ fields are consistent. + bool CheckFlagsAreConsistent() const; // Creates a new hit based on old_hit but with new_document_id set. static Hit TranslateHit(Hit old_hit, DocumentId new_document_id); - bool operator<(const Hit& h2) const { return value() < h2.value(); } - bool operator==(const Hit& h2) const { return value() == h2.value(); } + bool operator<(const Hit& h2) const { + if (value() != h2.value()) { + return value() < h2.value(); + } + return flags() < h2.flags(); + } + bool operator==(const Hit& h2) const { + return value() == h2.value() && flags() == h2.flags(); + } struct EqualsDocumentIdAndSectionId { bool operator()(const Hit& hit1, const Hit& hit2) const; }; private: - // Value and TermFrequency must be in this order. - // Value bits layout: 1 unused + 22 document_id + 6 section id + 3 flags. - Value value_; + // Value, Flags and TermFrequency must be in this order. + // Value bits layout: 1 unused + 22 document_id + 6 section_id + 1 + // is_prefix_hit + 1 is_in_prefix_section + 1 has_flags. + std::array<char, sizeof(Value)> value_; + // Flags bits layout: 1 reserved + 6 unused + 1 has_term_frequency. + // The left-most bit is reserved for chaining additional fields in case of + // future hit expansions. + Flags flags_; TermFrequency term_frequency_; -} __attribute__((packed)); -static_assert(sizeof(Hit) == 5, ""); +}; +static_assert(sizeof(Hit) == 6, ""); // TODO(b/138991332) decide how to remove/replace all is_packed_pod assertions. static_assert(icing_is_packed_pod<Hit>::value, "go/icing-ubsan"); diff --git a/icing/index/hit/hit_test.cc b/icing/index/hit/hit_test.cc index 0086d91..1233e00 100644 --- a/icing/index/hit/hit_test.cc +++ b/icing/index/hit/hit_test.cc @@ -14,6 +14,9 @@ #include "icing/index/hit/hit.h" +#include <algorithm> +#include <vector> + #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/schema/section.h" @@ -94,77 +97,127 @@ TEST(BasicHitTest, Comparison) { } TEST(HitTest, HasTermFrequencyFlag) { - Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency); + Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); EXPECT_THAT(h1.has_term_frequency(), IsFalse()); EXPECT_THAT(h1.term_frequency(), Eq(Hit::kDefaultTermFrequency)); - Hit h2(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency); + Hit h2(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); EXPECT_THAT(h2.has_term_frequency(), IsTrue()); EXPECT_THAT(h2.term_frequency(), Eq(kSomeTermFrequency)); } TEST(HitTest, IsPrefixHitFlag) { - Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency); + Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); EXPECT_THAT(h1.is_prefix_hit(), IsFalse()); Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); EXPECT_THAT(h2.is_prefix_hit(), IsFalse()); Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false, /*is_prefix_hit=*/true); EXPECT_THAT(h3.is_prefix_hit(), IsTrue()); + + Hit h4(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency, + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); + EXPECT_THAT(h4.is_prefix_hit(), IsFalse()); } TEST(HitTest, IsInPrefixSectionFlag) { - Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency); + Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); EXPECT_THAT(h1.is_in_prefix_section(), IsFalse()); Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); EXPECT_THAT(h2.is_in_prefix_section(), IsFalse()); Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); EXPECT_THAT(h3.is_in_prefix_section(), IsTrue()); } +TEST(HitTest, HasFlags) { + Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + EXPECT_THAT(h1.has_flags(), IsFalse()); + + Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/true); + EXPECT_THAT(h2.has_flags(), IsFalse()); + + Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); + EXPECT_THAT(h3.has_flags(), IsFalse()); + + Hit h4(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + EXPECT_THAT(h4.has_flags(), IsTrue()); + + Hit h5(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency, + /*is_prefix_hit=*/true, /*is_in_prefix_section=*/true); + EXPECT_THAT(h5.has_flags(), IsTrue()); + + Hit h6(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency, + /*is_prefix_hit=*/false, /*is_in_prefix_section=*/true); + EXPECT_THAT(h6.has_flags(), IsTrue()); + + Hit h7(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency, + /*is_prefix_hit=*/true, /*is_in_prefix_section=*/false); + EXPECT_THAT(h7.has_flags(), IsTrue()); +} + TEST(HitTest, Accessors) { - Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency); + Hit h1(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/true); EXPECT_THAT(h1.document_id(), Eq(kSomeDocumentId)); EXPECT_THAT(h1.section_id(), Eq(kSomeSectionid)); + EXPECT_THAT(h1.term_frequency(), Eq(kSomeTermFrequency)); + EXPECT_THAT(h1.is_in_prefix_section(), IsFalse()); + EXPECT_THAT(h1.is_prefix_hit(), IsTrue()); } TEST(HitTest, Valid) { - Hit def; + Hit def(Hit::kInvalidValue); EXPECT_THAT(def.is_valid(), IsFalse()); - - Hit explicit_invalid(Hit::kInvalidValue); + Hit explicit_invalid(Hit::kInvalidValue, Hit::kNoEnabledFlags, + Hit::kDefaultTermFrequency); EXPECT_THAT(explicit_invalid.is_valid(), IsFalse()); static constexpr Hit::Value kSomeValue = 65372; - Hit explicit_valid(kSomeValue); + Hit explicit_valid(kSomeValue, Hit::kNoEnabledFlags, + Hit::kDefaultTermFrequency); EXPECT_THAT(explicit_valid.is_valid(), IsTrue()); - Hit maximum_document_id_hit(kSomeSectionid, kMaxDocumentId, - kSomeTermFrequency); + Hit maximum_document_id_hit( + kSomeSectionid, kMaxDocumentId, kSomeTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); EXPECT_THAT(maximum_document_id_hit.is_valid(), IsTrue()); - Hit maximum_section_id_hit(kMaxSectionId, kSomeDocumentId, - kSomeTermFrequency); + Hit maximum_section_id_hit(kMaxSectionId, kSomeDocumentId, kSomeTermFrequency, + /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); EXPECT_THAT(maximum_section_id_hit.is_valid(), IsTrue()); - Hit minimum_document_id_hit(kSomeSectionid, 0, kSomeTermFrequency); + Hit minimum_document_id_hit(kSomeSectionid, 0, kSomeTermFrequency, + /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); EXPECT_THAT(minimum_document_id_hit.is_valid(), IsTrue()); - Hit minimum_section_id_hit(0, kSomeDocumentId, kSomeTermFrequency); + Hit minimum_section_id_hit(0, kSomeDocumentId, kSomeTermFrequency, + /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); EXPECT_THAT(minimum_section_id_hit.is_valid(), IsTrue()); - // We use Hit with value Hit::kMaxDocumentIdSortValue for std::lower_bound in - // the lite index. Verify that the value of the smallest valid Hit (which + // We use Hit with value Hit::kMaxDocumentIdSortValue for std::lower_bound + // in the lite index. Verify that the value of the smallest valid Hit (which // contains kMinSectionId, kMaxDocumentId and 3 flags = false) is >= // Hit::kMaxDocumentIdSortValue. - Hit smallest_hit(kMinSectionId, kMaxDocumentId, Hit::kDefaultTermFrequency); + Hit smallest_hit(kMinSectionId, kMaxDocumentId, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ASSERT_THAT(smallest_hit.is_valid(), IsTrue()); ASSERT_THAT(smallest_hit.has_term_frequency(), IsFalse()); ASSERT_THAT(smallest_hit.is_prefix_hit(), IsFalse()); @@ -173,39 +226,78 @@ TEST(HitTest, Valid) { } TEST(HitTest, Comparison) { - Hit hit(1, 243, Hit::kDefaultTermFrequency); + Hit hit(/*section_id=*/1, /*document_id=*/243, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); // DocumentIds are sorted in ascending order. So a hit with a lower // document_id should be considered greater than one with a higher // document_id. - Hit higher_document_id_hit(1, 2409, Hit::kDefaultTermFrequency); - Hit higher_section_id_hit(15, 243, Hit::kDefaultTermFrequency); + Hit higher_document_id_hit( + /*section_id=*/1, /*document_id=*/2409, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + Hit higher_section_id_hit(/*section_id=*/15, /*document_id=*/243, + Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); // Whether or not a term frequency was set is considered, but the term // frequency itself is not. - Hit term_frequency_hit(1, 243, 12); - Hit prefix_hit(1, 243, Hit::kDefaultTermFrequency, + Hit term_frequency_hit(/*section_id=*/1, 243, /*term_frequency=*/12, + /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); + Hit prefix_hit(/*section_id=*/1, 243, Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false, /*is_prefix_hit=*/true); - Hit hit_in_prefix_section(1, 243, Hit::kDefaultTermFrequency, + Hit hit_in_prefix_section(/*section_id=*/1, 243, Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); + Hit hit_with_all_flags_enabled(/*section_id=*/1, 243, 56, + /*is_in_prefix_section=*/true, + /*is_prefix_hit=*/true); std::vector<Hit> hits{hit, higher_document_id_hit, higher_section_id_hit, term_frequency_hit, prefix_hit, - hit_in_prefix_section}; + hit_in_prefix_section, + hit_with_all_flags_enabled}; std::sort(hits.begin(), hits.end()); - EXPECT_THAT( - hits, ElementsAre(higher_document_id_hit, hit, hit_in_prefix_section, - prefix_hit, term_frequency_hit, higher_section_id_hit)); + EXPECT_THAT(hits, + ElementsAre(higher_document_id_hit, hit, term_frequency_hit, + hit_in_prefix_section, prefix_hit, + hit_with_all_flags_enabled, higher_section_id_hit)); - Hit higher_term_frequency_hit(1, 243, 108); + Hit higher_term_frequency_hit(/*section_id=*/1, 243, /*term_frequency=*/108, + /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); // The term frequency value is not considered when comparing hits. EXPECT_THAT(term_frequency_hit, Not(Lt(higher_term_frequency_hit))); EXPECT_THAT(higher_term_frequency_hit, Not(Lt(term_frequency_hit))); } +TEST(HitTest, CheckFlagsAreConsistent) { + Hit::Value value_without_flags = 1 << 30; + Hit::Value value_with_flags = value_without_flags + 1; + Hit::Flags flags_with_term_freq = 1; + + Hit consistent_hit_no_flags(value_without_flags, Hit::kNoEnabledFlags, + Hit::kDefaultTermFrequency); + Hit consistent_hit_with_term_frequency(value_with_flags, flags_with_term_freq, + kSomeTermFrequency); + EXPECT_THAT(consistent_hit_no_flags.CheckFlagsAreConsistent(), IsTrue()); + EXPECT_THAT(consistent_hit_with_term_frequency.CheckFlagsAreConsistent(), + IsTrue()); + + Hit inconsistent_hit_1(value_with_flags, Hit::kNoEnabledFlags, + Hit::kDefaultTermFrequency); + Hit inconsistent_hit_2(value_with_flags, Hit::kNoEnabledFlags, + kSomeTermFrequency); + Hit inconsistent_hit_3(value_with_flags, flags_with_term_freq, + Hit::kDefaultTermFrequency); + EXPECT_THAT(inconsistent_hit_1.CheckFlagsAreConsistent(), IsFalse()); + EXPECT_THAT(inconsistent_hit_2.CheckFlagsAreConsistent(), IsFalse()); + EXPECT_THAT(inconsistent_hit_3.CheckFlagsAreConsistent(), IsFalse()); +} + } // namespace } // namespace lib diff --git a/icing/index/index.cc b/icing/index/index.cc index 31dcc7e..f917aa6 100644 --- a/icing/index/index.cc +++ b/icing/index/index.cc @@ -65,10 +65,9 @@ libtextclassifier3::StatusOr<LiteIndex::Options> CreateLiteIndexOptions( "Requested hit buffer size %d is too large.", options.index_merge_size)); } - return LiteIndex::Options(options.base_dir + "/idx/lite.", - options.index_merge_size, - options.lite_index_sort_at_indexing, - options.lite_index_sort_size); + return LiteIndex::Options( + options.base_dir + "/idx/lite.", options.index_merge_size, + options.lite_index_sort_at_indexing, options.lite_index_sort_size); } std::string MakeMainIndexFilepath(const std::string& base_dir) { @@ -345,7 +344,8 @@ libtextclassifier3::Status Index::Editor::BufferTerm(const char* term) { libtextclassifier3::Status Index::Editor::IndexAllBufferedTerms() { for (auto itr = seen_tokens_.begin(); itr != seen_tokens_.end(); itr++) { Hit hit(section_id_, document_id_, /*term_frequency=*/itr->second, - term_match_type_ == TermMatchType::PREFIX); + /*is_in_prefix_section=*/term_match_type_ == TermMatchType::PREFIX, + /*is_prefix_hit=*/false); ICING_ASSIGN_OR_RETURN( uint32_t term_id, term_id_codec_->EncodeTvi(itr->first, TviType::LITE)); ICING_RETURN_IF_ERROR(lite_index_->AddHit(term_id, hit)); diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc index ab550e1..50b65ad 100644 --- a/icing/index/index_test.cc +++ b/icing/index/index_test.cc @@ -2736,7 +2736,8 @@ TEST_F(IndexTest, IndexStorageInfoProto) { EXPECT_THAT(storage_info.main_index_lexicon_size(), Ge(0)); EXPECT_THAT(storage_info.main_index_storage_size(), Ge(0)); EXPECT_THAT(storage_info.main_index_block_size(), Ge(0)); - // There should be 1 block for the header and 1 block for two posting lists. + // There should be 1 block for the header and 1 block for three posting lists + // ("fo", "foo", "foul"). EXPECT_THAT(storage_info.num_blocks(), Eq(2)); EXPECT_THAT(storage_info.min_free_fraction(), Ge(0)); } diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc index 35dc0b9..682b752 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc @@ -113,12 +113,12 @@ DocHitInfoIteratorSectionRestrict::ApplyRestrictions( const DocumentStore* document_store, const SchemaStore* schema_store, const SearchSpecProto& search_spec, int64_t current_time_ms) { std::unordered_map<std::string, std::set<std::string>> type_property_filters; - // TODO(b/294274922): Add support for polymorphism in type property filters. - for (const TypePropertyMask& type_property_mask : - search_spec.type_property_filters()) { - type_property_filters[type_property_mask.schema_type()] = - std::set<std::string>(type_property_mask.paths().begin(), - type_property_mask.paths().end()); + for (const SchemaStore::ExpandedTypePropertyMask& type_property_mask : + schema_store->ExpandTypePropertyMasks( + search_spec.type_property_filters())) { + type_property_filters[type_property_mask.schema_type] = + std::set<std::string>(type_property_mask.paths.begin(), + type_property_mask.paths.end()); } auto data = std::make_unique<SectionRestrictData>( document_store, schema_store, current_time_ms, type_property_filters); diff --git a/icing/index/lite/lite-index-header.h b/icing/index/lite/lite-index-header.h index 6208cb6..741d173 100644 --- a/icing/index/lite/lite-index-header.h +++ b/icing/index/lite/lite-index-header.h @@ -53,7 +53,7 @@ class LiteIndex_Header { class LiteIndex_HeaderImpl : public LiteIndex_Header { public: struct HeaderData { - static const uint32_t kMagic = 0x01c61418; + static const uint32_t kMagic = 0xC2EAD682; uint32_t lite_index_crc; uint32_t magic; diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc index 4c5cb37..3aed7e4 100644 --- a/icing/index/lite/lite-index.cc +++ b/icing/index/lite/lite-index.cc @@ -74,7 +74,7 @@ size_t header_size() { return sizeof(LiteIndex_HeaderImpl::HeaderData); } } // namespace const TermIdHitPair::Value TermIdHitPair::kInvalidValue = - TermIdHitPair(0, Hit()).value(); + TermIdHitPair(0, Hit(Hit::kInvalidValue)).value(); libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create( const LiteIndex::Options& options, const IcingFilesystem* filesystem) { @@ -517,7 +517,8 @@ int LiteIndex::FetchHits( // Do binary search over the sorted section and repeat the above steps. TermIdHitPair target_term_id_hit_pair( - term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency)); + term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kNoEnabledFlags, + Hit::kDefaultTermFrequency)); for (const TermIdHitPair* ptr = std::lower_bound( array, array + header_->searchable_end(), target_term_id_hit_pair); ptr < array + header_->searchable_end(); ++ptr) { diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc index 7ff006e..f8ea94a 100644 --- a/icing/index/lite/lite-index_test.cc +++ b/icing/index/lite/lite-index_test.cc @@ -27,7 +27,6 @@ #include "icing/index/hit/hit.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/lite/doc-hit-info-iterator-term-lite.h" -#include "icing/index/lite/lite-index-header.h" #include "icing/index/lite/term-id-hit-pair.h" #include "icing/index/term-id-codec.h" #include "icing/legacy/index/icing-dynamic-trie.h" @@ -73,15 +72,25 @@ class LiteIndexTest : public testing::Test { constexpr NamespaceId kNamespace0 = 0; +TEST_F(LiteIndexTest, TermIdHitPairInvalidValue) { + TermIdHitPair invalidTermHitPair(TermIdHitPair::kInvalidValue); + + EXPECT_THAT(invalidTermHitPair.term_id(), Eq(0)); + EXPECT_THAT(invalidTermHitPair.hit().value(), Eq(Hit::kInvalidValue)); + EXPECT_THAT(invalidTermHitPair.hit().term_frequency(), + Eq(Hit::kDefaultTermFrequency)); +} + TEST_F(LiteIndexTest, LiteIndexFetchHits_sortAtQuerying_unsortedHitsBelowSortThreshold) { // Set up LiteIndex and TermIdCodec std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; - // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. - LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024, - /*hit_buffer_sort_at_indexing=*/false, - /*hit_buffer_sort_threshold_bytes=*/64); + // Unsorted tail can contain a max of 8 TermIdHitPairs. + LiteIndex::Options options( + lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/false, + /*hit_buffer_sort_threshold_bytes=*/sizeof(TermIdHitPair) * 8); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, LiteIndex::Create(options, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( @@ -97,9 +106,9 @@ TEST_F(LiteIndexTest, ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); Hit foo_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit foo_hit1(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit0)); ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit1)); @@ -109,9 +118,9 @@ TEST_F(LiteIndexTest, ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id, term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE)); Hit bar_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit bar_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit0)); ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit1)); @@ -154,15 +163,16 @@ TEST_F(LiteIndexTest, LiteIndexFetchHits_sortAtIndexing_unsortedHitsBelowSortThreshold) { // Set up LiteIndex and TermIdCodec std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; - // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + // The unsorted tail can contain a max of 8 TermIdHitPairs. // However note that in these tests we're unable to sort hits after // indexing, as sorting performed by the string-section-indexing-handler // after indexing all hits in an entire document, rather than after each // AddHits() operation. - LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024, - /*hit_buffer_sort_at_indexing=*/true, - /*hit_buffer_sort_threshold_bytes=*/64); + LiteIndex::Options options( + lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/sizeof(TermIdHitPair) * 8); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, LiteIndex::Create(options, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( @@ -178,9 +188,9 @@ TEST_F(LiteIndexTest, ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); Hit foo_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit foo_hit1(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit0)); ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit1)); @@ -190,9 +200,9 @@ TEST_F(LiteIndexTest, ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id, term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE)); Hit bar_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit bar_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit0)); ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit1)); @@ -237,15 +247,16 @@ TEST_F( LiteIndexFetchHits_sortAtQuerying_unsortedHitsExceedingSortAtIndexThreshold) { // Set up LiteIndex and TermIdCodec std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; - // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + // The unsorted tail can contain a max of 8 TermIdHitPairs. // However note that in these tests we're unable to sort hits after // indexing, as sorting performed by the string-section-indexing-handler // after indexing all hits in an entire document, rather than after each // AddHits() operation. - LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024, - /*hit_buffer_sort_at_indexing=*/false, - /*hit_buffer_sort_threshold_bytes=*/64); + LiteIndex::Options options( + lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/false, + /*hit_buffer_sort_threshold_bytes=*/sizeof(TermIdHitPair) * 8); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, LiteIndex::Create(options, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( @@ -257,36 +268,36 @@ TEST_F( // Create 4 hits for docs 0-2, and 2 hits for doc 3 -- 14 in total // Doc 0 Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc0_hit1(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc0_hit2(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc0_hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); // Doc 1 Hit doc1_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc1_hit1(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc1_hit2(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc1_hit3(/*section_id=*/2, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); // Doc 2 Hit doc2_hit0(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc2_hit1(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc2_hit2(/*section_id=*/1, /*document_id=*/2, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc2_hit3(/*section_id=*/2, /*document_id=*/2, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); // Doc 3 Hit doc3_hit0(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc3_hit1(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); // Create terms // Foo @@ -398,11 +409,12 @@ TEST_F( LiteIndexFetchHits_sortAtIndexing_unsortedHitsExceedingSortAtIndexThreshold) { // Set up LiteIndex and TermIdCodec std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; - // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. - LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024, - /*hit_buffer_sort_at_indexing=*/true, - /*hit_buffer_sort_threshold_bytes=*/64); + // The unsorted tail can contain a max of 8 TermIdHitPairs. + LiteIndex::Options options( + lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/sizeof(TermIdHitPair) * 8); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, LiteIndex::Create(options, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( @@ -414,49 +426,49 @@ TEST_F( // Create 4 hits for docs 0-2, and 2 hits for doc 3 -- 14 in total // Doc 0 Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc0_hit1(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc0_hit2(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc0_hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); // Doc 1 Hit doc1_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc1_hit1(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc1_hit2(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc1_hit3(/*section_id=*/2, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); // Doc 2 Hit doc2_hit0(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc2_hit1(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc2_hit2(/*section_id=*/1, /*document_id=*/2, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc2_hit3(/*section_id=*/2, /*document_id=*/2, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); // Doc 3 Hit doc3_hit0(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc3_hit1(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc3_hit2(/*section_id=*/1, /*document_id=*/3, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc3_hit3(/*section_id=*/2, /*document_id=*/3, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); // Doc 4 Hit doc4_hit0(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc4_hit1(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc4_hit2(/*section_id=*/1, /*document_id=*/4, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc4_hit3(/*section_id=*/2, /*document_id=*/4, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); // Create terms // Foo @@ -590,11 +602,12 @@ TEST_F( TEST_F(LiteIndexTest, LiteIndexIterator) { // Set up LiteIndex and TermIdCodec std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; - // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. - LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024, - /*hit_buffer_sort_at_indexing=*/true, - /*hit_buffer_sort_threshold_bytes=*/64); + // The unsorted tail can contain a max of 8 TermIdHitPairs. + LiteIndex::Options options( + lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/sizeof(TermIdHitPair) * 8); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, LiteIndex::Create(options, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( @@ -610,9 +623,9 @@ TEST_F(LiteIndexTest, LiteIndexIterator) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/3, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc0_hit1(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/5, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); SectionIdMask doc0_section_id_mask = 0b11; std::unordered_map<SectionId, Hit::TermFrequency> expected_section_ids_tf_map0 = {{0, 3}, {1, 5}}; @@ -620,9 +633,9 @@ TEST_F(LiteIndexTest, LiteIndexIterator) { ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit1)); Hit doc1_hit1(/*section_id=*/1, /*document_id=*/1, /*term_frequency=*/7, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc1_hit2(/*section_id=*/2, /*document_id=*/1, /*term_frequency=*/11, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); SectionIdMask doc1_section_id_mask = 0b110; std::unordered_map<SectionId, Hit::TermFrequency> expected_section_ids_tf_map1 = {{1, 7}, {2, 11}}; @@ -658,11 +671,12 @@ TEST_F(LiteIndexTest, LiteIndexIterator) { TEST_F(LiteIndexTest, LiteIndexIterator_sortAtIndexingDisabled) { // Set up LiteIndex and TermIdCodec std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; - // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. - LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024, - /*hit_buffer_sort_at_indexing=*/false, - /*hit_buffer_sort_threshold_bytes=*/64); + // The unsorted tail can contain a max of 8 TermIdHitPairs. + LiteIndex::Options options( + lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/false, + /*hit_buffer_sort_threshold_bytes=*/sizeof(TermIdHitPair) * 8); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, LiteIndex::Create(options, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( @@ -678,9 +692,9 @@ TEST_F(LiteIndexTest, LiteIndexIterator_sortAtIndexingDisabled) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/3, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc0_hit1(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/5, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); SectionIdMask doc0_section_id_mask = 0b11; std::unordered_map<SectionId, Hit::TermFrequency> expected_section_ids_tf_map0 = {{0, 3}, {1, 5}}; @@ -688,9 +702,9 @@ TEST_F(LiteIndexTest, LiteIndexIterator_sortAtIndexingDisabled) { ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit1)); Hit doc1_hit1(/*section_id=*/1, /*document_id=*/1, /*term_frequency=*/7, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit doc1_hit2(/*section_id=*/2, /*document_id=*/1, /*term_frequency=*/11, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); SectionIdMask doc1_section_id_mask = 0b110; std::unordered_map<SectionId, Hit::TermFrequency> expected_section_ids_tf_map1 = {{1, 7}, {2, 11}}; @@ -726,11 +740,12 @@ TEST_F(LiteIndexTest, LiteIndexIterator_sortAtIndexingDisabled) { TEST_F(LiteIndexTest, LiteIndexHitBufferSize) { // Set up LiteIndex and TermIdCodec std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; - // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. - LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024, - /*hit_buffer_sort_at_indexing=*/true, - /*hit_buffer_sort_threshold_bytes=*/64); + // The unsorted tail can contain a max of 8 TermIdHitPairs. + LiteIndex::Options options( + lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/sizeof(TermIdHitPair) * 8); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, LiteIndex::Create(options, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( @@ -746,9 +761,9 @@ TEST_F(LiteIndexTest, LiteIndexHitBufferSize) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/3, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit hit1(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/5, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, hit0)); ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, hit1)); diff --git a/icing/index/lite/lite-index_thread-safety_test.cc b/icing/index/lite/lite-index_thread-safety_test.cc index 53aa6cd..a73ca28 100644 --- a/icing/index/lite/lite-index_thread-safety_test.cc +++ b/icing/index/lite/lite-index_thread-safety_test.cc @@ -13,15 +13,25 @@ // limitations under the License. #include <array> +#include <cstdint> +#include <memory> #include <string> +#include <string_view> #include <thread> #include <vector> #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/file/filesystem.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/hit/hit.h" #include "icing/index/lite/lite-index.h" #include "icing/index/term-id-codec.h" +#include "icing/legacy/index/icing-dynamic-trie.h" +#include "icing/legacy/index/icing-filesystem.h" #include "icing/schema/section.h" +#include "icing/store/document-id.h" +#include "icing/store/namespace-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" @@ -109,9 +119,11 @@ TEST_F(LiteIndexThreadSafetyTest, SimultaneousFetchHits_singleTerm) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0, - Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); + Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); Hit doc_hit1(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId1, - Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); + Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0)); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1)); @@ -155,9 +167,11 @@ TEST_F(LiteIndexThreadSafetyTest, SimultaneousFetchHits_multipleTerms) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0, - Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); + Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); Hit doc_hit1(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId1, - Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); + Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit0)); ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit1)); } @@ -208,7 +222,8 @@ TEST_F(LiteIndexThreadSafetyTest, SimultaneousAddHitAndFetchHits_singleTerm) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0, - Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); + Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0)); // Create kNumThreads threads. Every even-numbered thread calls FetchHits and @@ -228,7 +243,8 @@ TEST_F(LiteIndexThreadSafetyTest, SimultaneousAddHitAndFetchHits_singleTerm) { } else { // Odd-numbered thread calls AddHit. Hit doc_hit(/*section_id=*/thread_id / 2, /*document_id=*/kDocumentId0, - Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); + Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit)); } }; @@ -273,7 +289,8 @@ TEST_F(LiteIndexThreadSafetyTest, ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0, - Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); + Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0)); // Create kNumThreads threads. Every even-numbered thread calls FetchHits and @@ -302,7 +319,8 @@ TEST_F(LiteIndexThreadSafetyTest, // Odd-numbered thread calls AddHit. // AddHit to section 0 of a new doc. Hit doc_hit(/*section_id=*/kSectionId0, /*document_id=*/thread_id / 2, - Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); + Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit)); } }; @@ -335,9 +353,11 @@ TEST_F(LiteIndexThreadSafetyTest, ManyAddHitAndOneFetchHits_multipleTerms) { ICING_ASSERT_OK_AND_ASSIGN(uint32_t term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0, - Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); + Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); Hit doc_hit1(/*section_id=*/kSectionId1, /*document_id=*/kDocumentId0, - Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); + Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit0)); ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit1)); } @@ -370,7 +390,7 @@ TEST_F(LiteIndexThreadSafetyTest, ManyAddHitAndOneFetchHits_multipleTerms) { // AddHit to section (thread_id % 5 + 1) of doc 0. Hit doc_hit(/*section_id=*/thread_id % 5 + 1, /*document_id=*/kDocumentId0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit)); } }; diff --git a/icing/index/lite/term-id-hit-pair.h b/icing/index/lite/term-id-hit-pair.h index 82bd010..760aa76 100644 --- a/icing/index/lite/term-id-hit-pair.h +++ b/icing/index/lite/term-id-hit-pair.h @@ -15,25 +15,22 @@ #ifndef ICING_INDEX_TERM_ID_HIT_PAIR_H_ #define ICING_INDEX_TERM_ID_HIT_PAIR_H_ +#include <array> #include <cstdint> -#include <limits> -#include <memory> -#include <string> -#include <vector> #include "icing/index/hit/hit.h" -#include "icing/util/bit-util.h" namespace icing { namespace lib { class TermIdHitPair { public: - // Layout bits: 24 termid + 32 hit value + 8 hit term frequency. - using Value = uint64_t; + // Layout bits: 24 termid + 32 hit value + 8 hit flags + 8 hit term frequency. + using Value = std::array<uint8_t, 9>; static constexpr int kTermIdBits = 24; static constexpr int kHitValueBits = sizeof(Hit::Value) * 8; + static constexpr int kHitFlagsBits = sizeof(Hit::Flags) * 8; static constexpr int kHitTermFrequencyBits = sizeof(Hit::TermFrequency) * 8; static const Value kInvalidValue; @@ -41,33 +38,48 @@ class TermIdHitPair { explicit TermIdHitPair(Value v = kInvalidValue) : value_(v) {} TermIdHitPair(uint32_t term_id, const Hit& hit) { - static_assert(kTermIdBits + kHitValueBits + kHitTermFrequencyBits <= - sizeof(Value) * 8, - "TermIdHitPairTooBig"); - - value_ = 0; - // Term id goes into the most significant bits because it takes - // precedent in sorts. - bit_util::BitfieldSet(term_id, kHitValueBits + kHitTermFrequencyBits, - kTermIdBits, &value_); - bit_util::BitfieldSet(hit.value(), kHitTermFrequencyBits, kHitValueBits, - &value_); - bit_util::BitfieldSet(hit.term_frequency(), 0, kHitTermFrequencyBits, - &value_); + static_assert( + kTermIdBits + kHitValueBits + kHitFlagsBits + kHitTermFrequencyBits <= + sizeof(Value) * 8, + "TermIdHitPairTooBig"); + + // Set termId. Term id takes 3 bytes and goes into value_[0:2] (most + // significant bits) because it takes precedent in sorts. + value_[0] = static_cast<uint8_t>((term_id >> 16) & 0xff); + value_[1] = static_cast<uint8_t>((term_id >> 8) & 0xff); + value_[2] = static_cast<uint8_t>((term_id >> 0) & 0xff); + + // Set hit value. Hit value takes 4 bytes and goes into value_[3:6] + value_[3] = static_cast<uint8_t>((hit.value() >> 24) & 0xff); + value_[4] = static_cast<uint8_t>((hit.value() >> 16) & 0xff); + value_[5] = static_cast<uint8_t>((hit.value() >> 8) & 0xff); + value_[6] = static_cast<uint8_t>((hit.value() >> 0) & 0xff); + + // Set flags in value_[7]. + value_[7] = hit.flags(); + + // Set term-frequency in value_[8] + value_[8] = hit.term_frequency(); } uint32_t term_id() const { - return bit_util::BitfieldGet(value_, kHitValueBits + kHitTermFrequencyBits, - kTermIdBits); + return (static_cast<uint32_t>(value_[0]) << 16) | + (static_cast<uint32_t>(value_[1]) << 8) | + (static_cast<uint32_t>(value_[2]) << 0); } Hit hit() const { - return Hit( - bit_util::BitfieldGet(value_, kHitTermFrequencyBits, kHitValueBits), - bit_util::BitfieldGet(value_, 0, kHitTermFrequencyBits)); + Hit::Value hit_value = (static_cast<uint32_t>(value_[3]) << 24) | + (static_cast<uint32_t>(value_[4]) << 16) | + (static_cast<uint32_t>(value_[5]) << 8) | + (static_cast<uint32_t>(value_[6]) << 0); + Hit::Flags hit_flags = value_[7]; + Hit::TermFrequency term_frequency = value_[8]; + + return Hit(hit_value, hit_flags, term_frequency); } - Value value() const { return value_; } + const Value& value() const { return value_; } bool operator==(const TermIdHitPair& rhs) const { return value_ == rhs.value_; diff --git a/icing/index/lite/term-id-hit-pair_test.cc b/icing/index/lite/term-id-hit-pair_test.cc new file mode 100644 index 0000000..28855b4 --- /dev/null +++ b/icing/index/lite/term-id-hit-pair_test.cc @@ -0,0 +1,95 @@ +// Copyright (C) 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/lite/term-id-hit-pair.h" + +#include <algorithm> +#include <cstdint> +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/index/hit/hit.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" + +namespace icing { +namespace lib { +namespace { + +using ::testing::ElementsAre; +using ::testing::Eq; + +static constexpr DocumentId kSomeDocumentId = 24; +static constexpr SectionId kSomeSectionid = 5; +static constexpr Hit::TermFrequency kSomeTermFrequency = 57; +static constexpr uint32_t kSomeTermId = 129; +static constexpr uint32_t kSomeSmallerTermId = 1; +static constexpr uint32_t kSomeLargerTermId = 0b101010101111111100000001; + +TEST(TermIdHitPairTest, Accessors) { + Hit hit1(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + Hit hit2(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency, + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true); + Hit hit3(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + Hit invalid_hit(Hit::kInvalidValue); + + TermIdHitPair term_id_hit_pair_1(kSomeTermId, hit1); + EXPECT_THAT(term_id_hit_pair_1.term_id(), Eq(kSomeTermId)); + EXPECT_THAT(term_id_hit_pair_1.hit(), Eq(hit1)); + + TermIdHitPair term_id_hit_pair_2(kSomeLargerTermId, hit2); + EXPECT_THAT(term_id_hit_pair_2.term_id(), Eq(kSomeLargerTermId)); + EXPECT_THAT(term_id_hit_pair_2.hit(), Eq(hit2)); + + TermIdHitPair term_id_hit_pair_3(kSomeTermId, invalid_hit); + EXPECT_THAT(term_id_hit_pair_3.term_id(), Eq(kSomeTermId)); + EXPECT_THAT(term_id_hit_pair_3.hit(), Eq(invalid_hit)); +} + +TEST(TermIdHitPairTest, Comparison) { + Hit hit(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + Hit smaller_hit(/*section_id=*/1, /*document_id=*/100, /*term_frequency=*/1, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + + TermIdHitPair term_id_hit_pair(kSomeTermId, hit); + TermIdHitPair term_id_hit_pair_equal(kSomeTermId, hit); + TermIdHitPair term_id_hit_pair_smaller_hit(kSomeTermId, smaller_hit); + TermIdHitPair term_id_hit_pair_smaller_term_id(kSomeSmallerTermId, hit); + TermIdHitPair term_id_hit_pair_larger_term_id(kSomeLargerTermId, hit); + TermIdHitPair term_id_hit_pair_smaller_term_id_and_hit(kSomeSmallerTermId, + smaller_hit); + + std::vector<TermIdHitPair> term_id_hit_pairs{ + term_id_hit_pair, + term_id_hit_pair_equal, + term_id_hit_pair_smaller_hit, + term_id_hit_pair_smaller_term_id, + term_id_hit_pair_larger_term_id, + term_id_hit_pair_smaller_term_id_and_hit}; + std::sort(term_id_hit_pairs.begin(), term_id_hit_pairs.end()); + EXPECT_THAT(term_id_hit_pairs, + ElementsAre(term_id_hit_pair_smaller_term_id_and_hit, + term_id_hit_pair_smaller_term_id, + term_id_hit_pair_smaller_hit, term_id_hit_pair_equal, + term_id_hit_pair, term_id_hit_pair_larger_term_id)); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/index/main/main-index-merger.cc b/icing/index/main/main-index-merger.cc index c26a6d7..cc130c2 100644 --- a/icing/index/main/main-index-merger.cc +++ b/icing/index/main/main-index-merger.cc @@ -14,14 +14,20 @@ #include "icing/index/main/main-index-merger.h" +#include <algorithm> #include <cstdint> #include <cstring> -#include <memory> #include <unordered_map> +#include <utility> +#include <vector> +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" -#include "icing/file/posting_list/index-block.h" +#include "icing/file/posting_list/posting-list-common.h" +#include "icing/index/hit/hit.h" +#include "icing/index/lite/lite-index.h" #include "icing/index/lite/term-id-hit-pair.h" +#include "icing/index/main/main-index.h" #include "icing/index/term-id-codec.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/util/logging.h" diff --git a/icing/index/main/main-index-merger_test.cc b/icing/index/main/main-index-merger_test.cc index 37e14fc..333e338 100644 --- a/icing/index/main/main-index-merger_test.cc +++ b/icing/index/main/main-index-merger_test.cc @@ -13,19 +13,23 @@ // limitations under the License. #include "icing/index/main/main-index-merger.h" +#include <cstdint> +#include <memory> +#include <string> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "icing/absl_ports/canonical_errors.h" #include "icing/file/filesystem.h" -#include "icing/index/iterator/doc-hit-info-iterator.h" -#include "icing/index/main/doc-hit-info-iterator-term-main.h" -#include "icing/index/main/main-index-merger.h" +#include "icing/index/hit/hit.h" +#include "icing/index/lite/lite-index.h" +#include "icing/index/lite/term-id-hit-pair.h" #include "icing/index/main/main-index.h" #include "icing/index/term-id-codec.h" -#include "icing/index/term-property-id.h" #include "icing/legacy/index/icing-dynamic-trie.h" #include "icing/legacy/index/icing-filesystem.h" -#include "icing/schema/section.h" #include "icing/store/namespace-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" @@ -89,10 +93,10 @@ TEST_F(MainIndexMergerTest, TranslateTermNotAdded) { term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE)); Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit)); // 2. Build up a fake LexiconMergeOutputs @@ -128,10 +132,10 @@ TEST_F(MainIndexMergerTest, PrefixExpansion) { term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE)); Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit)); // 2. Build up a fake LexiconMergeOutputs @@ -191,11 +195,11 @@ TEST_F(MainIndexMergerTest, DedupePrefixAndExactWithDifferentTermFrequencies) { term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit)); Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, - Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/true); + Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/true, + /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit)); // 2. Build up a fake LexiconMergeOutputs @@ -255,10 +259,10 @@ TEST_F(MainIndexMergerTest, DedupeWithExactSameTermFrequencies) { term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit)); Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit)); // The prefix hit should take the sum as term_frequency - 114. Hit prefix_foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, @@ -320,11 +324,11 @@ TEST_F(MainIndexMergerTest, DedupePrefixExpansion) { Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/Hit::kMaxTermFrequency, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit)); Hit fool_doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, fool_doc0_hit)); // 2. Build up a fake LexiconMergeOutputs diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc index aae60c6..85ee4dc 100644 --- a/icing/index/main/main-index.cc +++ b/icing/index/main/main-index.cc @@ -651,7 +651,7 @@ libtextclassifier3::Status MainIndex::AddPrefixBackfillHits( ICING_ASSIGN_OR_RETURN(tmp, backfill_accessor->GetNextHitsBatch()); } - Hit last_added_hit; + Hit last_added_hit(Hit::kInvalidValue); // The hits in backfill_hits are in the reverse order of how they were added. // Iterate in reverse to add them to this new posting list in the correct // order. diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc index fa96e6c..db9dbe2 100644 --- a/icing/index/main/main-index_test.cc +++ b/icing/index/main/main-index_test.cc @@ -14,23 +14,33 @@ #include "icing/index/main/main-index.h" +#include <cstdint> +#include <memory> +#include <string> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "icing/absl_ports/canonical_errors.h" #include "icing/file/filesystem.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/hit/hit.h" #include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/lite/lite-index.h" #include "icing/index/lite/term-id-hit-pair.h" #include "icing/index/main/doc-hit-info-iterator-term-main.h" #include "icing/index/main/main-index-merger.h" #include "icing/index/term-id-codec.h" -#include "icing/index/term-property-id.h" #include "icing/legacy/index/icing-dynamic-trie.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mock-filesystem.h" #include "icing/schema/section.h" +#include "icing/store/document-id.h" #include "icing/store/namespace-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" +#include "icing/util/status-macros.h" namespace icing { namespace lib { @@ -152,7 +162,7 @@ TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsValidAccessor) { term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); // 2. Create the main index. It should have no entries in its lexicon. @@ -178,7 +188,7 @@ TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsNotFound) { term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); // 2. Create the main index. It should have no entries in its lexicon. @@ -217,7 +227,7 @@ TEST_F(MainIndexTest, MainIndexGetAccessorForExactReturnsValidAccessor) { term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); // 2. Create the main index. It should have no entries in its lexicon. @@ -254,18 +264,18 @@ TEST_F(MainIndexTest, MergeIndexToEmpty) { term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit)); ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit)); Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit)); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit)); Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit)); ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit)); @@ -332,18 +342,18 @@ TEST_F(MainIndexTest, MergeIndexToPreexisting) { term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit)); ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit)); Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit)); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit)); Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit)); ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit)); @@ -387,14 +397,14 @@ TEST_F(MainIndexTest, MergeIndexToPreexisting) { term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc3_hit(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc3_hit)); ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc3_hit)); ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc3_hit)); ICING_ASSERT_OK(lite_index_->AddHit(fall_term_id, doc3_hit)); Hit doc4_hit(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc4_hit)); ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc4_hit)); @@ -449,15 +459,15 @@ TEST_F(MainIndexTest, ExactRetrievedInPrefixSearch) { term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit)); Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc2_hit)); // 2. Create the main index. It should have no entries in its lexicon. @@ -500,15 +510,15 @@ TEST_F(MainIndexTest, PrefixNotRetrievedInExactSearch) { term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit)); Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc2_hit)); // 2. Create the main index. It should have no entries in its lexicon. @@ -554,17 +564,17 @@ TEST_F(MainIndexTest, document_id % Hit::kMaxTermFrequency + 1); Hit doc_hit0( /*section_id=*/0, /*document_id=*/document_id, term_frequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit0)); Hit doc_hit1( /*section_id=*/1, /*document_id=*/document_id, term_frequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit1)); Hit doc_hit2( /*section_id=*/2, /*document_id=*/document_id, term_frequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit2)); } @@ -619,7 +629,7 @@ TEST_F(MainIndexTest, MergeIndexBackfilling) { term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/true); + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit)); // 2. Create the main index. It should have no entries in its lexicon. @@ -648,7 +658,7 @@ TEST_F(MainIndexTest, MergeIndexBackfilling) { term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit)); // 5. Merge the index. The main index should now contain "fool", "foot" @@ -682,7 +692,7 @@ TEST_F(MainIndexTest, OneHitInTheFirstPageForTwoPagesMainIndex) { uint32_t num_docs = 2038; for (DocumentId document_id = 0; document_id < num_docs; ++document_id) { Hit doc_hit(section_id, document_id, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit)); } diff --git a/icing/index/main/posting-list-hit-accessor_test.cc b/icing/index/main/posting-list-hit-accessor_test.cc index 1127814..c2460ff 100644 --- a/icing/index/main/posting-list-hit-accessor_test.cc +++ b/icing/index/main/posting-list-hit-accessor_test.cc @@ -15,14 +15,19 @@ #include "icing/index/main/posting-list-hit-accessor.h" #include <cstdint> +#include <memory> +#include <string> +#include <utility> +#include <vector> +#include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/file/filesystem.h" #include "icing/file/posting_list/flash-index-storage.h" -#include "icing/file/posting_list/index-block.h" +#include "icing/file/posting_list/posting-list-accessor.h" +#include "icing/file/posting_list/posting-list-common.h" #include "icing/file/posting_list/posting-list-identifier.h" -#include "icing/file/posting_list/posting-list-used.h" #include "icing/index/hit/hit.h" #include "icing/index/main/posting-list-hit-serializer.h" #include "icing/testing/common-matchers.h" @@ -102,7 +107,8 @@ TEST_F(PostingListHitAccessorTest, PreexistingPLKeepOnSameBlock) { PostingListHitAccessor::Create(flash_index_storage_.get(), serializer_.get())); // Add a single hit. This will fit in a min-sized posting list. - Hit hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency); + Hit hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(pl_accessor->PrependHit(hit1)); PostingListAccessor::FinalizeResult result1 = std::move(*pl_accessor).Finalize(); @@ -139,12 +145,12 @@ TEST_F(PostingListHitAccessorTest, PreexistingPLReallocateToLargerPL) { std::unique_ptr<PostingListHitAccessor> pl_accessor, PostingListHitAccessor::Create(flash_index_storage_.get(), serializer_.get())); - // The smallest posting list size is 15 bytes. The first four hits will be - // compressed to one byte each and will be able to fit in the 5 byte padded - // region. The last hit will fit in one of the special hits. The posting list - // will be ALMOST_FULL and can fit at most 2 more hits. + // Use a small posting list of 30 bytes. The first 17 hits will be compressed + // to one byte each and will be able to fit in the 18 byte padded region. The + // last hit will fit in one of the special hits. The posting list will be + // ALMOST_FULL and can fit at most 2 more hits. std::vector<Hit> hits1 = - CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1); + CreateHits(/*num_hits=*/18, /*desired_byte_length=*/1); for (const Hit& hit : hits1) { ICING_ASSERT_OK(pl_accessor->PrependHit(hit)); } @@ -160,10 +166,9 @@ TEST_F(PostingListHitAccessorTest, PreexistingPLReallocateToLargerPL) { pl_accessor, PostingListHitAccessor::CreateFromExisting( flash_index_storage_.get(), serializer_.get(), result1.id)); - // The current posting list can fit at most 2 more hits. Adding 12 more hits - // should result in these hits being moved to a larger posting list. + // The current posting list can fit at most 2 more hits. std::vector<Hit> hits2 = CreateHits( - /*start_docid=*/hits1.back().document_id() + 1, /*num_hits=*/12, + /*last_hit=*/hits1.back(), /*num_hits=*/2, /*desired_byte_length=*/1); for (const Hit& hit : hits2) { @@ -172,18 +177,36 @@ TEST_F(PostingListHitAccessorTest, PreexistingPLReallocateToLargerPL) { PostingListAccessor::FinalizeResult result2 = std::move(*pl_accessor).Finalize(); ICING_EXPECT_OK(result2.status); + // The 2 hits should still fit on the first block + EXPECT_THAT(result1.id.block_index(), Eq(1)); + EXPECT_THAT(result1.id.posting_list_index(), Eq(0)); + + // Add one more hit + ICING_ASSERT_OK_AND_ASSIGN( + pl_accessor, + PostingListHitAccessor::CreateFromExisting( + flash_index_storage_.get(), serializer_.get(), result2.id)); + // The current posting list should be FULL. Adding more hits should result in + // these hits being moved to a larger posting list. + Hit single_hit = + CreateHit(/*last_hit=*/hits2.back(), /*desired_byte_length=*/1); + ICING_ASSERT_OK(pl_accessor->PrependHit(single_hit)); + PostingListAccessor::FinalizeResult result3 = + std::move(*pl_accessor).Finalize(); + ICING_EXPECT_OK(result3.status); // Should have been allocated to the second (new) block because the posting // list should have grown beyond the size that the first block maintains. - EXPECT_THAT(result2.id.block_index(), Eq(2)); - EXPECT_THAT(result2.id.posting_list_index(), Eq(0)); + EXPECT_THAT(result3.id.block_index(), Eq(2)); + EXPECT_THAT(result3.id.posting_list_index(), Eq(0)); - // The posting list at result2.id should hold all of the hits that have been + // The posting list at result3.id should hold all of the hits that have been // added. for (const Hit& hit : hits2) { hits1.push_back(hit); } + hits1.push_back(single_hit); ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder, - flash_index_storage_->GetPostingList(result2.id)); + flash_index_storage_->GetPostingList(result3.id)); EXPECT_THAT(serializer_->GetHits(&pl_holder.posting_list), IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); } @@ -307,7 +330,7 @@ TEST_F(PostingListHitAccessorTest, InvalidHitReturnsInvalidArgument) { std::unique_ptr<PostingListHitAccessor> pl_accessor, PostingListHitAccessor::Create(flash_index_storage_.get(), serializer_.get())); - Hit invalid_hit; + Hit invalid_hit(Hit::kInvalidValue); EXPECT_THAT(pl_accessor->PrependHit(invalid_hit), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } @@ -317,14 +340,17 @@ TEST_F(PostingListHitAccessorTest, HitsNotDecreasingReturnsInvalidArgument) { std::unique_ptr<PostingListHitAccessor> pl_accessor, PostingListHitAccessor::Create(flash_index_storage_.get(), serializer_.get())); - Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultTermFrequency); + Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(pl_accessor->PrependHit(hit1)); - Hit hit2(/*section_id=*/6, /*document_id=*/1, Hit::kDefaultTermFrequency); + Hit hit2(/*section_id=*/6, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); EXPECT_THAT(pl_accessor->PrependHit(hit2), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - Hit hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency); + Hit hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); EXPECT_THAT(pl_accessor->PrependHit(hit3), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } @@ -345,7 +371,8 @@ TEST_F(PostingListHitAccessorTest, PreexistingPostingListNoHitsAdded) { std::unique_ptr<PostingListHitAccessor> pl_accessor, PostingListHitAccessor::Create(flash_index_storage_.get(), serializer_.get())); - Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultTermFrequency); + Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(pl_accessor->PrependHit(hit1)); PostingListAccessor::FinalizeResult result1 = std::move(*pl_accessor).Finalize(); diff --git a/icing/index/main/posting-list-hit-serializer.cc b/icing/index/main/posting-list-hit-serializer.cc index e14a0c0..88c0754 100644 --- a/icing/index/main/posting-list-hit-serializer.cc +++ b/icing/index/main/posting-list-hit-serializer.cc @@ -19,8 +19,11 @@ #include <limits> #include <vector> +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/file/posting_list/posting-list-used.h" +#include "icing/index/hit/hit.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/legacy/index/icing-bit-util.h" #include "icing/util/logging.h" @@ -35,6 +38,10 @@ uint32_t GetTermFrequencyByteSize(const Hit& hit) { return hit.has_term_frequency() ? sizeof(Hit::TermFrequency) : 0; } +uint32_t GetFlagsByteSize(const Hit& hit) { + return hit.has_flags() ? sizeof(Hit::Flags) : 0; +} + } // namespace uint32_t PostingListHitSerializer::GetBytesUsed( @@ -55,14 +62,45 @@ uint32_t PostingListHitSerializer::GetMinPostingListSizeToFit( return posting_list_used->size_in_bytes(); } - // In NOT_FULL status BytesUsed contains no special hits. The minimum sized - // posting list that would be guaranteed to fit these hits would be - // ALMOST_FULL, with kInvalidHit in special_hit(0), the uncompressed Hit in - // special_hit(1) and the n compressed hits in the compressed region. - // BytesUsed contains one uncompressed Hit and n compressed hits. Therefore, - // fitting these hits into a posting list would require BytesUsed plus one - // extra hit. - return GetBytesUsed(posting_list_used) + sizeof(Hit); + // Edge case: if number of hits <= 2, and posting_list_used is NOT_FULL, we + // should return kMinPostingListSize as we know that two hits is able to fit + // in the 2 special hits position of a min-sized posting list. + // - Checking this scenario requires deserializing posting_list_used, which is + // an expensive operation. However this is only possible when + // GetBytesUsed(posting_list_used) <= + // sizeof(uncompressed hit) + max(delta encoded value size) + + // (sizeof(Hit) + (sizeof(Hit) - sizeof(Hit::Value)), so we don't need to do + // this when the size exceeds this number. + if (GetBytesUsed(posting_list_used) <= + 2 * sizeof(Hit) + 5 - sizeof(Hit::Value)) { + // Check if we're able to get more than 2 hits from posting_list_used + std::vector<Hit> hits; + libtextclassifier3::Status status = + GetHitsInternal(posting_list_used, /*limit=*/3, /*pop=*/false, &hits); + if (status.ok() && hits.size() <= 2) { + return GetMinPostingListSize(); + } + } + + // - In NOT_FULL status, BytesUsed contains no special hits. For a posting + // list in the NOT_FULL state with n hits, we would have n-1 compressed hits + // and 1 uncompressed hit. + // - The minimum sized posting list that would be guaranteed to fit these hits + // would be FULL, but calculating the size required for the FULL posting + // list would require deserializing the last two added hits, so instead we + // will calculate the size of an ALMOST_FULL posting list to fit. + // - An ALMOST_FULL posting list would have kInvalidHit in special_hit(0), the + // full uncompressed Hit in special_hit(1), and the n-1 compressed hits in + // the compressed region. + // - Currently BytesUsed contains one uncompressed Hit and n-1 compressed + // hits. However, it's possible that the uncompressed Hit is not a full hit, + // but rather only the Hit::Value (this is the case if + // !hit.has_term_frequency()). + // - Therefore, fitting these hits into a posting list would require + // BytesUsed + one extra full hit + byte difference between a full hit and + // Hit::Value. i.e: + // ByteUsed + sizeof(Hit) + (sizeof(Hit) - sizeof(Hit::Value)). + return GetBytesUsed(posting_list_used) + 2 * sizeof(Hit) - sizeof(Hit::Value); } void PostingListHitSerializer::Clear(PostingListUsed* posting_list_used) const { @@ -160,27 +198,37 @@ libtextclassifier3::Status PostingListHitSerializer::PrependHitToAlmostFull( // in the padded area and put new hit at the special position 1. // Calling ValueOrDie is safe here because 1 < kNumSpecialData. Hit cur = GetSpecialHit(posting_list_used, /*index=*/1).ValueOrDie(); - if (cur.value() <= hit.value()) { + if (cur < hit || cur == hit) { return absl_ports::InvalidArgumentError( "Hit being prepended must be strictly less than the most recent Hit"); } - uint64_t delta = cur.value() - hit.value(); uint8_t delta_buf[VarInt::kMaxEncodedLen64]; - size_t delta_len = VarInt::Encode(delta, delta_buf); + size_t delta_len = + EncodeNextHitValue(/*next_hit_value=*/hit.value(), + /*curr_hit_value=*/cur.value(), delta_buf); + uint32_t cur_flags_bytes = GetFlagsByteSize(cur); uint32_t cur_term_frequency_bytes = GetTermFrequencyByteSize(cur); uint32_t pad_end = GetPadEnd(posting_list_used, /*offset=*/kSpecialHitsSize); - if (pad_end >= kSpecialHitsSize + delta_len + cur_term_frequency_bytes) { - // Pad area has enough space for delta and term_frequency of existing hit - // (cur). Write delta at pad_end - delta_len - cur_term_frequency_bytes. + if (pad_end >= kSpecialHitsSize + delta_len + cur_flags_bytes + + cur_term_frequency_bytes) { + // Pad area has enough space for delta, flags and term_frequency of existing + // hit (cur). Write delta at pad_end - delta_len - cur_flags_bytes - + // cur_term_frequency_bytes. uint8_t* delta_offset = posting_list_used->posting_list_buffer() + pad_end - - delta_len - cur_term_frequency_bytes; + delta_len - cur_flags_bytes - + cur_term_frequency_bytes; memcpy(delta_offset, delta_buf, delta_len); - // Now copy term_frequency. + + // Now copy flags. + Hit::Flags flags = cur.flags(); + uint8_t* flags_offset = delta_offset + delta_len; + memcpy(flags_offset, &flags, cur_flags_bytes); + // Copy term_frequency. Hit::TermFrequency term_frequency = cur.term_frequency(); - uint8_t* term_frequency_offset = delta_offset + delta_len; + uint8_t* term_frequency_offset = flags_offset + cur_flags_bytes; memcpy(term_frequency_offset, &term_frequency, cur_term_frequency_bytes); // Now first hit is the new hit, at special position 1. Safe to ignore the @@ -231,23 +279,36 @@ libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull( return absl_ports::FailedPreconditionError( "Posting list is in an invalid state."); } + + // Retrieve the last added (cur) hit's value and flags and compare to the hit + // we're adding. Hit::Value cur_value; - memcpy(&cur_value, posting_list_used->posting_list_buffer() + offset, - sizeof(Hit::Value)); - if (cur_value <= hit.value()) { + uint8_t* cur_value_offset = posting_list_used->posting_list_buffer() + offset; + memcpy(&cur_value, cur_value_offset, sizeof(Hit::Value)); + Hit::Flags cur_flags = Hit::kNoEnabledFlags; + if (GetFlagsByteSize(Hit(cur_value)) > 0) { + uint8_t* cur_flags_offset = cur_value_offset + sizeof(Hit::Value); + memcpy(&cur_flags, cur_flags_offset, sizeof(Hit::Flags)); + } + // Term-frequency is not used for hit comparison so it's ok to pass in the + // default term-frequency here. + Hit cur(cur_value, cur_flags, Hit::kDefaultTermFrequency); + if (cur < hit || cur == hit) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "Hit %d being prepended must be strictly less than the most recent " - "Hit %d", - hit.value(), cur_value)); + "Hit (value=%d, flags=%d) being prepended must be strictly less than " + "the most recent Hit (value=%d, flags=%d)", + hit.value(), hit.flags(), cur_value, cur_flags)); } - uint64_t delta = cur_value - hit.value(); uint8_t delta_buf[VarInt::kMaxEncodedLen64]; - size_t delta_len = VarInt::Encode(delta, delta_buf); + size_t delta_len = + EncodeNextHitValue(/*next_hit_value=*/hit.value(), + /*curr_hit_value=*/cur.value(), delta_buf); + uint32_t hit_flags_bytes = GetFlagsByteSize(hit); uint32_t hit_term_frequency_bytes = GetTermFrequencyByteSize(hit); // offset now points to one past the end of the first hit. offset += sizeof(Hit::Value); - if (kSpecialHitsSize + sizeof(Hit::Value) + delta_len + + if (kSpecialHitsSize + sizeof(Hit::Value) + delta_len + hit_flags_bytes + hit_term_frequency_bytes <= offset) { // Enough space for delta in compressed area. @@ -257,9 +318,9 @@ libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull( memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf, delta_len); - // Prepend new hit with (possibly) its term_frequency. We know that there is - // room for 'hit' because of the if statement above, so calling ValueOrDie - // is safe. + // Prepend new hit with (possibly) its flags and term_frequency. We know + // that there is room for 'hit' because of the if statement above, so + // calling ValueOrDie is safe. offset = PrependHitUncompressed(posting_list_used, hit, offset).ValueOrDie(); // offset is guaranteed to be valid here. So it's safe to ignore the return @@ -295,13 +356,13 @@ libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull( // (i.e. varint delta encoding expanded required storage). We // move first hit to special position 1 and put new hit in // special position 0. - Hit cur(cur_value); - // offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes. + + // Offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes. // Therefore, offset must be less than kSpecialHitSize + 5. Since posting - // list size must be divisible by sizeof(Hit) (5), it is guaranteed that + // list size must be divisible by sizeof(Hit) (6), it is guaranteed that // offset < size_in_bytes, so it is safe to ignore the return value here. - ICING_RETURN_IF_ERROR( - ConsumeTermFrequencyIfPresent(posting_list_used, &cur, &offset)); + ICING_RETURN_IF_ERROR(ConsumeFlagsAndTermFrequencyIfPresent( + posting_list_used, &cur, &offset)); // Safe to ignore the return value of PadToEnd because offset must be less // than posting_list_used->size_in_bytes(). Otherwise, this function // already would have returned FAILED_PRECONDITION. @@ -463,19 +524,19 @@ libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal( offset += sizeof(Hit::Value); } else { // Now we have delta encoded subsequent hits. Decode and push. - uint64_t delta; - offset += VarInt::Decode( - posting_list_used->posting_list_buffer() + offset, &delta); - val += delta; + DecodedHitInfo decoded_hit_info = DecodeNextHitValue( + posting_list_used->posting_list_buffer() + offset, val); + offset += decoded_hit_info.encoded_size; + val = decoded_hit_info.hit_value; } Hit hit(val); libtextclassifier3::Status status = - ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset); + ConsumeFlagsAndTermFrequencyIfPresent(posting_list_used, &hit, &offset); if (!status.ok()) { // This posting list has been corrupted somehow. The first hit of the - // posting list claims to have a term frequency, but there's no more room - // in the posting list for that term frequency to exist. Return an empty - // vector and zero to indicate no hits retrieved. + // posting list claims to have a term frequency or flag, but there's no + // more room in the posting list for that term frequency or flag to exist. + // Return an empty vector and zero to indicate no hits retrieved. if (out != nullptr) { out->clear(); } @@ -497,10 +558,10 @@ libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal( // In the compressed area. Pop and reconstruct. offset/val is // the last traversed hit, which we must discard. So move one // more forward. - uint64_t delta; - offset += VarInt::Decode( - posting_list_used->posting_list_buffer() + offset, &delta); - val += delta; + DecodedHitInfo decoded_hit_info = DecodeNextHitValue( + posting_list_used->posting_list_buffer() + offset, val); + offset += decoded_hit_info.encoded_size; + val = decoded_hit_info.hit_value; // Now val is the first hit of the new posting list. if (kSpecialHitsSize + sizeof(Hit::Value) <= offset) { @@ -509,17 +570,18 @@ libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal( memcpy(mutable_posting_list_used->posting_list_buffer() + offset, &val, sizeof(Hit::Value)); } else { - // val won't fit in compressed area. Also see if there is a + // val won't fit in compressed area. Also see if there is a flag or // term_frequency. Hit hit(val); libtextclassifier3::Status status = - ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset); + ConsumeFlagsAndTermFrequencyIfPresent(posting_list_used, &hit, + &offset); if (!status.ok()) { // This posting list has been corrupted somehow. The first hit of - // the posting list claims to have a term frequency, but there's no - // more room in the posting list for that term frequency to exist. - // Return an empty vector and zero to indicate no hits retrieved. Do - // not pop anything. + // the posting list claims to have a term frequency or flag, but + // there's no more room in the posting list for that term frequency or + // flag to exist. Return an empty vector and zero to indicate no hits + // retrieved. Do not pop anything. if (out != nullptr) { out->clear(); } @@ -569,7 +631,7 @@ libtextclassifier3::StatusOr<Hit> PostingListHitSerializer::GetSpecialHit( return absl_ports::InvalidArgumentError( "Special hits only exist at indices 0 and 1"); } - Hit val; + Hit val(Hit::kInvalidValue); memcpy(&val, posting_list_used->posting_list_buffer() + index * sizeof(val), sizeof(val)); return val; @@ -653,11 +715,11 @@ bool PostingListHitSerializer::SetStartByteOffset( // not_full state. Safe to ignore the return value because 0 and 1 are both // < kNumSpecialData. SetSpecialHit(posting_list_used, /*index=*/0, Hit(offset)); - SetSpecialHit(posting_list_used, /*index=*/1, Hit()); + SetSpecialHit(posting_list_used, /*index=*/1, Hit(Hit::kInvalidValue)); } else if (offset == sizeof(Hit)) { // almost_full state. Safe to ignore the return value because 1 is both < // kNumSpecialData. - SetSpecialHit(posting_list_used, /*index=*/0, Hit()); + SetSpecialHit(posting_list_used, /*index=*/0, Hit(Hit::kInvalidValue)); } // Nothing to do for the FULL state - the offset isn't actually stored // anywhere and both special hits hold valid hits. @@ -667,46 +729,72 @@ bool PostingListHitSerializer::SetStartByteOffset( libtextclassifier3::StatusOr<uint32_t> PostingListHitSerializer::PrependHitUncompressed( PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const { - if (hit.has_term_frequency()) { - if (offset < kSpecialHitsSize + sizeof(Hit)) { - return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "Not enough room to prepend Hit at offset %d.", offset)); - } - offset -= sizeof(Hit); - memcpy(posting_list_used->posting_list_buffer() + offset, &hit, - sizeof(Hit)); - } else { - if (offset < kSpecialHitsSize + sizeof(Hit::Value)) { - return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "Not enough room to prepend Hit::Value at offset %d.", offset)); - } - offset -= sizeof(Hit::Value); - Hit::Value val = hit.value(); - memcpy(posting_list_used->posting_list_buffer() + offset, &val, - sizeof(Hit::Value)); + uint32_t hit_bytes_to_prepend = sizeof(Hit::Value) + GetFlagsByteSize(hit) + + GetTermFrequencyByteSize(hit); + + if (offset < kSpecialHitsSize + hit_bytes_to_prepend) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Not enough room to prepend Hit at offset %d.", offset)); } + + if (hit.has_term_frequency()) { + offset -= sizeof(Hit::TermFrequency); + Hit::TermFrequency term_frequency = hit.term_frequency(); + memcpy(posting_list_used->posting_list_buffer() + offset, &term_frequency, + sizeof(Hit::TermFrequency)); + } + if (hit.has_flags()) { + offset -= sizeof(Hit::Flags); + Hit::Flags flags = hit.flags(); + memcpy(posting_list_used->posting_list_buffer() + offset, &flags, + sizeof(Hit::Flags)); + } + offset -= sizeof(Hit::Value); + Hit::Value val = hit.value(); + memcpy(posting_list_used->posting_list_buffer() + offset, &val, + sizeof(Hit::Value)); return offset; } libtextclassifier3::Status -PostingListHitSerializer::ConsumeTermFrequencyIfPresent( +PostingListHitSerializer::ConsumeFlagsAndTermFrequencyIfPresent( const PostingListUsed* posting_list_used, Hit* hit, uint32_t* offset) const { - if (!hit->has_term_frequency()) { - // No term frequency to consume. Everything is fine. + if (!hit->has_flags()) { + // No flags (and by extension, no term-frequency) to consume. Everything is + // fine. return libtextclassifier3::Status::OK; } - if (*offset + sizeof(Hit::TermFrequency) > - posting_list_used->size_in_bytes()) { + + // Consume flags + Hit::Flags flags; + if (*offset + sizeof(Hit::Flags) > posting_list_used->size_in_bytes()) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "offset %d must not point past the end of the posting list of size %d.", + "offset %d must not point past the end of the posting list of size " + "%d.", *offset, posting_list_used->size_in_bytes())); } - Hit::TermFrequency term_frequency; - memcpy(&term_frequency, posting_list_used->posting_list_buffer() + *offset, - sizeof(Hit::TermFrequency)); - *hit = Hit(hit->value(), term_frequency); - *offset += sizeof(Hit::TermFrequency); + memcpy(&flags, posting_list_used->posting_list_buffer() + *offset, + sizeof(Hit::Flags)); + *hit = Hit(hit->value(), flags, Hit::kDefaultTermFrequency); + *offset += sizeof(Hit::Flags); + + if (hit->has_term_frequency()) { + // Consume term frequency + if (*offset + sizeof(Hit::TermFrequency) > + posting_list_used->size_in_bytes()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "offset %d must not point past the end of the posting list of size " + "%d.", + *offset, posting_list_used->size_in_bytes())); + } + Hit::TermFrequency term_frequency; + memcpy(&term_frequency, posting_list_used->posting_list_buffer() + *offset, + sizeof(Hit::TermFrequency)); + *hit = Hit(hit->value(), flags, term_frequency); + *offset += sizeof(Hit::TermFrequency); + } + return libtextclassifier3::Status::OK; } diff --git a/icing/index/main/posting-list-hit-serializer.h b/icing/index/main/posting-list-hit-serializer.h index 2986d9c..08c792c 100644 --- a/icing/index/main/posting-list-hit-serializer.h +++ b/icing/index/main/posting-list-hit-serializer.h @@ -15,6 +15,7 @@ #ifndef ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_ #define ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_ +#include <cstddef> #include <cstdint> #include <vector> @@ -23,6 +24,7 @@ #include "icing/file/posting_list/posting-list-common.h" #include "icing/file/posting_list/posting-list-used.h" #include "icing/index/hit/hit.h" +#include "icing/legacy/index/icing-bit-util.h" #include "icing/util/status-macros.h" namespace icing { @@ -34,6 +36,52 @@ class PostingListHitSerializer : public PostingListSerializer { public: static constexpr uint32_t kSpecialHitsSize = kNumSpecialData * sizeof(Hit); + struct DecodedHitInfo { + // The decoded hit value. + Hit::Value hit_value; + + // Size of the encoded hit in bytes. + size_t encoded_size; + }; + + // Given the current hit value, encodes the next hit value for serialization + // in the posting list. + // + // The encoded value is the varint-encoded delta between next_hit_value and + // curr_hit_value. + // - We add 1 to this delta so as to avoid getting a delta value of 0. + // - This allows us to add duplicate hits with the same value, which is a + // valid case if we need to store hits with different flags that belong in + // the same section-id/doc-id combo. + // - We cannot have an encoded hit delta with a value of 0 as 0 is currently + // used for padding the unused region in the posting list. + // + // REQUIRES: next_hit_value <= curr_hit_value AND + // curr_hit_value - next_hit_value < + // std::numeric_limits<Hit::Value>::max() + // + // RETURNS: next_hit_value's encoded length in bytes and writes the encoded + // value directly into encoded_buf_out. + static size_t EncodeNextHitValue(Hit::Value next_hit_value, + Hit::Value curr_hit_value, + uint8_t* encoded_buf_out) { + uint64_t delta = curr_hit_value - next_hit_value + 1; + return VarInt::Encode(delta, encoded_buf_out); + } + + // Given the current hit value, decodes the next hit value from an encoded + // byte array buffer. + // + // RETURNS: DecodedHitInfo containing the decoded hit value and the value's + // encoded size in bytes. + static DecodedHitInfo DecodeNextHitValue(const uint8_t* encoded_buf_in, + Hit::Value curr_hit_value) { + uint64_t delta; + size_t delta_size = VarInt::Decode(encoded_buf_in, &delta); + Hit::Value hit_value = curr_hit_value + delta - 1; + return {hit_value, delta_size}; + } + uint32_t GetDataTypeBytes() const override { return sizeof(Hit); } uint32_t GetMinPostingListSize() const override { @@ -299,15 +347,19 @@ class PostingListHitSerializer : public PostingListSerializer { PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const; - // If hit has a term frequency, consumes the term frequency at offset, updates - // hit to include the term frequency and updates offset to reflect that the - // term frequency has been consumed. + // If hit has the flags and/or term frequency field, consumes the flags and/or + // term frequency at offset, updates hit to include the flag and/or term + // frequency and updates offset to reflect that the flag and/or term frequency + // fields have been consumed. // // RETURNS: // - OK, if successful - // - INVALID_ARGUMENT if hit has a term frequency and offset + - // sizeof(Hit::TermFrequency) >= posting_list_used->size_in_bytes() - libtextclassifier3::Status ConsumeTermFrequencyIfPresent( + // - INVALID_ARGUMENT if hit has a flags and/or term frequency field and + // offset + sizeof(Hit's flag) + sizeof(Hit's tf) >= + // posting_list_used->size_in_bytes() + // i.e. the posting list is not large enough to consume the hit's flags + // and term frequency fields + libtextclassifier3::Status ConsumeFlagsAndTermFrequencyIfPresent( const PostingListUsed* posting_list_used, Hit* hit, uint32_t* offset) const; }; diff --git a/icing/index/main/posting-list-hit-serializer_test.cc b/icing/index/main/posting-list-hit-serializer_test.cc index 7f0b945..ea135ef 100644 --- a/icing/index/main/posting-list-hit-serializer_test.cc +++ b/icing/index/main/posting-list-hit-serializer_test.cc @@ -14,22 +14,32 @@ #include "icing/index/main/posting-list-hit-serializer.h" +#include <algorithm> +#include <cstddef> #include <cstdint> #include <deque> -#include <memory> +#include <iterator> +#include <limits> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/file/posting_list/posting-list-used.h" +#include "icing/index/hit/hit.h" +#include "icing/legacy/index/icing-bit-util.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/hit-test-utils.h" using testing::ElementsAre; using testing::ElementsAreArray; using testing::Eq; +using testing::Gt; using testing::IsEmpty; +using testing::IsFalse; +using testing::IsTrue; using testing::Le; using testing::Lt; @@ -58,103 +68,217 @@ TEST(PostingListHitSerializerTest, PostingListUsedPrependHitNotFull) { PostingListUsed::CreateFromUnitializedRegion(&serializer, kHitsSize)); // Make used. - Hit hit0(/*section_id=*/0, 0, /*term_frequency=*/56); + Hit hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/56, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit0)); - // Size = sizeof(uncompressed hit0) - int expected_size = sizeof(Hit); - EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + // Size = sizeof(uncompressed hit0::Value) + // + sizeof(hit0::Flags) + // + sizeof(hit0::TermFrequency) + int expected_size = + sizeof(Hit::Value) + sizeof(Hit::Flags) + sizeof(Hit::TermFrequency); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit0))); - Hit hit1(/*section_id=*/0, 1, Hit::kDefaultTermFrequency); + Hit hit1(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + uint8_t delta_buf[VarInt::kMaxEncodedLen64]; + size_t delta_len = PostingListHitSerializer::EncodeNextHitValue( + /*next_hit_value=*/hit1.value(), + /*curr_hit_value=*/hit0.value(), delta_buf); ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit1)); - // Size = sizeof(uncompressed hit1) - // + sizeof(hit0-hit1) + sizeof(hit0::term_frequency) - expected_size += 2 + sizeof(Hit::TermFrequency); - EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + // Size = sizeof(uncompressed hit1::Value) + // + sizeof(hit0-hit1) + // + sizeof(hit0::Flags) + // + sizeof(hit0::TermFrequency) + expected_size += delta_len; + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit1, hit0))); - Hit hit2(/*section_id=*/0, 2, /*term_frequency=*/56); + Hit hit2(/*section_id=*/0, /*document_id=*/2, /*term_frequency=*/56, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + delta_len = PostingListHitSerializer::EncodeNextHitValue( + /*next_hit_value=*/hit2.value(), + /*curr_hit_value=*/hit1.value(), delta_buf); ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit2)); - // Size = sizeof(uncompressed hit2) + // Size = sizeof(uncompressed hit2::Value) + sizeof(hit2::Flags) + // + sizeof(hit2::TermFrequency) // + sizeof(hit1-hit2) - // + sizeof(hit0-hit1) + sizeof(hit0::term_frequency) - expected_size += 2; - EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + // + sizeof(hit0-hit1) + // + sizeof(hit0::flags) + // + sizeof(hit0::term_frequency) + expected_size += delta_len + sizeof(Hit::Flags) + sizeof(Hit::TermFrequency); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit2, hit1, hit0))); - Hit hit3(/*section_id=*/0, 3, Hit::kDefaultTermFrequency); + Hit hit3(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + delta_len = PostingListHitSerializer::EncodeNextHitValue( + /*next_hit_value=*/hit3.value(), + /*curr_hit_value=*/hit2.value(), delta_buf); ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit3)); - // Size = sizeof(uncompressed hit3) - // + sizeof(hit2-hit3) + sizeof(hit2::term_frequency) + // Size = sizeof(uncompressed hit3::Value) + // + sizeof(hit2-hit3) + sizeof(hit2::Flags) + // + sizeof(hit2::TermFrequency) // + sizeof(hit1-hit2) - // + sizeof(hit0-hit1) + sizeof(hit0::term_frequency) - expected_size += 2 + sizeof(Hit::TermFrequency); - EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + // + sizeof(hit0-hit1) + // + sizeof(hit0::flags) + // + sizeof(hit0::term_frequency) + expected_size += delta_len; + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAre(hit3, hit2, hit1, hit0))); +} + +TEST(PostingListHitSerializerTest, + PostingListUsedPrependHitAlmostFull_withFlags) { + PostingListHitSerializer serializer; + + // Size = 24 + int pl_size = 2 * serializer.GetMinPostingListSize(); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); + + // Fill up the compressed region. + // Transitions: + // Adding hit0: EMPTY -> NOT_FULL + // Adding hit1: NOT_FULL -> NOT_FULL + // Adding hit2: NOT_FULL -> NOT_FULL + Hit hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/3); + Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2, /*term_frequency=*/57, + /*is_in_prefix_section=*/true, + /*is_prefix_hit=*/true); + EXPECT_THAT(hit2.has_flags(), IsTrue()); + EXPECT_THAT(hit2.has_term_frequency(), IsTrue()); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit0)); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit1)); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit2)); + // Size used will be 4 (hit2::Value) + 1 (hit2::Flags) + 1 + // (hit2::TermFrequency) + 2 (hit1-hit2) + 3 (hit0-hit1) = 11 bytes + int expected_size = sizeof(Hit::Value) + sizeof(Hit::Flags) + + sizeof(Hit::TermFrequency) + 2 + 3; + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAre(hit2, hit1, hit0))); + + // Add one more hit to transition NOT_FULL -> ALMOST_FULL + Hit hit3 = + CreateHit(hit2, /*desired_byte_length=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + EXPECT_THAT(hit3.has_flags(), IsFalse()); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit3)); + // Storing them in the compressed region requires 4 (hit3::Value) + 3 + // (hit2-hit3) + 1 (hit2::Flags) + 1 (hit2::TermFrequency) + 2 (hit1-hit2) + 3 + // (hit0-hit1) = 14 bytes, but there are only 12 bytes in the compressed + // region. So instead, the posting list will transition to ALMOST_FULL. + // The in-use compressed region will actually shrink from 11 bytes to 10 bytes + // because the uncompressed version of hit2 will be overwritten with the + // compressed delta of hit2. hit3 will be written to one of the special hits. + // Because we're in ALMOST_FULL, the expected size is the size of the pl minus + // the one hit used to mark the posting list as ALMOST_FULL. + expected_size = pl_size - sizeof(Hit); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit3, hit2, hit1, hit0))); + + // Add one more hit to transition ALMOST_FULL -> ALMOST_FULL + Hit hit4 = CreateHit(hit3, /*desired_byte_length=*/2); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit4)); + // There are currently 10 bytes in use in the compressed region. hit3 will + // have a 2-byte delta, which fits in the compressed region. Hit3 will be + // moved from the special hit to the compressed region (which will have 12 + // bytes in use after adding hit3). Hit4 will be placed in one of the special + // hits and the posting list will remain in ALMOST_FULL. + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAre(hit4, hit3, hit2, hit1, hit0))); + + // Add one more hit to transition ALMOST_FULL -> FULL + Hit hit5 = CreateHit(hit4, /*desired_byte_length=*/2); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit5)); + // There are currently 12 bytes in use in the compressed region. hit4 will + // have a 2-byte delta which will not fit in the compressed region. So hit4 + // will remain in one of the special hits and hit5 will occupy the other, + // making the posting list FULL. + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(pl_size)); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAre(hit5, hit4, hit3, hit2, hit1, hit0))); + + // The posting list is FULL. Adding another hit should fail. + Hit hit6 = CreateHit(hit5, /*desired_byte_length=*/1); + EXPECT_THAT(serializer.PrependHit(&pl_used, hit6), + StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); } TEST(PostingListHitSerializerTest, PostingListUsedPrependHitAlmostFull) { PostingListHitSerializer serializer; - int size = 2 * serializer.GetMinPostingListSize(); + // Size = 24 + int pl_size = 2 * serializer.GetMinPostingListSize(); ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used, - PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); // Fill up the compressed region. // Transitions: // Adding hit0: EMPTY -> NOT_FULL // Adding hit1: NOT_FULL -> NOT_FULL // Adding hit2: NOT_FULL -> NOT_FULL - Hit hit0(/*section_id=*/0, 0, Hit::kDefaultTermFrequency); - Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/2); - Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2); + Hit hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/3); + Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/3); ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit0)); ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit1)); ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit2)); - // Size used will be 2+2+4=8 bytes - int expected_size = sizeof(Hit::Value) + 2 + 2; - EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + // Size used will be 4 (hit2::Value) + 3 (hit1-hit2) + 3 (hit0-hit1) + // = 10 bytes + int expected_size = sizeof(Hit::Value) + 3 + 3; + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit2, hit1, hit0))); // Add one more hit to transition NOT_FULL -> ALMOST_FULL Hit hit3 = CreateHit(hit2, /*desired_byte_length=*/3); ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit3)); - // Compressed region would be 2+2+3+4=11 bytes, but the compressed region is - // only 10 bytes. So instead, the posting list will transition to ALMOST_FULL. - // The in-use compressed region will actually shrink from 8 bytes to 7 bytes + // Storing them in the compressed region requires 4 (hit3::Value) + 3 + // (hit2-hit3) + 3 (hit1-hit2) + 3 (hit0-hit1) = 13 bytes, but there are only + // 12 bytes in the compressed region. So instead, the posting list will + // transition to ALMOST_FULL. + // The in-use compressed region will actually shrink from 10 bytes to 9 bytes // because the uncompressed version of hit2 will be overwritten with the // compressed delta of hit2. hit3 will be written to one of the special hits. // Because we're in ALMOST_FULL, the expected size is the size of the pl minus // the one hit used to mark the posting list as ALMOST_FULL. - expected_size = size - sizeof(Hit); - EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + expected_size = pl_size - sizeof(Hit); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit3, hit2, hit1, hit0))); // Add one more hit to transition ALMOST_FULL -> ALMOST_FULL Hit hit4 = CreateHit(hit3, /*desired_byte_length=*/2); ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit4)); - // There are currently 7 bytes in use in the compressed region. hit3 will have - // a 2-byte delta. That delta will fit in the compressed region (which will - // now have 9 bytes in use), hit4 will be placed in one of the special hits - // and the posting list will remain in ALMOST_FULL. - EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); + // There are currently 9 bytes in use in the compressed region. Hit3 will + // have a 2-byte delta, which fits in the compressed region. Hit3 will be + // moved from the special hit to the compressed region (which will have 11 + // bytes in use after adding hit3). Hit4 will be placed in one of the special + // hits and the posting list will remain in ALMOST_FULL. + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size)); EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit4, hit3, hit2, hit1, hit0))); // Add one more hit to transition ALMOST_FULL -> FULL Hit hit5 = CreateHit(hit4, /*desired_byte_length=*/2); ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit5)); - // There are currently 9 bytes in use in the compressed region. hit4 will have - // a 2-byte delta which will not fit in the compressed region. So hit4 will - // remain in one of the special hits and hit5 will occupy the other, making - // the posting list FULL. - EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(size)); + // There are currently 11 bytes in use in the compressed region. Hit4 will + // have a 2-byte delta which will not fit in the compressed region. So hit4 + // will remain in one of the special hits and hit5 will occupy the other, + // making the posting list FULL. + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(pl_size)); EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit5, hit4, hit3, hit2, hit1, hit0))); @@ -164,9 +288,59 @@ TEST(PostingListHitSerializerTest, PostingListUsedPrependHitAlmostFull) { StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); } +TEST(PostingListHitSerializerTest, PrependHitsWithSameValue) { + PostingListHitSerializer serializer; + + // Size = 24 + int pl_size = 2 * serializer.GetMinPostingListSize(); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); + + // Fill up the compressed region. + Hit hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/3); + Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2, /*term_frequency=*/57, + /*is_in_prefix_section=*/true, + /*is_prefix_hit=*/true); + // Create hit3 with the same value but different flags as hit2 (hit3_flags + // is set to have all currently-defined flags enabled) + Hit::Flags hit3_flags = 0; + for (int i = 0; i < Hit::kNumFlagsInFlagsField; ++i) { + hit3_flags |= (1 << i); + } + Hit hit3(hit2.value(), /*term_frequency=*/hit2.term_frequency(), + /*flags=*/hit3_flags); + + // hit3 is larger than hit2 (its flag value is larger), and so needs to be + // prepended first + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit0)); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit1)); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit3)); + ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit2)); + // Posting list is now ALMOST_FULL + // ---------------------- + // 23-21 delta(Hit #0) + // 20-19 delta(Hit #1) + // 18 term-frequency(Hit #2) + // 17 flags(Hit #2) + // 16 delta(Hit #2) = 0 + // 15-12 <unused padding = 0> + // 11-6 Hit #3 + // 5-0 kSpecialHit + // ---------------------- + int bytes_used = pl_size - sizeof(Hit); + + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(bytes_used)); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAre(hit2, hit3, hit1, hit0))); +} + TEST(PostingListHitSerializerTest, PostingListUsedMinSize) { PostingListHitSerializer serializer; + // Min size = 12 ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used, PostingListUsed::CreateFromUnitializedRegion( @@ -176,7 +350,7 @@ TEST(PostingListHitSerializerTest, PostingListUsedMinSize) { EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty())); // Add a hit, PL should shift to ALMOST_FULL state - Hit hit0(/*section_id=*/0, 0, /*term_frequency=*/0, + Hit hit0(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/0, /*is_in_prefix_section=*/false, /*is_prefix_hit=*/true); ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit0)); @@ -185,10 +359,10 @@ TEST(PostingListHitSerializerTest, PostingListUsedMinSize) { EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size)); EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit0))); - // Add the smallest hit possible - no term_frequency and a delta of 1. PL - // should shift to FULL state. - Hit hit1(/*section_id=*/0, 0, /*term_frequency=*/0, - /*is_in_prefix_section=*/true, + // Add the smallest hit possible - no term_frequency, non-prefix hit and a + // delta of 0b10. PL should shift to FULL state. + Hit hit1(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/0, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit1)); // Size = sizeof(uncompressed hit1) + sizeof(uncompressed hit0) @@ -198,7 +372,7 @@ TEST(PostingListHitSerializerTest, PostingListUsedMinSize) { IsOkAndHolds(ElementsAre(hit1, hit0))); // Try to add the smallest hit possible. Should fail - Hit hit2(/*section_id=*/0, 0, /*term_frequency=*/0, + Hit hit2(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/0, /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); EXPECT_THAT(serializer.PrependHit(&pl_used, hit2), @@ -212,14 +386,17 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayMinSizePostingList) { PostingListHitSerializer serializer; - // Min Size = 10 - int size = serializer.GetMinPostingListSize(); + // Min Size = 12 + int pl_size = serializer.GetMinPostingListSize(); ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used, - PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); std::vector<HitElt> hits_in; - hits_in.emplace_back(Hit(1, 0, Hit::kDefaultTermFrequency)); + hits_in.emplace_back(Hit(/*section_id=*/1, /*document_id=*/0, + Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false)); hits_in.emplace_back( CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1)); hits_in.emplace_back( @@ -235,7 +412,7 @@ TEST(PostingListHitSerializerTest, ICING_ASSERT_OK_AND_ASSIGN( uint32_t num_can_prepend, (serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, &hits_in[0], hits_in.size(), false))); + &pl_used, &hits_in[0], hits_in.size(), /*keep_prepended=*/false))); EXPECT_THAT(num_can_prepend, Eq(2)); int can_fit_hits = num_can_prepend; @@ -243,10 +420,11 @@ TEST(PostingListHitSerializerTest, // problem, transitioning the PL from EMPTY -> ALMOST_FULL -> FULL const HitElt *hits_in_ptr = hits_in.data() + (hits_in.size() - 2); ICING_ASSERT_OK_AND_ASSIGN( - num_can_prepend, (serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, hits_in_ptr, can_fit_hits, false))); + num_can_prepend, + (serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, hits_in_ptr, can_fit_hits, /*keep_prepended=*/false))); EXPECT_THAT(num_can_prepend, Eq(can_fit_hits)); - EXPECT_THAT(size, Eq(serializer.GetBytesUsed(&pl_used))); + EXPECT_THAT(pl_size, Eq(serializer.GetBytesUsed(&pl_used))); std::deque<Hit> hits_pushed; std::transform(hits_in.rbegin(), hits_in.rend() - hits_in.size() + can_fit_hits, @@ -258,14 +436,17 @@ TEST(PostingListHitSerializerTest, TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) { PostingListHitSerializer serializer; - // Size = 30 - int size = 3 * serializer.GetMinPostingListSize(); + // Size = 36 + int pl_size = 3 * serializer.GetMinPostingListSize(); ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used, - PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); std::vector<HitElt> hits_in; - hits_in.emplace_back(Hit(1, 0, Hit::kDefaultTermFrequency)); + hits_in.emplace_back(Hit(/*section_id=*/1, /*document_id=*/0, + Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false)); hits_in.emplace_back( CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1)); hits_in.emplace_back( @@ -278,14 +459,14 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) { // The last hit is uncompressed and the four before it should only take one // byte. Total use = 8 bytes. // ---------------------- - // 29 delta(Hit #1) - // 28 delta(Hit #2) - // 27 delta(Hit #3) - // 26 delta(Hit #4) - // 25-22 Hit #5 - // 21-10 <unused> - // 9-5 kSpecialHit - // 4-0 Offset=22 + // 35 delta(Hit #0) + // 34 delta(Hit #1) + // 33 delta(Hit #2) + // 32 delta(Hit #3) + // 31-28 Hit #4 + // 27-12 <unused> + // 11-6 kSpecialHit + // 5-0 Offset=28 // ---------------------- int byte_size = sizeof(Hit::Value) + hits_in.size() - 1; @@ -294,7 +475,7 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) { ICING_ASSERT_OK_AND_ASSIGN( uint32_t num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, &hits_in[0], hits_in.size(), false))); + &pl_used, &hits_in[0], hits_in.size(), /*keep_prepended=*/false))); EXPECT_THAT(num_could_fit, Eq(hits_in.size())); EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used))); std::deque<Hit> hits_pushed; @@ -316,31 +497,35 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) { CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/3)); hits_in.emplace_back( CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2)); + hits_in.emplace_back( + CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/3)); std::reverse(hits_in.begin(), hits_in.end()); - // Size increased by the deltas of these hits (1+2+1+2+3+2) = 11 bytes + // Size increased by the deltas of these hits (1+2+1+2+3+2+3) = 14 bytes // ---------------------- - // 29 delta(Hit #1) - // 28 delta(Hit #2) - // 27 delta(Hit #3) - // 26 delta(Hit #4) - // 25 delta(Hit #5) - // 24-23 delta(Hit #6) - // 22 delta(Hit #7) - // 21-20 delta(Hit #8) - // 19-17 delta(Hit #9) - // 16-15 delta(Hit #10) - // 14-11 Hit #11 - // 10 <unused> - // 9-5 kSpecialHit - // 4-0 Offset=11 + // 35 delta(Hit #0) + // 34 delta(Hit #1) + // 33 delta(Hit #2) + // 32 delta(Hit #3) + // 31 delta(Hit #4) + // 30-29 delta(Hit #5) + // 28 delta(Hit #6) + // 27-26 delta(Hit #7) + // 25-23 delta(Hit #8) + // 22-21 delta(Hit #9) + // 20-18 delta(Hit #10) + // 17-14 Hit #11 + // 13-12 <unused> + // 11-6 kSpecialHit + // 5-0 Offset=14 // ---------------------- - byte_size += 11; + byte_size += 14; - // Add these 6 hits. The PL is currently in the NOT_FULL state and should + // Add these 7 hits. The PL is currently in the NOT_FULL state and should // remain in the NOT_FULL state. ICING_ASSERT_OK_AND_ASSIGN( - num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, &hits_in[0], hits_in.size(), false))); + num_could_fit, + (serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hits_in[0], hits_in.size(), /*keep_prepended=*/false))); EXPECT_THAT(num_could_fit, Eq(hits_in.size())); EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used))); // All hits from hits_in were added. @@ -353,29 +538,32 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) { hits_in.clear(); hits_in.emplace_back(first_hit); // ---------------------- - // 29 delta(Hit #1) - // 28 delta(Hit #2) - // 27 delta(Hit #3) - // 26 delta(Hit #4) - // 25 delta(Hit #5) - // 24-23 delta(Hit #6) - // 22 delta(Hit #7) - // 21-20 delta(Hit #8) - // 19-17 delta(Hit #9) - // 16-15 delta(Hit #10) - // 14-12 delta(Hit #11) - // 11-10 <unused> - // 9-5 Hit #12 - // 4-0 kSpecialHit + // 35 delta(Hit #0) + // 34 delta(Hit #1) + // 33 delta(Hit #2) + // 32 delta(Hit #3) + // 31 delta(Hit #4) + // 30-29 delta(Hit #5) + // 28 delta(Hit #6) + // 27-26 delta(Hit #7) + // 25-23 delta(Hit #8) + // 22-21 delta(Hit #9) + // 20-18 delta(Hit #10) + // 17-15 delta(Hit #11) + // 14-12 <unused> + // 11-6 Hit #12 + // 5-0 kSpecialHit // ---------------------- - byte_size = 25; + byte_size = 30; // 36 - 6 // Add this 1 hit. The PL is currently in the NOT_FULL state and should // transition to the ALMOST_FULL state - even though there is still some - // unused space. + // unused space. This is because the unused space (3 bytes) is less than + // the size of a uncompressed Hit. ICING_ASSERT_OK_AND_ASSIGN( - num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, &hits_in[0], hits_in.size(), false))); + num_could_fit, + (serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hits_in[0], hits_in.size(), /*keep_prepended=*/false))); EXPECT_THAT(num_could_fit, Eq(hits_in.size())); EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used))); // All hits from hits_in were added. @@ -388,37 +576,40 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) { hits_in.clear(); hits_in.emplace_back(first_hit); hits_in.emplace_back( - CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2)); + CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/3)); std::reverse(hits_in.begin(), hits_in.end()); // ---------------------- - // 29 delta(Hit #1) - // 28 delta(Hit #2) - // 27 delta(Hit #3) - // 26 delta(Hit #4) - // 25 delta(Hit #5) - // 24-23 delta(Hit #6) - // 22 delta(Hit #7) - // 21-20 delta(Hit #8) - // 19-17 delta(Hit #9) - // 16-15 delta(Hit #10) - // 14-12 delta(Hit #11) - // 11 delta(Hit #12) - // 10 <unused> - // 9-5 Hit #13 - // 4-0 Hit #14 + // 35 delta(Hit #0) + // 34 delta(Hit #1) + // 33 delta(Hit #2) + // 32 delta(Hit #3) + // 31 delta(Hit #4) + // 30-29 delta(Hit #5) + // 28 delta(Hit #6) + // 27-26 delta(Hit #7) + // 25-23 delta(Hit #8) + // 22-21 delta(Hit #9) + // 20-18 delta(Hit #10) + // 17-15 delta(Hit #11) + // 14 delta(Hit #12) + // 13-12 <unused> + // 11-6 Hit #13 + // 5-0 Hit #14 // ---------------------- - // Add these 2 hits. The PL is currently in the ALMOST_FULL state. Adding the - // first hit should keep the PL in ALMOST_FULL because the delta between Hit - // #12 and Hit #13 (1 byte) can fit in the unused area (2 bytes). Adding the - // second hit should tranisition to the FULL state because the delta between - // Hit #13 and Hit #14 (2 bytes) is larger than the remaining unused area - // (1 byte). + // Add these 2 hits. + // - The PL is currently in the ALMOST_FULL state. Adding the first hit should + // keep the PL in ALMOST_FULL because the delta between + // Hit #13 and Hit #14 (1 byte) can fit in the unused area (3 bytes). + // - Adding the second hit should transition to the FULL state because the + // delta between Hit #14 and Hit #15 (3 bytes) is larger than the remaining + // unused area (2 byte). ICING_ASSERT_OK_AND_ASSIGN( - num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, &hits_in[0], hits_in.size(), false))); + num_could_fit, + (serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hits_in[0], hits_in.size(), /*keep_prepended=*/false))); EXPECT_THAT(num_could_fit, Eq(hits_in.size())); - EXPECT_THAT(size, Eq(serializer.GetBytesUsed(&pl_used))); + EXPECT_THAT(pl_size, Eq(serializer.GetBytesUsed(&pl_used))); // All hits from hits_in were added. std::transform(hits_in.rbegin(), hits_in.rend(), std::front_inserter(hits_pushed), HitElt::get_hit); @@ -431,9 +622,9 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayTooManyHits) { static constexpr int kNumHits = 128; static constexpr int kDeltaSize = 1; - static constexpr int kTermFrequencySize = 1; static constexpr size_t kHitsSize = - ((kNumHits * (kDeltaSize + kTermFrequencySize)) / 5) * 5; + ((kNumHits - 2) * kDeltaSize + (2 * sizeof(Hit))) / sizeof(Hit) * + sizeof(Hit); // Create an array with one too many hits std::vector<Hit> hits_in_too_many = @@ -442,18 +633,21 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayTooManyHits) { for (const Hit &hit : hits_in_too_many) { hit_elts_in_too_many.emplace_back(hit); } + // Reverse so that hits are inserted in descending order + std::reverse(hit_elts_in_too_many.begin(), hit_elts_in_too_many.end()); + ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used, PostingListUsed::CreateFromUnitializedRegion( &serializer, serializer.GetMinPostingListSize())); - // PrependHitArray should fail because hit_elts_in_too_many is far too large // for the minimum size pl. ICING_ASSERT_OK_AND_ASSIGN( uint32_t num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>( &pl_used, &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), - false))); + /*keep_prepended=*/false))); + ASSERT_THAT(num_could_fit, Eq(2)); ASSERT_THAT(num_could_fit, Lt(hit_elts_in_too_many.size())); ASSERT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0)); ASSERT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty())); @@ -464,10 +658,11 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayTooManyHits) { // PrependHitArray should fail because hit_elts_in_too_many is one hit too // large for this pl. ICING_ASSERT_OK_AND_ASSIGN( - num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>( - &pl_used, &hit_elts_in_too_many[0], - hit_elts_in_too_many.size(), false))); - ASSERT_THAT(num_could_fit, Lt(hit_elts_in_too_many.size())); + num_could_fit, + (serializer.PrependHitArray<HitElt, HitElt::get_hit>( + &pl_used, &hit_elts_in_too_many[0], hit_elts_in_too_many.size(), + /*keep_prepended=*/false))); + ASSERT_THAT(num_could_fit, Eq(hit_elts_in_too_many.size() - 1)); ASSERT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0)); ASSERT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty())); } @@ -476,16 +671,37 @@ TEST(PostingListHitSerializerTest, PostingListStatusJumpFromNotFullToFullAndBack) { PostingListHitSerializer serializer; + // Size = 18 const uint32_t pl_size = 3 * sizeof(Hit); ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl, PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); - ICING_ASSERT_OK(serializer.PrependHit(&pl, Hit(Hit::kInvalidValue - 1, 0))); + + Hit max_valued_hit(kMaxSectionId, kMinDocumentId, Hit::kMaxTermFrequency, + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true); + ICING_ASSERT_OK(serializer.PrependHit(&pl, max_valued_hit)); uint32_t bytes_used = serializer.GetBytesUsed(&pl); + ASSERT_THAT(bytes_used, sizeof(Hit::Value) + sizeof(Hit::Flags) + + sizeof(Hit::TermFrequency)); // Status not full. ASSERT_THAT(bytes_used, Le(pl_size - PostingListHitSerializer::kSpecialHitsSize)); - ICING_ASSERT_OK(serializer.PrependHit(&pl, Hit(Hit::kInvalidValue >> 2, 0))); + + Hit min_valued_hit(kMinSectionId, kMaxDocumentId, Hit::kMaxTermFrequency, + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true); + uint8_t delta_buf[VarInt::kMaxEncodedLen64]; + size_t delta_len = PostingListHitSerializer::EncodeNextHitValue( + /*next_hit_value=*/min_valued_hit.value(), + /*curr_hit_value=*/max_valued_hit.value(), delta_buf); + // The compressed region available is pl_size - 2 * sizeof(specialHits) = 6 + // We need to also fit max_valued_hit's flags and term-frequency fields, which + // each take 1 byte + // So we'll jump directly to FULL if the varint-encoded delta of the 2 hits > + // 6 - sizeof(Hit::Flags) - sizeof(Hit::TermFrequency) = 4 + ASSERT_THAT(delta_len, Gt(4)); + ICING_ASSERT_OK(serializer.PrependHit( + &pl, Hit(kMinSectionId, kMaxDocumentId, Hit::kMaxTermFrequency, + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true))); // Status should jump to full directly. ASSERT_THAT(serializer.GetBytesUsed(&pl), Eq(pl_size)); ICING_ASSERT_OK(serializer.PopFrontHits(&pl, 1)); @@ -501,11 +717,12 @@ TEST(PostingListHitSerializerTest, DeltaOverflow) { PostingListUsed pl, PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); + static const Hit::Value kMaxHitValue = std::numeric_limits<Hit::Value>::max(); static const Hit::Value kOverflow[4] = { - Hit::kInvalidValue >> 2, - (Hit::kInvalidValue >> 2) * 2, - (Hit::kInvalidValue >> 2) * 3, - Hit::kInvalidValue - 1, + kMaxHitValue >> 2, + (kMaxHitValue >> 2) * 2, + (kMaxHitValue >> 2) * 3, + kMaxHitValue - 1, }; // Fit at least 4 ordinary values. @@ -516,22 +733,245 @@ TEST(PostingListHitSerializerTest, DeltaOverflow) { // Cannot fit 4 overflow values. ICING_ASSERT_OK_AND_ASSIGN( pl, PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); - ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[3]))); - ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[2]))); + Hit::Flags has_term_frequency_flags = 0b1; + ICING_EXPECT_OK(serializer.PrependHit( + &pl, Hit(/*value=*/kOverflow[3], has_term_frequency_flags, + /*term_frequency=*/8))); + ICING_EXPECT_OK(serializer.PrependHit( + &pl, Hit(/*value=*/kOverflow[2], has_term_frequency_flags, + /*term_frequency=*/8))); // Can fit only one more. - ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[1]))); - EXPECT_THAT(serializer.PrependHit(&pl, Hit(kOverflow[0])), + ICING_EXPECT_OK(serializer.PrependHit( + &pl, Hit(/*value=*/kOverflow[1], has_term_frequency_flags, + /*term_frequency=*/8))); + EXPECT_THAT(serializer.PrependHit( + &pl, Hit(/*value=*/kOverflow[0], has_term_frequency_flags, + /*term_frequency=*/8)), + StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); +} + +TEST(PostingListHitSerializerTest, GetMinPostingListToFitForNotFullPL) { + PostingListHitSerializer serializer; + + // Size = 24 + int pl_size = 2 * serializer.GetMinPostingListSize(); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); + // Create and add some hits to make pl_used NOT_FULL + std::vector<Hit> hits_in = + CreateHits(/*num_hits=*/7, /*desired_byte_length=*/1); + for (const Hit &hit : hits_in) { + ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit)); + } + // ---------------------- + // 23 delta(Hit #0) + // 22 delta(Hit #1) + // 21 delta(Hit #2) + // 20 delta(Hit #3) + // 19 delta(Hit #4) + // 18 delta(Hit #5) + // 17-14 Hit #6 + // 13-12 <unused> + // 11-6 kSpecialHit + // 5-0 Offset=14 + // ---------------------- + int bytes_used = 10; + + // Check that all hits have been inserted + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(bytes_used)); + std::deque<Hit> hits_pushed(hits_in.rbegin(), hits_in.rend()); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAreArray(hits_pushed))); + + // Get the min size to fit for the hits in pl_used. Moving the hits in pl_used + // into a posting list with this min size should make it ALMOST_FULL, which we + // can see should have size = 18. + // ---------------------- + // 17 delta(Hit #0) + // 16 delta(Hit #1) + // 15 delta(Hit #2) + // 14 delta(Hit #3) + // 13 delta(Hit #4) + // 12 delta(Hit #5) + // 11-6 Hit #6 + // 5-0 kSpecialHit + // ---------------------- + int expected_min_size = 18; + uint32_t min_size_to_fit = serializer.GetMinPostingListSizeToFit(&pl_used); + EXPECT_THAT(min_size_to_fit, Eq(expected_min_size)); + + // Also check that this min size to fit posting list actually does fit all the + // hits and can only hit one more hit in the ALMOST_FULL state. + ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed min_size_to_fit_pl, + PostingListUsed::CreateFromUnitializedRegion( + &serializer, min_size_to_fit)); + for (const Hit &hit : hits_in) { + ICING_ASSERT_OK(serializer.PrependHit(&min_size_to_fit_pl, hit)); + } + + // Adding another hit to the min-size-to-fit posting list should succeed + Hit hit = CreateHit(hits_in.back(), /*desired_byte_length=*/1); + ICING_ASSERT_OK(serializer.PrependHit(&min_size_to_fit_pl, hit)); + // Adding any other hits should fail with RESOURCE_EXHAUSTED error. + EXPECT_THAT(serializer.PrependHit(&min_size_to_fit_pl, + CreateHit(hit, /*desired_byte_length=*/1)), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); + + // Check that all hits have been inserted and the min-fit posting list is now + // FULL. + EXPECT_THAT(serializer.GetBytesUsed(&min_size_to_fit_pl), + Eq(min_size_to_fit)); + hits_pushed.emplace_front(hit); + EXPECT_THAT(serializer.GetHits(&min_size_to_fit_pl), + IsOkAndHolds(ElementsAreArray(hits_pushed))); +} + +TEST(PostingListHitSerializerTest, GetMinPostingListToFitForTwoHits) { + PostingListHitSerializer serializer; + + // Size = 36 + int pl_size = 3 * serializer.GetMinPostingListSize(); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); + + // Create and add 2 hits + Hit first_hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/5, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); + std::vector<Hit> hits_in = + CreateHits(first_hit, /*num_hits=*/2, /*desired_byte_length=*/4); + for (const Hit &hit : hits_in) { + ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit)); + } + // ---------------------- + // 35 term-frequency(Hit #0) + // 34 flags(Hit #0) + // 33-30 delta(Hit #0) + // 29 term-frequency(Hit #1) + // 28 flags(Hit #1) + // 27-24 Hit #1 + // 23-12 <unused> + // 11-6 kSpecialHit + // 5-0 Offset=24 + // ---------------------- + int bytes_used = 12; + + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(bytes_used)); + std::deque<Hit> hits_pushed(hits_in.rbegin(), hits_in.rend()); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAreArray(hits_pushed))); + + // GetMinPostingListSizeToFit should return min posting list size. + EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used), + Eq(serializer.GetMinPostingListSize())); +} + +TEST(PostingListHitSerializerTest, GetMinPostingListToFitForThreeSmallHits) { + PostingListHitSerializer serializer; + + // Size = 24 + int pl_size = 2 * serializer.GetMinPostingListSize(); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); + // Create and add 3 small hits that fit in the size range where we should be + // checking for whether the PL has only 2 hits + std::vector<Hit> hits_in = + CreateHits(/*num_hits=*/3, /*desired_byte_length=*/1); + for (const Hit &hit : hits_in) { + ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit)); + } + // ---------------------- + // 23 delta(Hit #0) + // 22 delta(Hit #1) + // 21-18 Hit #2 + // 17-12 <unused> + // 11-6 kSpecialHit + // 5-0 Offset=18 + // ---------------------- + int bytes_used = 6; + + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(bytes_used)); + std::deque<Hit> hits_pushed(hits_in.rbegin(), hits_in.rend()); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAreArray(hits_pushed))); + + // Get the min size to fit for the hits in pl_used. Moving the hits in pl_used + // into a posting list with this min size should make it ALMOST_FULL, which we + // can see should have size = 14. This should not return the min posting list + // size. + // ---------------------- + // 13 delta(Hit #0) + // 12 delta(Hit #1) + // 11-6 Hit #2 + // 5-0 kSpecialHit + // ---------------------- + int expected_min_size = 14; + + EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used), + Gt(serializer.GetMinPostingListSize())); + EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used), + Eq(expected_min_size)); +} + +TEST(PostingListHitSerializerTest, + GetMinPostingListToFitForAlmostFullAndFullPLReturnsSameSize) { + PostingListHitSerializer serializer; + + // Size = 24 + int pl_size = 2 * serializer.GetMinPostingListSize(); + ICING_ASSERT_OK_AND_ASSIGN( + PostingListUsed pl_used, + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); + // Create and add some hits to make pl_used ALMOST_FULL + std::vector<Hit> hits_in = + CreateHits(/*num_hits=*/7, /*desired_byte_length=*/2); + for (const Hit &hit : hits_in) { + ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit)); + } + // ---------------------- + // 23-22 delta(Hit #0) + // 21-20 delta(Hit #1) + // 19-18 delta(Hit #2) + // 17-16 delta(Hit #3) + // 15-14 delta(Hit #4) + // 13-12 delta(Hit #5) + // 11-6 Hit #6 + // 5-0 kSpecialHit + // ---------------------- + int bytes_used = 18; + + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(bytes_used)); + std::deque<Hit> hits_pushed(hits_in.rbegin(), hits_in.rend()); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAreArray(hits_pushed))); + + // GetMinPostingListSizeToFit should return the same size as pl_used. + uint32_t min_size_to_fit = serializer.GetMinPostingListSizeToFit(&pl_used); + EXPECT_THAT(min_size_to_fit, Eq(pl_size)); + + // Add another hit to make the posting list FULL + Hit hit = CreateHit(hits_in.back(), /*desired_byte_length=*/1); + ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit)); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(pl_size)); + hits_pushed.emplace_front(hit); + EXPECT_THAT(serializer.GetHits(&pl_used), + IsOkAndHolds(ElementsAreArray(hits_pushed))); + + // GetMinPostingListSizeToFit should still be the same size as pl_used. + min_size_to_fit = serializer.GetMinPostingListSizeToFit(&pl_used); + EXPECT_THAT(min_size_to_fit, Eq(pl_size)); } TEST(PostingListHitSerializerTest, MoveFrom) { PostingListHitSerializer serializer; - int size = 3 * serializer.GetMinPostingListSize(); + int pl_size = 3 * serializer.GetMinPostingListSize(); ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used1, - PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); std::vector<Hit> hits1 = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1); for (const Hit &hit : hits1) { @@ -540,7 +980,7 @@ TEST(PostingListHitSerializerTest, MoveFrom) { ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used2, - PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); std::vector<Hit> hits2 = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2); for (const Hit &hit : hits2) { @@ -556,10 +996,10 @@ TEST(PostingListHitSerializerTest, MoveFrom) { TEST(PostingListHitSerializerTest, MoveFromNullArgumentReturnsInvalidArgument) { PostingListHitSerializer serializer; - int size = 3 * serializer.GetMinPostingListSize(); + int pl_size = 3 * serializer.GetMinPostingListSize(); ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used1, - PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); std::vector<Hit> hits = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1); for (const Hit &hit : hits) { ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit)); @@ -575,10 +1015,10 @@ TEST(PostingListHitSerializerTest, MoveFromInvalidPostingListReturnsInvalidArgument) { PostingListHitSerializer serializer; - int size = 3 * serializer.GetMinPostingListSize(); + int pl_size = 3 * serializer.GetMinPostingListSize(); ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used1, - PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); std::vector<Hit> hits1 = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1); for (const Hit &hit : hits1) { @@ -587,7 +1027,7 @@ TEST(PostingListHitSerializerTest, ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used2, - PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); std::vector<Hit> hits2 = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2); for (const Hit &hit : hits2) { @@ -595,7 +1035,7 @@ TEST(PostingListHitSerializerTest, } // Write invalid hits to the beginning of pl_used1 to make it invalid. - Hit invalid_hit; + Hit invalid_hit(Hit::kInvalidValue); Hit *first_hit = reinterpret_cast<Hit *>(pl_used1.posting_list_buffer()); *first_hit = invalid_hit; ++first_hit; @@ -610,10 +1050,10 @@ TEST(PostingListHitSerializerTest, MoveToInvalidPostingListReturnsFailedPrecondition) { PostingListHitSerializer serializer; - int size = 3 * serializer.GetMinPostingListSize(); + int pl_size = 3 * serializer.GetMinPostingListSize(); ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used1, - PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); std::vector<Hit> hits1 = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1); for (const Hit &hit : hits1) { @@ -622,7 +1062,7 @@ TEST(PostingListHitSerializerTest, ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used2, - PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); std::vector<Hit> hits2 = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2); for (const Hit &hit : hits2) { @@ -630,7 +1070,7 @@ TEST(PostingListHitSerializerTest, } // Write invalid hits to the beginning of pl_used2 to make it invalid. - Hit invalid_hit; + Hit invalid_hit(Hit::kInvalidValue); Hit *first_hit = reinterpret_cast<Hit *>(pl_used2.posting_list_buffer()); *first_hit = invalid_hit; ++first_hit; @@ -644,10 +1084,10 @@ TEST(PostingListHitSerializerTest, TEST(PostingListHitSerializerTest, MoveToPostingListTooSmall) { PostingListHitSerializer serializer; - int size = 3 * serializer.GetMinPostingListSize(); + int pl_size = 3 * serializer.GetMinPostingListSize(); ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used1, - PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); std::vector<Hit> hits1 = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1); for (const Hit &hit : hits1) { @@ -672,30 +1112,36 @@ TEST(PostingListHitSerializerTest, MoveToPostingListTooSmall) { IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend()))); } -TEST(PostingListHitSerializerTest, PopHitsWithScores) { +TEST(PostingListHitSerializerTest, PopHitsWithTermFrequenciesAndFlags) { PostingListHitSerializer serializer; - int size = 2 * serializer.GetMinPostingListSize(); + // Size = 24 + int pl_size = 2 * serializer.GetMinPostingListSize(); ICING_ASSERT_OK_AND_ASSIGN( PostingListUsed pl_used, - PostingListUsed::CreateFromUnitializedRegion(&serializer, size)); + PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size)); - // This posting list is 20-bytes. Create four hits that will have deltas of - // two bytes each and all of whom will have a non-default score. This posting - // list will be almost_full. + // This posting list is 24-bytes. Create four hits that will have deltas of + // two bytes each and all of whom will have a non-default term-frequency. This + // posting list will be almost_full. // // ---------------------- - // 19 score(Hit #0) - // 18-17 delta(Hit #0) - // 16 score(Hit #1) - // 15-14 delta(Hit #1) - // 13 score(Hit #2) - // 12-11 delta(Hit #2) - // 10 <unused> - // 9-5 Hit #3 - // 4-0 kInvalidHitVal + // 23 term-frequency(Hit #0) + // 22 flags(Hit #0) + // 21-20 delta(Hit #0) + // 19 term-frequency(Hit #1) + // 18 flags(Hit #1) + // 17-16 delta(Hit #1) + // 15 term-frequency(Hit #2) + // 14 flags(Hit #2) + // 13-12 delta(Hit #2) + // 11-6 Hit #3 + // 5-0 kInvalidHit // ---------------------- - Hit hit0(/*section_id=*/0, /*document_id=*/0, /*score=*/5); + int bytes_used = 18; + + Hit hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/5, + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/2); Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2); Hit hit3 = CreateHit(hit2, /*desired_byte_length=*/2); @@ -707,22 +1153,26 @@ TEST(PostingListHitSerializerTest, PopHitsWithScores) { ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> hits_out, serializer.GetHits(&pl_used)); EXPECT_THAT(hits_out, ElementsAre(hit3, hit2, hit1, hit0)); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(bytes_used)); // Now, pop the last hit. The posting list should contain the first three // hits. // // ---------------------- - // 19 score(Hit #0) - // 18-17 delta(Hit #0) - // 16 score(Hit #1) - // 15-14 delta(Hit #1) - // 13-10 <unused> - // 9-5 Hit #2 - // 4-0 kInvalidHitVal + // 23 term-frequency(Hit #0) + // 22 flags(Hit #0) + // 21-20 delta(Hit #0) + // 19 term-frequency(Hit #1) + // 18 flags(Hit #1) + // 17-16 delta(Hit #1) + // 15-12 <unused> + // 11-6 Hit #2 + // 5-0 kInvalidHit // ---------------------- ICING_ASSERT_OK(serializer.PopFrontHits(&pl_used, 1)); ICING_ASSERT_OK_AND_ASSIGN(hits_out, serializer.GetHits(&pl_used)); EXPECT_THAT(hits_out, ElementsAre(hit2, hit1, hit0)); + EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(bytes_used)); } } // namespace diff --git a/icing/query/query-processor.cc b/icing/query/query-processor.cc index 74e51cf..bc0917b 100644 --- a/icing/query/query-processor.cc +++ b/icing/query/query-processor.cc @@ -172,8 +172,10 @@ libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseSearch( search_spec.enabled_features().end()); for (const Feature feature : results.features_in_use) { if (enabled_features.find(feature) == enabled_features.end()) { - return absl_ports::InvalidArgumentError( - absl_ports::StrCat("Attempted use of unenabled feature ", feature)); + return absl_ports::InvalidArgumentError(absl_ports::StrCat( + "Attempted use of unenabled feature ", feature, + ". Please make sure that you have explicitly set all advanced query " + "features used in this query as enabled in the SearchSpec.")); } } diff --git a/icing/testing/hit-test-utils.cc b/icing/testing/hit-test-utils.cc index 7ad8a64..2fd3ac8 100644 --- a/icing/testing/hit-test-utils.cc +++ b/icing/testing/hit-test-utils.cc @@ -14,29 +14,44 @@ #include "icing/testing/hit-test-utils.h" +#include <cstdint> +#include <vector> + +#include "icing/index/hit/hit.h" +#include "icing/index/main/posting-list-hit-serializer.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" + namespace icing { namespace lib { // Returns a hit that has a delta of desired_byte_length from last_hit. -Hit CreateHit(Hit last_hit, int desired_byte_length) { - Hit hit = (last_hit.section_id() == kMinSectionId) - ? Hit(kMaxSectionId, last_hit.document_id() + 1, - last_hit.term_frequency()) - : Hit(last_hit.section_id() - 1, last_hit.document_id(), - last_hit.term_frequency()); +Hit CreateHit(const Hit& last_hit, int desired_byte_length) { + return CreateHit(last_hit, desired_byte_length, last_hit.term_frequency(), + /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false); +} + +// Returns a hit that has a delta of desired_byte_length from last_hit, with +// the desired term_frequency and flags +Hit CreateHit(const Hit& last_hit, int desired_byte_length, + Hit::TermFrequency term_frequency, bool is_in_prefix_section, + bool is_prefix_hit) { + Hit hit = last_hit; uint8_t buf[5]; - while (VarInt::Encode(last_hit.value() - hit.value(), buf) < - desired_byte_length) { + do { hit = (hit.section_id() == kMinSectionId) - ? Hit(kMaxSectionId, hit.document_id() + 1, hit.term_frequency()) - : Hit(hit.section_id() - 1, hit.document_id(), - hit.term_frequency()); - } + ? Hit(kMaxSectionId, hit.document_id() + 1, term_frequency, + is_in_prefix_section, is_prefix_hit) + : Hit(hit.section_id() - 1, hit.document_id(), term_frequency, + is_in_prefix_section, is_prefix_hit); + } while (PostingListHitSerializer::EncodeNextHitValue( + /*next_hit_value=*/hit.value(), + /*curr_hit_value=*/last_hit.value(), buf) < desired_byte_length); return hit; } // Returns a vector of num_hits Hits with the first hit starting at start_docid -// and with 1-byte deltas. +// and with deltas of the desired byte length. std::vector<Hit> CreateHits(DocumentId start_docid, int num_hits, int desired_byte_length) { std::vector<Hit> hits; @@ -44,13 +59,30 @@ std::vector<Hit> CreateHits(DocumentId start_docid, int num_hits, return hits; } hits.push_back(Hit(/*section_id=*/1, /*document_id=*/start_docid, - Hit::kDefaultTermFrequency)); + Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false, + /*is_prefix_hit=*/false)); while (hits.size() < num_hits) { hits.push_back(CreateHit(hits.back(), desired_byte_length)); } return hits; } +// Returns a vector of num_hits Hits with the first hit being the desired byte +// length from last_hit, and with deltas of the same desired byte length. +std::vector<Hit> CreateHits(const Hit& last_hit, int num_hits, + int desired_byte_length) { + std::vector<Hit> hits; + if (num_hits < 1) { + return hits; + } + hits.reserve(num_hits); + for (int i = 0; i < num_hits; ++i) { + hits.push_back( + CreateHit(hits.empty() ? last_hit : hits.back(), desired_byte_length)); + } + return hits; +} + std::vector<Hit> CreateHits(int num_hits, int desired_byte_length) { return CreateHits(/*start_docid=*/0, num_hits, desired_byte_length); } diff --git a/icing/testing/hit-test-utils.h b/icing/testing/hit-test-utils.h index e236ec0..2953c5c 100644 --- a/icing/testing/hit-test-utils.h +++ b/icing/testing/hit-test-utils.h @@ -18,21 +18,30 @@ #include <vector> #include "icing/index/hit/hit.h" -#include "icing/legacy/index/icing-bit-util.h" -#include "icing/schema/section.h" #include "icing/store/document-id.h" namespace icing { namespace lib { // Returns a hit that has a delta of desired_byte_length from last_hit. -Hit CreateHit(Hit last_hit, int desired_byte_length); +Hit CreateHit(const Hit& last_hit, int desired_byte_length); + +// Returns a hit that has a delta of desired_byte_length from last_hit, with +// the desired term_frequency and flags +Hit CreateHit(const Hit& last_hit, int desired_byte_length, + Hit::TermFrequency term_frequency, bool is_in_prefix_section, + bool is_prefix_hit); // Returns a vector of num_hits Hits with the first hit starting at start_docid // and with desired_byte_length deltas. std::vector<Hit> CreateHits(DocumentId start_docid, int num_hits, int desired_byte_length); +// Returns a vector of num_hits Hits with the first hit being the desired byte +// length from last_hit, and with deltas of the same desired byte length. +std::vector<Hit> CreateHits(const Hit& last_hit, int num_hits, + int desired_byte_length); + // Returns a vector of num_hits Hits with the first hit starting at 0 and each // with desired_byte_length deltas. std::vector<Hit> CreateHits(int num_hits, int desired_byte_length); diff --git a/java/src/com/google/android/icing/IcingSearchEngineInterface.java b/java/src/com/google/android/icing/IcingSearchEngineInterface.java index 0bc58f1..67f60ed 100644 --- a/java/src/com/google/android/icing/IcingSearchEngineInterface.java +++ b/java/src/com/google/android/icing/IcingSearchEngineInterface.java @@ -32,7 +32,7 @@ import com.google.android.icing.proto.SuggestionSpecProto; import com.google.android.icing.proto.UsageReport; import java.io.Closeable; -/** A common user-facing interface to expose the funcationalities provided by Icing Library. */ +/** A common user-facing interface to expose the functionalities provided by Icing Library. */ public interface IcingSearchEngineInterface extends Closeable { /** * Initializes the current IcingSearchEngine implementation. diff --git a/proto/icing/proto/schema.proto b/proto/icing/proto/schema.proto index c716dba..78e1588 100644 --- a/proto/icing/proto/schema.proto +++ b/proto/icing/proto/schema.proto @@ -34,7 +34,7 @@ option objc_class_prefix = "ICNG"; // TODO(cassiewang) Define a sample proto file that can be used by tests and for // documentation. // -// Next tag: 7 +// Next tag: 8 message SchemaTypeConfigProto { // REQUIRED: Named type that uniquely identifies the structured, logical // schema being defined. @@ -43,6 +43,13 @@ message SchemaTypeConfigProto { // in http://schema.org. Eg: DigitalDocument, Message, Person, etc. optional string schema_type = 1; + // OPTIONAL: A natural language description of the SchemaTypeConfigProto. + // + // This string is not used by Icing in any way. It simply exists to allow + // users to store semantic information about the SchemaTypeConfigProto for + // future retrieval. + optional string description = 7; + // List of all properties that are supported by Documents of this type. // An Document should never have properties that are not listed here. // @@ -208,7 +215,7 @@ message JoinableConfig { // Describes the schema of a single property of Documents that belong to a // specific SchemaTypeConfigProto. These can be considered as a rich, structured // type for each property of Documents accepted by IcingSearchEngine. -// Next tag: 9 +// Next tag: 10 message PropertyConfigProto { // REQUIRED: Name that uniquely identifies a property within an Document of // a specific SchemaTypeConfigProto. @@ -219,6 +226,13 @@ message PropertyConfigProto { // Eg: 'address' for http://schema.org/Place. optional string property_name = 1; + // OPTIONAL: A natural language description of the property. + // + // This string is not used by Icing in any way. It simply exists to allow + // users to store semantic information about the PropertyConfigProto for + // future retrieval. + optional string description = 9; + // REQUIRED: Physical data-types of the contents of the property. message DataType { enum Code { diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index a36a007..c22a1e8 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=596096845) +set(synced_AOSP_CL_number=613977851) |