author     Tim Barron <tjbarron@google.com>  2024-03-07 00:42:57 +0000
committer  Tim Barron <tjbarron@google.com>  2024-03-12 15:41:59 +0000
commit     29d4712b67d1ade154739e5fa9a9a7970afe6c0a (patch)
tree       0a26da402c204c50e001539cffc89abbe51b08b6
parent     030b99d2648aea03e5c2e3809a13fdfacdccde11 (diff)
download   icing-29d4712b67d1ade154739e5fa9a9a7970afe6c0a.tar.gz
Update Icing from upstream.
Descriptions:
========================================================================
Allow adding duplicate hits with the same value into the posting list.
========================================================================
Support an extension to the encoded Hit in Icing's posting list.
========================================================================
Adds description fields to SchemaTypeConfigProto and PropertyConfigProto
========================================================================
Support polymorphism in type property filters
========================================================================
Fix posting list GetMinPostingListSizeToFit size calculation bug
========================================================================
Add instructions to error message for advanced query backward compat.
========================================================================

BUG: 294274922
BUG: 321107391
BUG: 324908653
BUG: 326987971

Change-Id: I9e8e0589a9682ee14ac34ded32b959c1ef64d84d
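At a glance, the Hit extension below adds a dedicated flags byte next to the packed value, so metadata such as has_term_frequency no longer competes with the document_id/section_id bits. A minimal sketch of the resulting API, mirroring the constructor and accessors introduced in icing/index/hit/hit.h in this change (the argument values are arbitrary illustrations):

    // All five constructor arguments are now explicit; a non-default
    // term frequency sets kHasTermFrequency in the flags byte, which in
    // turn sets the has_flags bit inside the packed value.
    Hit hit(/*section_id=*/3, /*document_id=*/7, /*term_frequency=*/42,
            /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
    assert(hit.has_flags());           // flags byte is non-zero
    assert(hit.has_term_frequency());  // 42 != Hit::kDefaultTermFrequency
    assert(hit.flags() != Hit::kNoEnabledFlags);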
-rw-r--r--  icing/file/posting_list/flash-index-storage-header.h | 2
-rw-r--r--  icing/file/posting_list/flash-index-storage_test.cc | 139
-rw-r--r--  icing/file/posting_list/index-block_test.cc | 115
-rw-r--r--  icing/icing-search-engine_search_test.cc | 103
-rw-r--r--  icing/index/hit/hit.cc | 96
-rw-r--r--  icing/index/hit/hit.h | 153
-rw-r--r--  icing/index/hit/hit_test.cc | 156
-rw-r--r--  icing/index/index.cc | 10
-rw-r--r--  icing/index/index_test.cc | 3
-rw-r--r--  icing/index/iterator/doc-hit-info-iterator-section-restrict.cc | 12
-rw-r--r--  icing/index/lite/lite-index-header.h | 2
-rw-r--r--  icing/index/lite/lite-index.cc | 5
-rw-r--r--  icing/index/lite/lite-index_test.cc | 191
-rw-r--r--  icing/index/lite/lite-index_thread-safety_test.cc | 42
-rw-r--r--  icing/index/lite/term-id-hit-pair.h | 64
-rw-r--r--  icing/index/lite/term-id-hit-pair_test.cc | 95
-rw-r--r--  icing/index/main/main-index-merger.cc | 10
-rw-r--r--  icing/index/main/main-index-merger_test.cc | 38
-rw-r--r--  icing/index/main/main-index.cc | 2
-rw-r--r--  icing/index/main/main-index_test.cc | 60
-rw-r--r--  icing/index/main/posting-list-hit-accessor_test.cc | 67
-rw-r--r--  icing/index/main/posting-list-hit-serializer.cc | 254
-rw-r--r--  icing/index/main/posting-list-hit-serializer.h | 64
-rw-r--r--  icing/index/main/posting-list-hit-serializer_test.cc | 828
-rw-r--r--  icing/query/query-processor.cc | 6
-rw-r--r--  icing/testing/hit-test-utils.cc | 60
-rw-r--r--  icing/testing/hit-test-utils.h | 15
-rw-r--r--  java/src/com/google/android/icing/IcingSearchEngineInterface.java | 2
-rw-r--r--  proto/icing/proto/schema.proto | 18
-rw-r--r--  synced_AOSP_CL_number.txt | 2
30 files changed, 1902 insertions, 712 deletions
diff --git a/icing/file/posting_list/flash-index-storage-header.h b/icing/file/posting_list/flash-index-storage-header.h
index 6bbf1ba..f7b331c 100644
--- a/icing/file/posting_list/flash-index-storage-header.h
+++ b/icing/file/posting_list/flash-index-storage-header.h
@@ -33,7 +33,7 @@ class HeaderBlock {
// The class used to access the actual header.
struct Header {
// A magic used to mark the beginning of a valid header.
- static constexpr int kMagic = 0xb0780cf4;
+ static constexpr int kMagic = 0xd1b7b293;
int magic;
int block_size;
int last_indexed_docid;
diff --git a/icing/file/posting_list/flash-index-storage_test.cc b/icing/file/posting_list/flash-index-storage_test.cc
index ef60037..203041e 100644
--- a/icing/file/posting_list/flash-index-storage_test.cc
+++ b/icing/file/posting_list/flash-index-storage_test.cc
@@ -16,9 +16,9 @@
#include <unistd.h>
-#include <algorithm>
-#include <cstdlib>
-#include <limits>
+#include <cstdint>
+#include <memory>
+#include <string>
#include <utility>
#include <vector>
@@ -27,6 +27,7 @@
#include "gtest/gtest.h"
#include "icing/file/filesystem.h"
#include "icing/file/posting_list/flash-index-storage-header.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
#include "icing/index/hit/hit.h"
#include "icing/index/main/posting-list-hit-serializer.h"
#include "icing/store/document-id.h"
@@ -213,10 +214,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) {
EXPECT_THAT(flash_index_storage.empty(), IsFalse());
std::vector<Hit> hits1 = {
- Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12),
- Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19),
- Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100),
- Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)};
+ Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)};
for (const Hit& hit : hits1) {
ICING_ASSERT_OK(
serializer_->PrependHit(&posting_list_holder1.posting_list, hit));
@@ -237,10 +242,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) {
EXPECT_THAT(flash_index_storage.empty(), IsFalse());
std::vector<Hit> hits2 = {
- Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12),
- Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19),
- Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100),
- Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)};
+ Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)};
for (const Hit& hit : hits2) {
ICING_ASSERT_OK(
serializer_->PrependHit(&posting_list_holder2.posting_list, hit));
@@ -273,10 +282,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemory) {
EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
IsOkAndHolds(IsEmpty()));
std::vector<Hit> hits3 = {
- Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62),
- Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45),
- Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12),
- Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)};
+ Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)};
for (const Hit& hit : hits3) {
ICING_ASSERT_OK(
serializer_->PrependHit(&posting_list_holder3.posting_list, hit));
@@ -314,10 +327,14 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) {
EXPECT_THAT(flash_index_storage.empty(), IsFalse());
std::vector<Hit> hits1 = {
- Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12),
- Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19),
- Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100),
- Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)};
+ Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)};
for (const Hit& hit : hits1) {
ICING_ASSERT_OK(
serializer_->PrependHit(&posting_list_holder1.posting_list, hit));
@@ -338,10 +355,14 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) {
EXPECT_THAT(flash_index_storage.empty(), IsFalse());
std::vector<Hit> hits2 = {
- Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12),
- Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19),
- Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100),
- Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)};
+ Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)};
for (const Hit& hit : hits2) {
ICING_ASSERT_OK(
serializer_->PrependHit(&posting_list_holder2.posting_list, hit));
@@ -374,10 +395,14 @@ TEST_F(FlashIndexStorageTest, FreeListNotInMemory) {
EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
IsOkAndHolds(IsEmpty()));
std::vector<Hit> hits3 = {
- Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62),
- Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45),
- Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12),
- Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)};
+ Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)};
for (const Hit& hit : hits3) {
ICING_ASSERT_OK(
serializer_->PrependHit(&posting_list_holder3.posting_list, hit));
@@ -417,10 +442,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) {
EXPECT_THAT(flash_index_storage.empty(), IsFalse());
std::vector<Hit> hits1 = {
- Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12),
- Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19),
- Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100),
- Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)};
+ Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)};
for (const Hit& hit : hits1) {
ICING_ASSERT_OK(
serializer_->PrependHit(&posting_list_holder1.posting_list, hit));
@@ -441,10 +470,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) {
EXPECT_THAT(flash_index_storage.empty(), IsFalse());
std::vector<Hit> hits2 = {
- Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12),
- Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19),
- Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100),
- Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)};
+ Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)};
for (const Hit& hit : hits2) {
ICING_ASSERT_OK(
serializer_->PrependHit(&posting_list_holder2.posting_list, hit));
@@ -492,10 +525,14 @@ TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) {
EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
IsOkAndHolds(IsEmpty()));
std::vector<Hit> hits3 = {
- Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62),
- Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45),
- Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12),
- Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)};
+ Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)};
for (const Hit& hit : hits3) {
ICING_ASSERT_OK(
serializer_->PrependHit(&posting_list_holder3.posting_list, hit));
@@ -534,10 +571,14 @@ TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) {
EXPECT_THAT(flash_index_storage.empty(), IsFalse());
std::vector<Hit> hits1 = {
- Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12),
- Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19),
- Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100),
- Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)};
+ Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)};
for (const Hit& hit : hits1) {
ICING_ASSERT_OK(
serializer_->PrependHit(&posting_list_holder1.posting_list, hit));
@@ -561,10 +602,14 @@ TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) {
EXPECT_THAT(flash_index_storage.empty(), IsFalse());
std::vector<Hit> hits2 = {
- Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12),
- Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19),
- Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100),
- Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)};
+ Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false)};
for (const Hit& hit : hits2) {
ICING_ASSERT_OK(
serializer_->PrependHit(&posting_list_holder2.posting_list, hit));
diff --git a/icing/file/posting_list/index-block_test.cc b/icing/file/posting_list/index-block_test.cc
index ebc9ba4..d841e79 100644
--- a/icing/file/posting_list/index-block_test.cc
+++ b/icing/file/posting_list/index-block_test.cc
@@ -14,11 +14,16 @@
#include "icing/file/posting_list/index-block.h"
+#include <memory>
+#include <string>
+#include <vector>
+
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/file/filesystem.h"
-#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/index/hit/hit.h"
#include "icing/index/main/posting-list-hit-serializer.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
@@ -67,7 +72,7 @@ class IndexBlockTest : public ::testing::Test {
};
TEST_F(IndexBlockTest, CreateFromUninitializedRegionProducesEmptyBlock) {
- constexpr int kPostingListBytes = 20;
+ constexpr int kPostingListBytes = 24;
{
// Create an IndexBlock from this newly allocated file block.
@@ -80,7 +85,7 @@ TEST_F(IndexBlockTest, CreateFromUninitializedRegionProducesEmptyBlock) {
}
TEST_F(IndexBlockTest, SizeAccessorsWorkCorrectly) {
- constexpr int kPostingListBytes1 = 20;
+ constexpr int kPostingListBytes1 = 24;
// Create an IndexBlock from this newly allocated file block.
ICING_ASSERT_OK_AND_ASSIGN(IndexBlock block,
@@ -88,13 +93,13 @@ TEST_F(IndexBlockTest, SizeAccessorsWorkCorrectly) {
&filesystem_, serializer_.get(), sfd_->get(),
/*offset=*/0, kBlockSize, kPostingListBytes1));
EXPECT_THAT(block.posting_list_bytes(), Eq(kPostingListBytes1));
- // There should be (4096 - 12) / 20 = 204 posting lists
- // (sizeof(BlockHeader)==12). We can store a PostingListIndex of 203 in only 8
+ // There should be (4096 - 12) / 24 = 170 posting lists
+ // (sizeof(BlockHeader)==12). We can store a PostingListIndex of 169 in only 8
// bits.
- EXPECT_THAT(block.max_num_posting_lists(), Eq(204));
+ EXPECT_THAT(block.max_num_posting_lists(), Eq(170));
EXPECT_THAT(block.posting_list_index_bits(), Eq(8));
- constexpr int kPostingListBytes2 = 200;
+ constexpr int kPostingListBytes2 = 240;
// Create an IndexBlock from this newly allocated file block.
ICING_ASSERT_OK_AND_ASSIGN(
@@ -102,22 +107,27 @@ TEST_F(IndexBlockTest, SizeAccessorsWorkCorrectly) {
&filesystem_, serializer_.get(), sfd_->get(), /*offset=*/0,
kBlockSize, kPostingListBytes2));
EXPECT_THAT(block.posting_list_bytes(), Eq(kPostingListBytes2));
- // There should be (4096 - 12) / 200 = 20 posting lists
+ // There should be (4096 - 12) / 240 = 17 posting lists
// (sizeof(BlockHeader)==12). We can store a PostingListIndex of 16 in only 5
// bits.
- EXPECT_THAT(block.max_num_posting_lists(), Eq(20));
+ EXPECT_THAT(block.max_num_posting_lists(), Eq(17));
EXPECT_THAT(block.posting_list_index_bits(), Eq(5));
}
TEST_F(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) {
- constexpr int kPostingListBytes = 2000;
+ constexpr int kPostingListBytes = 2004;
std::vector<Hit> test_hits{
- Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency),
- Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency),
- Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99),
- Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17),
- Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency),
+ Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
};
PostingListIndex allocated_index;
{
@@ -158,21 +168,31 @@ TEST_F(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) {
}
TEST_F(IndexBlockTest, IndexBlockMultiplePostingLists) {
- constexpr int kPostingListBytes = 2000;
+ constexpr int kPostingListBytes = 2004;
std::vector<Hit> hits_in_posting_list1{
- Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency),
- Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency),
- Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99),
- Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17),
- Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency),
+ Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
};
std::vector<Hit> hits_in_posting_list2{
- Hit(/*section_id=*/12, /*document_id=*/220, /*term_frequency=*/88),
- Hit(/*section_id=*/17, /*document_id=*/265, Hit::kDefaultTermFrequency),
- Hit(/*section_id=*/0, /*document_id=*/287, /*term_frequency=*/2),
- Hit(/*section_id=*/11, /*document_id=*/306, /*term_frequency=*/12),
- Hit(/*section_id=*/10, /*document_id=*/306, Hit::kDefaultTermFrequency),
+ Hit(/*section_id=*/12, /*document_id=*/220, /*term_frequency=*/88,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/17, /*document_id=*/265, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/0, /*document_id=*/287, /*term_frequency=*/2,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/11, /*document_id=*/306, /*term_frequency=*/12,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/10, /*document_id=*/306, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
};
PostingListIndex allocated_index_1;
PostingListIndex allocated_index_2;
@@ -242,7 +262,7 @@ TEST_F(IndexBlockTest, IndexBlockMultiplePostingLists) {
}
TEST_F(IndexBlockTest, IndexBlockReallocatingPostingLists) {
- constexpr int kPostingListBytes = 2000;
+ constexpr int kPostingListBytes = 2004;
// Create an IndexBlock from this newly allocated file block.
ICING_ASSERT_OK_AND_ASSIGN(IndexBlock block,
@@ -252,11 +272,16 @@ TEST_F(IndexBlockTest, IndexBlockReallocatingPostingLists) {
// Add hits to the first posting list.
std::vector<Hit> hits_in_posting_list1{
- Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency),
- Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency),
- Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99),
- Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17),
- Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency),
+ Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
};
ICING_ASSERT_OK_AND_ASSIGN(IndexBlock::PostingListAndBlockInfo alloc_info_1,
block.AllocatePostingList());
@@ -270,11 +295,16 @@ TEST_F(IndexBlockTest, IndexBlockReallocatingPostingLists) {
// Add hits to the second posting list.
std::vector<Hit> hits_in_posting_list2{
- Hit(/*section_id=*/12, /*document_id=*/220, /*term_frequency=*/88),
- Hit(/*section_id=*/17, /*document_id=*/265, Hit::kDefaultTermFrequency),
- Hit(/*section_id=*/0, /*document_id=*/287, /*term_frequency=*/2),
- Hit(/*section_id=*/11, /*document_id=*/306, /*term_frequency=*/12),
- Hit(/*section_id=*/10, /*document_id=*/306, Hit::kDefaultTermFrequency),
+ Hit(/*section_id=*/12, /*document_id=*/220, /*term_frequency=*/88,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/17, /*document_id=*/265, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/0, /*document_id=*/287, /*term_frequency=*/2,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/11, /*document_id=*/306, /*term_frequency=*/12,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/10, /*document_id=*/306, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
};
ICING_ASSERT_OK_AND_ASSIGN(IndexBlock::PostingListAndBlockInfo alloc_info_2,
block.AllocatePostingList());
@@ -296,9 +326,12 @@ TEST_F(IndexBlockTest, IndexBlockReallocatingPostingLists) {
EXPECT_THAT(block.HasFreePostingLists(), IsOkAndHolds(IsTrue()));
std::vector<Hit> hits_in_posting_list3{
- Hit(/*section_id=*/12, /*document_id=*/0, /*term_frequency=*/88),
- Hit(/*section_id=*/17, /*document_id=*/1, Hit::kDefaultTermFrequency),
- Hit(/*section_id=*/0, /*document_id=*/2, /*term_frequency=*/2),
+ Hit(/*section_id=*/12, /*document_id=*/0, /*term_frequency=*/88,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/17, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
+ Hit(/*section_id=*/0, /*document_id=*/2, /*term_frequency=*/2,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false),
};
ICING_ASSERT_OK_AND_ASSIGN(IndexBlock::PostingListAndBlockInfo alloc_info_3,
block.AllocatePostingList());
@@ -317,7 +350,7 @@ TEST_F(IndexBlockTest, IndexBlockReallocatingPostingLists) {
}
TEST_F(IndexBlockTest, IndexBlockNextBlockIndex) {
- constexpr int kPostingListBytes = 2000;
+ constexpr int kPostingListBytes = 2004;
constexpr int kSomeBlockIndex = 22;
{
diff --git a/icing/icing-search-engine_search_test.cc b/icing/icing-search-engine_search_test.cc
index 22f42e7..d815f61 100644
--- a/icing/icing-search-engine_search_test.cc
+++ b/icing/icing-search-engine_search_test.cc
@@ -3716,6 +3716,109 @@ TEST_P(IcingSearchEngineSearchTest, SearchWithPropertyFilters) {
EXPECT_THAT(results.results(0).document(), EqualsProto(document_one));
}
+TEST_P(IcingSearchEngineSearchTest, SearchWithPropertyFiltersPolymorphism) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Artist")
+ .AddParentType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("company")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ // Add a person document and an artist document
+ DocumentProto document_person =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "shopgirl@aol.com")
+ .Build();
+ DocumentProto document_artist =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Artist")
+ .AddStringProperty("name", "Meg Artist")
+ .AddStringProperty("emailAddress", "artist@aol.com")
+ .AddStringProperty("company", "company")
+ .Build();
+ ASSERT_THAT(icing.Put(document_person).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document_artist).status(), ProtoIsOk());
+
+ // Set a query with property filters of "name" in Person and "emailAddress"
+ // in Artist. By polymorphism, "name" should also apply to Artist.
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_search_type(GetParam());
+ TypePropertyMask* person_type_property_mask =
+ search_spec->add_type_property_filters();
+ person_type_property_mask->set_schema_type("Person");
+ person_type_property_mask->add_paths("name");
+ TypePropertyMask* artist_type_property_mask =
+ search_spec->add_type_property_filters();
+ artist_type_property_mask->set_schema_type("Artist");
+ artist_type_property_mask->add_paths("emailAddress");
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ *scoring_spec = GetDefaultScoringSpec();
+
+ // Verify that the property filter for "name" in Person is also applied to
+ // Artist.
+ search_spec->set_query("Meg");
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(2));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document_person));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_artist));
+
+ // Verify that the property filter for "emailAddress" in Artist is only
+ // applied to Artist.
+ search_spec->set_query("aol");
+ results = icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_artist));
+
+ // Verify that the "company" property is filtered out, since it is not
+ // specified in the property filter.
+ search_spec->set_query("company");
+ results = icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), IsEmpty());
+}
+
TEST_P(IcingSearchEngineSearchTest, EmptySearchWithPropertyFilter) {
IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
diff --git a/icing/index/hit/hit.cc b/icing/index/hit/hit.cc
index 493e62b..7b7f2f6 100644
--- a/icing/index/hit/hit.cc
+++ b/icing/index/hit/hit.cc
@@ -14,42 +14,19 @@
#include "icing/index/hit/hit.h"
+#include <cstring>
+#include <limits>
+
+#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/util/bit-util.h"
+#include "icing/util/logging.h"
namespace icing {
namespace lib {
namespace {
-enum FlagOffset {
- // This hit, whether exact or not, came from a prefixed section and will
- // need to be backfilled into branching posting lists if/when those are
- // created.
- kInPrefixSection = 0,
- // This hit represents a prefix of a longer term. If exact matches are
- // required, then this hit should be ignored.
- kPrefixHit = 1,
- // Whether or not the hit has a term_frequency other than
- // kDefaultTermFrequency.
- kHasTermFrequency = 2,
- kNumFlags = 3,
-};
-
-static_assert(kDocumentIdBits + kSectionIdBits + kNumFlags <
- sizeof(Hit::Value) * 8,
- "Hit::kInvalidValue contains risky value and we should have at "
- "least one unused bit to avoid potential bugs. Please follow the "
- "process mentioned in hit.h to correct the value of "
- "Hit::kInvalidValue and remove this static_assert afterwards.");
-
-static_assert(kDocumentIdBits + kSectionIdBits + kNumFlags <=
- sizeof(Hit::Value) * 8,
- "HitOverflow");
-static_assert(kDocumentIdBits == 22, "");
-static_assert(kSectionIdBits == 6, "");
-static_assert(kNumFlags == 3, "");
-
inline DocumentId InvertDocumentId(DocumentId document_id) {
static_assert(kMaxDocumentId <= (std::numeric_limits<DocumentId>::max() - 1),
"(kMaxDocumentId + 1) must not overflow.");
@@ -88,10 +65,28 @@ SectionId BasicHit::section_id() const {
/*len=*/kSectionIdBits);
}
+Hit::Hit(Value value, Flags flags, TermFrequency term_frequency)
+ : flags_(flags), term_frequency_(term_frequency) {
+ memcpy(value_.data(), &value, sizeof(value));
+ if (!CheckFlagsAreConsistent()) {
+ ICING_VLOG(1)
+ << "Creating Hit that has inconsistent flag values across its fields: "
+ << "Hit(value=" << value << ", flags=" << flags
+ << "term_frequency=" << term_frequency << ")";
+ }
+}
+
Hit::Hit(SectionId section_id, DocumentId document_id,
Hit::TermFrequency term_frequency, bool is_in_prefix_section,
bool is_prefix_hit)
: term_frequency_(term_frequency) {
+ // We compute flags first as the value's has_flags bit depends on the flags_
+ // field.
+ Flags temp_flags = 0;
+ bit_util::BitfieldSet(term_frequency != kDefaultTermFrequency,
+ kHasTermFrequency, /*len=*/1, &temp_flags);
+ flags_ = temp_flags;
+
// Values are stored so that when sorted, they appear in document_id
// descending, section_id ascending, order. Also, all else being
// equal, non-prefix hits sort before prefix hits. So inverted
@@ -99,30 +94,29 @@ Hit::Hit(SectionId section_id, DocumentId document_id,
// (uninverted) section_id.
Value temp_value = 0;
bit_util::BitfieldSet(InvertDocumentId(document_id),
- kSectionIdBits + kNumFlags, kDocumentIdBits,
+ kSectionIdBits + kNumFlagsInValueField, kDocumentIdBits,
+ &temp_value);
+ bit_util::BitfieldSet(section_id, kNumFlagsInValueField, kSectionIdBits,
&temp_value);
- bit_util::BitfieldSet(section_id, kNumFlags, kSectionIdBits, &temp_value);
- bit_util::BitfieldSet(term_frequency != kDefaultTermFrequency,
- kHasTermFrequency, /*len=*/1, &temp_value);
bit_util::BitfieldSet(is_prefix_hit, kPrefixHit, /*len=*/1, &temp_value);
bit_util::BitfieldSet(is_in_prefix_section, kInPrefixSection,
/*len=*/1, &temp_value);
- value_ = temp_value;
+
+ bool has_flags = flags_ != kNoEnabledFlags;
+ bit_util::BitfieldSet(has_flags, kHasFlags, /*len=*/1, &temp_value);
+
+ memcpy(value_.data(), &temp_value, sizeof(temp_value));
}
DocumentId Hit::document_id() const {
DocumentId inverted_document_id = bit_util::BitfieldGet(
- value(), kSectionIdBits + kNumFlags, kDocumentIdBits);
+ value(), kSectionIdBits + kNumFlagsInValueField, kDocumentIdBits);
// Undo the document_id inversion.
return InvertDocumentId(inverted_document_id);
}
SectionId Hit::section_id() const {
- return bit_util::BitfieldGet(value(), kNumFlags, kSectionIdBits);
-}
-
-bool Hit::has_term_frequency() const {
- return bit_util::BitfieldGet(value(), kHasTermFrequency, 1);
+ return bit_util::BitfieldGet(value(), kNumFlagsInValueField, kSectionIdBits);
}
bool Hit::is_prefix_hit() const {
@@ -133,6 +127,27 @@ bool Hit::is_in_prefix_section() const {
return bit_util::BitfieldGet(value(), kInPrefixSection, 1);
}
+bool Hit::has_flags() const {
+ return bit_util::BitfieldGet(value(), kHasFlags, 1);
+}
+
+bool Hit::has_term_frequency() const {
+ return bit_util::BitfieldGet(flags(), kHasTermFrequency, 1);
+}
+
+bool Hit::CheckFlagsAreConsistent() const {
+ bool has_flags = flags_ != kNoEnabledFlags;
+ bool has_flags_enabled_in_value =
+ bit_util::BitfieldGet(value(), kHasFlags, /*len=*/1);
+
+ bool has_term_frequency = term_frequency_ != kDefaultTermFrequency;
+ bool has_term_frequency_enabled_in_flags =
+ bit_util::BitfieldGet(flags(), kHasTermFrequency, /*len=*/1);
+
+ return has_flags == has_flags_enabled_in_value &&
+ has_term_frequency == has_term_frequency_enabled_in_flags;
+}
+
Hit Hit::TranslateHit(Hit old_hit, DocumentId new_document_id) {
return Hit(old_hit.section_id(), new_document_id, old_hit.term_frequency(),
old_hit.is_in_prefix_section(), old_hit.is_prefix_hit());
@@ -140,7 +155,8 @@ Hit Hit::TranslateHit(Hit old_hit, DocumentId new_document_id) {
bool Hit::EqualsDocumentIdAndSectionId::operator()(const Hit& hit1,
const Hit& hit2) const {
- return (hit1.value() >> kNumFlags) == (hit2.value() >> kNumFlags);
+ return (hit1.value() >> kNumFlagsInValueField) ==
+ (hit2.value() >> kNumFlagsInValueField);
}
} // namespace lib
diff --git a/icing/index/hit/hit.h b/icing/index/hit/hit.h
index 111b320..83fff12 100644
--- a/icing/index/hit/hit.h
+++ b/icing/index/hit/hit.h
@@ -17,6 +17,7 @@
#include <array>
#include <cstdint>
+#include <cstring>
#include <limits>
#include "icing/legacy/core/icing-packed-pod.h"
@@ -70,6 +71,9 @@ class BasicHit {
private:
// Value bits layout: 4 unused + 22 document_id + 6 section id.
+ //
+ // The Value is guaranteed to be an unsigned integer, but its size and the
+ // information it stores may change if the Hit's encoding format is changed.
Value value_;
} __attribute__((packed));
static_assert(sizeof(BasicHit) == 4, "");
@@ -78,51 +82,83 @@ static_assert(sizeof(BasicHit) == 4, "");
// consists of:
// - a DocumentId
// - a SectionId
-// referring to the document and section that the hit corresponds to, as well as
-// metadata about the hit:
-// - whether the Hit has a TermFrequency other than the default value
-// - whether the Hit does not appear exactly in the document, but instead
-// represents a term that is a prefix of a term in the document
-// - whether the Hit came from a section that has prefix expansion enabled
-// and a term frequency for the hit.
+// - referring to the document and section that the hit corresponds to
+// - Metadata about the hit:
+// - whether the Hit does not appear exactly in the document, but instead
+// represents a term that is a prefix of a term in the document
+// (is_prefix_hit)
+// - whether the Hit came from a section that has prefix expansion enabled
+// (is_in_prefix_section)
+// - whether the Hit has set any bitmask flags (has_flags)
+// - bitmasks in flags fields:
+// - whether the Hit has a TermFrequency other than the default value
+// (has_term_frequency)
+// - a term frequency for the hit
//
// The hit is the most basic unit of the index and, when grouped together by
// term, can be used to encode what terms appear in what documents.
class Hit {
public:
- // The datatype used to encode Hit information: the document_id, section_id
- // and the has_term_frequency, prefix hit and in prefix section flags.
+ // The datatype used to encode Hit information: the document_id, section_id,
+ // and 3 flags: is_prefix_hit, is_in_prefix_section and has_flags.
+ //
+ // The Value is guaranteed to be an unsigned integer, but its size and the
+ // information it stores may change if the Hit's encoding format is changed.
using Value = uint32_t;
// WARNING: Changing this value will invalidate any pre-existing posting lists
// on user devices.
//
- // WARNING:
- // - Hit::kInvalidValue should contain inverted kInvalidDocumentId, which is
- // b'00...0. However, currently we set it as UINT32_MAX and actually it
- // contains b'11...1, which is the inverted document_id 0.
- // - It means Hit::kInvalidValue contains valid (document_id, section_id,
- // flags), so we potentially cannot distinguish if a Hit is invalid or not.
- // The invalidity is an essential feature for posting list since we use it
- // to determine the state of the posting list.
- // - The reason why it won't break the current posting list is because the
- // unused bit(s) are set as 1 for Hit::kInvalidValue and 0 for all valid
- // Hits. In other words, the unused bit(s) are actually serving as "invalid
- // flag".
- // - If we want to exhaust all unused bits in the future, then we have to
- // change Hit::kInvalidValue to set the inverted document_id section
- // correctly (b'00...0, refer to BasicHit::kInvalidValue as an example).
- // - Also this problem is guarded by static_assert in hit.cc. If exhausting
- // all unused bits, then the static_assert will detect and fail. We can
- // safely remove the static_assert check after following the above process
- // to resolve the incorrect Hit::kInvalidValue issue.
- static constexpr Value kInvalidValue = std::numeric_limits<Value>::max();
+ // kInvalidValue contains:
+ // - 0 for unused bits. Note that unused bits are always 0 for both valid and
+ // invalid Hit values.
+ // - Inverted kInvalidDocumentId
+ // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has
+ // already invalidated the value. In fact, we currently use all 2^6 section
+ // ids and there is no "invalid section id", so it doesn't matter what
+ // SectionId we set for kInvalidValue.
+ static constexpr Value kInvalidValue = 0;
// Docs are sorted in reverse, and 0 is never used as the inverted
// DocumentId (because it is the inverse of kInvalidValue), so it is always
// the max in a descending sort.
static constexpr Value kMaxDocumentIdSortValue = 0;
+ enum FlagOffsetsInFlagsField {
+ // Whether or not the hit has a term_frequency other than
+ // kDefaultTermFrequency.
+ kHasTermFrequency = 0,
+ kNumFlagsInFlagsField = 1,
+ };
+
+ enum FlagOffsetsInValueField {
+ // Whether or not the hit has a flags value other than kNoEnabledFlags (i.e.
+ // it has flags enabled in the flags field)
+ kHasFlags = 0,
+ // This hit, whether exact or not, came from a prefixed section and will
+ // need to be backfilled into branching posting lists if/when those are
+ // created.
+ kInPrefixSection = 1,
+ // This hit represents a prefix of a longer term. If exact matches are
+ // required, then this hit should be ignored.
+ kPrefixHit = 2,
+ kNumFlagsInValueField = 3,
+ };
+ static_assert(kDocumentIdBits + kSectionIdBits + kNumFlagsInValueField <=
+ sizeof(Hit::Value) * 8,
+ "HitOverflow");
+ static_assert(kDocumentIdBits == 22, "");
+ static_assert(kSectionIdBits == 6, "");
+ static_assert(kNumFlagsInValueField == 3, "");
+
+ // The datatype used to encode additional bit-flags in the Hit.
+ // This is guaranteed to be an unsigned integer, but its size may change if
+ // more flags are introduced in the future and require more bits to encode.
+ using Flags = uint8_t;
+ static constexpr Flags kNoEnabledFlags = 0;
+
// The Term Frequency of a Hit.
+ // This is guaranteed to be an unsigned integer, but its size may change if we
+ // need to expand the max term-frequency.
using TermFrequency = uint8_t;
using TermFrequencyArray = std::array<Hit::TermFrequency, kTotalNumSections>;
// Max TermFrequency is 255.
@@ -131,40 +167,67 @@ class Hit {
static constexpr TermFrequency kDefaultTermFrequency = 1;
static constexpr TermFrequency kNoTermFrequency = 0;
- explicit Hit(Value value = kInvalidValue,
- TermFrequency term_frequency = kDefaultTermFrequency)
- : value_(value), term_frequency_(term_frequency) {}
- Hit(SectionId section_id, DocumentId document_id,
- TermFrequency term_frequency, bool is_in_prefix_section = false,
- bool is_prefix_hit = false);
+ explicit Hit(Value value)
+ : Hit(value, kNoEnabledFlags, kDefaultTermFrequency) {}
+ explicit Hit(Value value, Flags flags, TermFrequency term_frequency);
+ explicit Hit(SectionId section_id, DocumentId document_id,
+ TermFrequency term_frequency, bool is_in_prefix_section,
+ bool is_prefix_hit);
bool is_valid() const { return value() != kInvalidValue; }
- Value value() const { return value_; }
+
+ Value value() const {
+ Value value;
+ memcpy(&value, value_.data(), sizeof(value));
+ return value;
+ }
+
DocumentId document_id() const;
SectionId section_id() const;
+ bool is_prefix_hit() const;
+ bool is_in_prefix_section() const;
+ // Whether or not the hit has any flags set to true.
+ bool has_flags() const;
+
+ Flags flags() const { return flags_; }
// Whether or not the hit contains a valid term frequency.
bool has_term_frequency() const;
+
TermFrequency term_frequency() const { return term_frequency_; }
- bool is_prefix_hit() const;
- bool is_in_prefix_section() const;
+
+ // Returns true if the flags values across the Hit's value_, term_frequency_
+ // and flags_ fields are consistent.
+ bool CheckFlagsAreConsistent() const;
// Creates a new hit based on old_hit but with new_document_id set.
static Hit TranslateHit(Hit old_hit, DocumentId new_document_id);
- bool operator<(const Hit& h2) const { return value() < h2.value(); }
- bool operator==(const Hit& h2) const { return value() == h2.value(); }
+ bool operator<(const Hit& h2) const {
+ if (value() != h2.value()) {
+ return value() < h2.value();
+ }
+ return flags() < h2.flags();
+ }
+ bool operator==(const Hit& h2) const {
+ return value() == h2.value() && flags() == h2.flags();
+ }
struct EqualsDocumentIdAndSectionId {
bool operator()(const Hit& hit1, const Hit& hit2) const;
};
private:
- // Value and TermFrequency must be in this order.
- // Value bits layout: 1 unused + 22 document_id + 6 section id + 3 flags.
- Value value_;
+ // Value, Flags and TermFrequency must be in this order.
+ // Value bits layout: 1 unused + 22 document_id + 6 section_id + 1
+ // is_prefix_hit + 1 is_in_prefix_section + 1 has_flags.
+ std::array<char, sizeof(Value)> value_;
+ // Flags bits layout: 1 reserved + 6 unused + 1 has_term_frequency.
+ // The left-most bit is reserved for chaining additional fields in case of
+ // future hit expansions.
+ Flags flags_;
TermFrequency term_frequency_;
-} __attribute__((packed));
-static_assert(sizeof(Hit) == 5, "");
+};
+static_assert(sizeof(Hit) == 6, "");
// TODO(b/138991332) decide how to remove/replace all is_packed_pod assertions.
static_assert(icing_is_packed_pod<Hit>::value, "go/icing-ubsan");
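For orientation, this is the decode side of the new value layout, mirroring the bit_util::BitfieldGet calls in hit.cc above (offsets come from the FlagOffsetsInValueField enum; an illustrative sketch, not additional API):

    // Bits, LSB first: has_flags(1) | is_in_prefix_section(1) |
    // is_prefix_hit(1) | section_id(6) | inverted document_id(22) | unused(1).
    Hit::Value v = hit.value();
    bool has_flags = bit_util::BitfieldGet(v, Hit::kHasFlags, /*len=*/1);
    SectionId section_id = bit_util::BitfieldGet(
        v, Hit::kNumFlagsInValueField, /*len=*/kSectionIdBits);
    DocumentId inverted_id = bit_util::BitfieldGet(
        v, Hit::kNumFlagsInValueField + kSectionIdBits,
        /*len=*/kDocumentIdBits);
    // hit.cc then un-inverts inverted_id via InvertDocumentId() to recover
    // the original document_id.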
diff --git a/icing/index/hit/hit_test.cc b/icing/index/hit/hit_test.cc
index 0086d91..1233e00 100644
--- a/icing/index/hit/hit_test.cc
+++ b/icing/index/hit/hit_test.cc
@@ -14,6 +14,9 @@
#include "icing/index/hit/hit.h"
+#include <algorithm>
+#include <vector>
+
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/schema/section.h"
@@ -94,77 +97,127 @@ TEST(BasicHitTest, Comparison) {
}
TEST(HitTest, HasTermFrequencyFlag) {
- Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency);
+ Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
EXPECT_THAT(h1.has_term_frequency(), IsFalse());
EXPECT_THAT(h1.term_frequency(), Eq(Hit::kDefaultTermFrequency));
- Hit h2(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency);
+ Hit h2(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
EXPECT_THAT(h2.has_term_frequency(), IsTrue());
EXPECT_THAT(h2.term_frequency(), Eq(kSomeTermFrequency));
}
TEST(HitTest, IsPrefixHitFlag) {
- Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency);
+ Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
EXPECT_THAT(h1.is_prefix_hit(), IsFalse());
Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
EXPECT_THAT(h2.is_prefix_hit(), IsFalse());
Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/true);
EXPECT_THAT(h3.is_prefix_hit(), IsTrue());
+
+ Hit h4(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency,
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
+ EXPECT_THAT(h4.is_prefix_hit(), IsFalse());
}
TEST(HitTest, IsInPrefixSectionFlag) {
- Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency);
+ Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
EXPECT_THAT(h1.is_in_prefix_section(), IsFalse());
Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
EXPECT_THAT(h2.is_in_prefix_section(), IsFalse());
Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
EXPECT_THAT(h3.is_in_prefix_section(), IsTrue());
}
+TEST(HitTest, HasFlags) {
+ Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ EXPECT_THAT(h1.has_flags(), IsFalse());
+
+ Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/true);
+ EXPECT_THAT(h2.has_flags(), IsFalse());
+
+ Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
+ EXPECT_THAT(h3.has_flags(), IsFalse());
+
+ Hit h4(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ EXPECT_THAT(h4.has_flags(), IsTrue());
+
+ Hit h5(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency,
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
+ EXPECT_THAT(h5.has_flags(), IsTrue());
+
+ Hit h6(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/true);
+ EXPECT_THAT(h6.has_flags(), IsTrue());
+
+ Hit h7(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency,
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
+ EXPECT_THAT(h7.has_flags(), IsTrue());
+}
+
TEST(HitTest, Accessors) {
- Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency);
+ Hit h1(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/true);
EXPECT_THAT(h1.document_id(), Eq(kSomeDocumentId));
EXPECT_THAT(h1.section_id(), Eq(kSomeSectionid));
+ EXPECT_THAT(h1.term_frequency(), Eq(kSomeTermFrequency));
+ EXPECT_THAT(h1.is_in_prefix_section(), IsFalse());
+ EXPECT_THAT(h1.is_prefix_hit(), IsTrue());
}
TEST(HitTest, Valid) {
- Hit def;
+ Hit def(Hit::kInvalidValue);
EXPECT_THAT(def.is_valid(), IsFalse());
-
- Hit explicit_invalid(Hit::kInvalidValue);
+ Hit explicit_invalid(Hit::kInvalidValue, Hit::kNoEnabledFlags,
+ Hit::kDefaultTermFrequency);
EXPECT_THAT(explicit_invalid.is_valid(), IsFalse());
static constexpr Hit::Value kSomeValue = 65372;
- Hit explicit_valid(kSomeValue);
+ Hit explicit_valid(kSomeValue, Hit::kNoEnabledFlags,
+ Hit::kDefaultTermFrequency);
EXPECT_THAT(explicit_valid.is_valid(), IsTrue());
- Hit maximum_document_id_hit(kSomeSectionid, kMaxDocumentId,
- kSomeTermFrequency);
+ Hit maximum_document_id_hit(
+ kSomeSectionid, kMaxDocumentId, kSomeTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
EXPECT_THAT(maximum_document_id_hit.is_valid(), IsTrue());
- Hit maximum_section_id_hit(kMaxSectionId, kSomeDocumentId,
- kSomeTermFrequency);
+ Hit maximum_section_id_hit(kMaxSectionId, kSomeDocumentId, kSomeTermFrequency,
+ /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
EXPECT_THAT(maximum_section_id_hit.is_valid(), IsTrue());
- Hit minimum_document_id_hit(kSomeSectionid, 0, kSomeTermFrequency);
+ Hit minimum_document_id_hit(kSomeSectionid, 0, kSomeTermFrequency,
+ /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
EXPECT_THAT(minimum_document_id_hit.is_valid(), IsTrue());
- Hit minimum_section_id_hit(0, kSomeDocumentId, kSomeTermFrequency);
+ Hit minimum_section_id_hit(0, kSomeDocumentId, kSomeTermFrequency,
+ /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
EXPECT_THAT(minimum_section_id_hit.is_valid(), IsTrue());
- // We use Hit with value Hit::kMaxDocumentIdSortValue for std::lower_bound in
- // the lite index. Verify that the value of the smallest valid Hit (which
+ // We use Hit with value Hit::kMaxDocumentIdSortValue for std::lower_bound
+ // in the lite index. Verify that the value of the smallest valid Hit (which
// contains kMinSectionId, kMaxDocumentId and 3 flags = false) is >=
// Hit::kMaxDocumentIdSortValue.
- Hit smallest_hit(kMinSectionId, kMaxDocumentId, Hit::kDefaultTermFrequency);
+ Hit smallest_hit(kMinSectionId, kMaxDocumentId, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ASSERT_THAT(smallest_hit.is_valid(), IsTrue());
ASSERT_THAT(smallest_hit.has_term_frequency(), IsFalse());
ASSERT_THAT(smallest_hit.is_prefix_hit(), IsFalse());
@@ -173,39 +226,78 @@ TEST(HitTest, Valid) {
}
TEST(HitTest, Comparison) {
- Hit hit(1, 243, Hit::kDefaultTermFrequency);
+ Hit hit(/*section_id=*/1, /*document_id=*/243, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
// DocumentIds are sorted in ascending order. So a hit with a lower
// document_id should be considered greater than one with a higher
// document_id.
- Hit higher_document_id_hit(1, 2409, Hit::kDefaultTermFrequency);
- Hit higher_section_id_hit(15, 243, Hit::kDefaultTermFrequency);
+ Hit higher_document_id_hit(
+ /*section_id=*/1, /*document_id=*/2409, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ Hit higher_section_id_hit(/*section_id=*/15, /*document_id=*/243,
+ Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
// Whether or not a term frequency was set is considered, but the term
// frequency itself is not.
- Hit term_frequency_hit(1, 243, 12);
- Hit prefix_hit(1, 243, Hit::kDefaultTermFrequency,
+ Hit term_frequency_hit(/*section_id=*/1, 243, /*term_frequency=*/12,
+ /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
+ Hit prefix_hit(/*section_id=*/1, 243, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false,
/*is_prefix_hit=*/true);
- Hit hit_in_prefix_section(1, 243, Hit::kDefaultTermFrequency,
+ Hit hit_in_prefix_section(/*section_id=*/1, 243, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/true,
/*is_prefix_hit=*/false);
+ Hit hit_with_all_flags_enabled(/*section_id=*/1, 243, 56,
+ /*is_in_prefix_section=*/true,
+ /*is_prefix_hit=*/true);
std::vector<Hit> hits{hit,
higher_document_id_hit,
higher_section_id_hit,
term_frequency_hit,
prefix_hit,
- hit_in_prefix_section};
+ hit_in_prefix_section,
+ hit_with_all_flags_enabled};
std::sort(hits.begin(), hits.end());
- EXPECT_THAT(
- hits, ElementsAre(higher_document_id_hit, hit, hit_in_prefix_section,
- prefix_hit, term_frequency_hit, higher_section_id_hit));
+ EXPECT_THAT(hits,
+ ElementsAre(higher_document_id_hit, hit, term_frequency_hit,
+ hit_in_prefix_section, prefix_hit,
+ hit_with_all_flags_enabled, higher_section_id_hit));
- Hit higher_term_frequency_hit(1, 243, 108);
+ Hit higher_term_frequency_hit(/*section_id=*/1, 243, /*term_frequency=*/108,
+ /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
// The term frequency value is not considered when comparing hits.
EXPECT_THAT(term_frequency_hit, Not(Lt(higher_term_frequency_hit)));
EXPECT_THAT(higher_term_frequency_hit, Not(Lt(term_frequency_hit)));
}
+TEST(HitTest, CheckFlagsAreConsistent) {
+ Hit::Value value_without_flags = 1 << 30;
+ Hit::Value value_with_flags = value_without_flags + 1;
+ Hit::Flags flags_with_term_freq = 1;
+
+ Hit consistent_hit_no_flags(value_without_flags, Hit::kNoEnabledFlags,
+ Hit::kDefaultTermFrequency);
+ Hit consistent_hit_with_term_frequency(value_with_flags, flags_with_term_freq,
+ kSomeTermFrequency);
+ EXPECT_THAT(consistent_hit_no_flags.CheckFlagsAreConsistent(), IsTrue());
+ EXPECT_THAT(consistent_hit_with_term_frequency.CheckFlagsAreConsistent(),
+ IsTrue());
+
+ Hit inconsistent_hit_1(value_with_flags, Hit::kNoEnabledFlags,
+ Hit::kDefaultTermFrequency);
+ Hit inconsistent_hit_2(value_with_flags, Hit::kNoEnabledFlags,
+ kSomeTermFrequency);
+ Hit inconsistent_hit_3(value_with_flags, flags_with_term_freq,
+ Hit::kDefaultTermFrequency);
+ EXPECT_THAT(inconsistent_hit_1.CheckFlagsAreConsistent(), IsFalse());
+ EXPECT_THAT(inconsistent_hit_2.CheckFlagsAreConsistent(), IsFalse());
+ EXPECT_THAT(inconsistent_hit_3.CheckFlagsAreConsistent(), IsFalse());
+}
+
} // namespace
} // namespace lib
diff --git a/icing/index/index.cc b/icing/index/index.cc
index 31dcc7e..f917aa6 100644
--- a/icing/index/index.cc
+++ b/icing/index/index.cc
@@ -65,10 +65,9 @@ libtextclassifier3::StatusOr<LiteIndex::Options> CreateLiteIndexOptions(
"Requested hit buffer size %d is too large.",
options.index_merge_size));
}
- return LiteIndex::Options(options.base_dir + "/idx/lite.",
- options.index_merge_size,
- options.lite_index_sort_at_indexing,
- options.lite_index_sort_size);
+ return LiteIndex::Options(
+ options.base_dir + "/idx/lite.", options.index_merge_size,
+ options.lite_index_sort_at_indexing, options.lite_index_sort_size);
}
std::string MakeMainIndexFilepath(const std::string& base_dir) {
@@ -345,7 +344,8 @@ libtextclassifier3::Status Index::Editor::BufferTerm(const char* term) {
libtextclassifier3::Status Index::Editor::IndexAllBufferedTerms() {
for (auto itr = seen_tokens_.begin(); itr != seen_tokens_.end(); itr++) {
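+    // Hits created at indexing time are never prefix hits themselves; prefix
+    // hits are only produced by prefix expansion when merging into the main
+    // index.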
Hit hit(section_id_, document_id_, /*term_frequency=*/itr->second,
- term_match_type_ == TermMatchType::PREFIX);
+ /*is_in_prefix_section=*/term_match_type_ == TermMatchType::PREFIX,
+ /*is_prefix_hit=*/false);
ICING_ASSIGN_OR_RETURN(
uint32_t term_id, term_id_codec_->EncodeTvi(itr->first, TviType::LITE));
ICING_RETURN_IF_ERROR(lite_index_->AddHit(term_id, hit));
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index ab550e1..50b65ad 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -2736,7 +2736,8 @@ TEST_F(IndexTest, IndexStorageInfoProto) {
EXPECT_THAT(storage_info.main_index_lexicon_size(), Ge(0));
EXPECT_THAT(storage_info.main_index_storage_size(), Ge(0));
EXPECT_THAT(storage_info.main_index_block_size(), Ge(0));
- // There should be 1 block for the header and 1 block for two posting lists.
+ // There should be 1 block for the header and 1 block for three posting lists
+ // ("fo", "foo", "foul").
EXPECT_THAT(storage_info.num_blocks(), Eq(2));
EXPECT_THAT(storage_info.min_free_fraction(), Ge(0));
}
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc
index 35dc0b9..682b752 100644
--- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc
@@ -113,12 +113,12 @@ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
const DocumentStore* document_store, const SchemaStore* schema_store,
const SearchSpecProto& search_spec, int64_t current_time_ms) {
std::unordered_map<std::string, std::set<std::string>> type_property_filters;
- // TODO(b/294274922): Add support for polymorphism in type property filters.
- for (const TypePropertyMask& type_property_mask :
- search_spec.type_property_filters()) {
- type_property_filters[type_property_mask.schema_type()] =
- std::set<std::string>(type_property_mask.paths().begin(),
- type_property_mask.paths().end());
+ for (const SchemaStore::ExpandedTypePropertyMask& type_property_mask :
+ schema_store->ExpandTypePropertyMasks(
+ search_spec.type_property_filters())) {
+ type_property_filters[type_property_mask.schema_type] =
+ std::set<std::string>(type_property_mask.paths.begin(),
+ type_property_mask.paths.end());
}
auto data = std::make_unique<SectionRestrictData>(
document_store, schema_store, current_time_ms, type_property_filters);
diff --git a/icing/index/lite/lite-index-header.h b/icing/index/lite/lite-index-header.h
index 6208cb6..741d173 100644
--- a/icing/index/lite/lite-index-header.h
+++ b/icing/index/lite/lite-index-header.h
@@ -53,7 +53,7 @@ class LiteIndex_Header {
class LiteIndex_HeaderImpl : public LiteIndex_Header {
public:
struct HeaderData {
- static const uint32_t kMagic = 0x01c61418;
+ static const uint32_t kMagic = 0xC2EAD682;
uint32_t lite_index_crc;
uint32_t magic;
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
index 4c5cb37..3aed7e4 100644
--- a/icing/index/lite/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -74,7 +74,7 @@ size_t header_size() { return sizeof(LiteIndex_HeaderImpl::HeaderData); }
} // namespace
const TermIdHitPair::Value TermIdHitPair::kInvalidValue =
- TermIdHitPair(0, Hit()).value();
+ TermIdHitPair(0, Hit(Hit::kInvalidValue)).value();
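+// (Hit no longer default-constructs to an invalid hit, so the invalid value
+// is spelled out explicitly.)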
libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create(
const LiteIndex::Options& options, const IcingFilesystem* filesystem) {
@@ -517,7 +517,8 @@ int LiteIndex::FetchHits(
// Do binary search over the sorted section and repeat the above steps.
TermIdHitPair target_term_id_hit_pair(
- term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency));
+ term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kNoEnabledFlags,
+ Hit::kDefaultTermFrequency));
for (const TermIdHitPair* ptr = std::lower_bound(
array, array + header_->searchable_end(), target_term_id_hit_pair);
ptr < array + header_->searchable_end(); ++ptr) {
diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc
index 7ff006e..f8ea94a 100644
--- a/icing/index/lite/lite-index_test.cc
+++ b/icing/index/lite/lite-index_test.cc
@@ -27,7 +27,6 @@
#include "icing/index/hit/hit.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
-#include "icing/index/lite/lite-index-header.h"
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/term-id-codec.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
@@ -73,15 +72,25 @@ class LiteIndexTest : public testing::Test {
constexpr NamespaceId kNamespace0 = 0;
+TEST_F(LiteIndexTest, TermIdHitPairInvalidValue) {
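+ // TermIdHitPair::kInvalidValue packs term id 0 together with an invalid
+ // hit (see its definition in lite-index.cc), so both components should
+ // round-trip unchanged.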
+ TermIdHitPair invalid_term_hit_pair(TermIdHitPair::kInvalidValue);
+
+ EXPECT_THAT(invalid_term_hit_pair.term_id(), Eq(0));
+ EXPECT_THAT(invalid_term_hit_pair.hit().value(), Eq(Hit::kInvalidValue));
+ EXPECT_THAT(invalid_term_hit_pair.hit().term_frequency(),
+ Eq(Hit::kDefaultTermFrequency));
+}
+
TEST_F(LiteIndexTest,
LiteIndexFetchHits_sortAtQuerying_unsortedHitsBelowSortThreshold) {
// Set up LiteIndex and TermIdCodec
std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
- // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs.
- LiteIndex::Options options(lite_index_file_name,
- /*hit_buffer_want_merge_bytes=*/1024 * 1024,
- /*hit_buffer_sort_at_indexing=*/false,
- /*hit_buffer_sort_threshold_bytes=*/64);
+ // The unsorted tail can contain a max of 8 TermIdHitPairs.
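+ // (sizeof(TermIdHitPair) is 9 bytes now that hits carry a flags byte, so
+ // the threshold is expressed in pairs rather than as a hard-coded 64.)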
+ LiteIndex::Options options(
+ lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/false,
+ /*hit_buffer_sort_threshold_bytes=*/sizeof(TermIdHitPair) * 8);
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index,
LiteIndex::Create(options, &icing_filesystem_));
ICING_ASSERT_OK_AND_ASSIGN(
@@ -97,9 +106,9 @@ TEST_F(LiteIndexTest,
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
Hit foo_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit foo_hit1(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit0));
ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit1));
@@ -109,9 +118,9 @@ TEST_F(LiteIndexTest,
ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id,
term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE));
Hit bar_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit bar_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit0));
ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit1));
@@ -154,15 +163,16 @@ TEST_F(LiteIndexTest,
LiteIndexFetchHits_sortAtIndexing_unsortedHitsBelowSortThreshold) {
// Set up LiteIndex and TermIdCodec
std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
- // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs.
+ // The unsorted tail can contain a max of 8 TermIdHitPairs.
// However, note that in these tests we're unable to sort hits after
// indexing, as sorting is performed by the string-section-indexing-handler
// after indexing all hits in an entire document, rather than after each
// AddHits() operation.
- LiteIndex::Options options(lite_index_file_name,
- /*hit_buffer_want_merge_bytes=*/1024 * 1024,
- /*hit_buffer_sort_at_indexing=*/true,
- /*hit_buffer_sort_threshold_bytes=*/64);
+ LiteIndex::Options options(
+ lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/true,
+ /*hit_buffer_sort_threshold_bytes=*/sizeof(TermIdHitPair) * 8);
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index,
LiteIndex::Create(options, &icing_filesystem_));
ICING_ASSERT_OK_AND_ASSIGN(
@@ -178,9 +188,9 @@ TEST_F(LiteIndexTest,
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
Hit foo_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit foo_hit1(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit0));
ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit1));
@@ -190,9 +200,9 @@ TEST_F(LiteIndexTest,
ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id,
term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE));
Hit bar_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit bar_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit0));
ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit1));
@@ -237,15 +247,16 @@ TEST_F(
LiteIndexFetchHits_sortAtQuerying_unsortedHitsExceedingSortAtIndexThreshold) {
// Set up LiteIndex and TermIdCodec
std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
- // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs.
+ // The unsorted tail can contain a max of 8 TermIdHitPairs.
// However, note that in these tests we're unable to sort hits after
// indexing, as sorting is performed by the string-section-indexing-handler
// after indexing all hits in an entire document, rather than after each
// AddHits() operation.
- LiteIndex::Options options(lite_index_file_name,
- /*hit_buffer_want_merge_bytes=*/1024 * 1024,
- /*hit_buffer_sort_at_indexing=*/false,
- /*hit_buffer_sort_threshold_bytes=*/64);
+ LiteIndex::Options options(
+ lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/false,
+ /*hit_buffer_sort_threshold_bytes=*/sizeof(TermIdHitPair) * 8);
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index,
LiteIndex::Create(options, &icing_filesystem_));
ICING_ASSERT_OK_AND_ASSIGN(
@@ -257,36 +268,36 @@ TEST_F(
// Create 4 hits for docs 0-2, and 2 hits for doc 3 -- 14 in total
// Doc 0
Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc0_hit1(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc0_hit2(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc0_hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
// Doc 1
Hit doc1_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc1_hit1(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc1_hit2(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc1_hit3(/*section_id=*/2, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
// Doc 2
Hit doc2_hit0(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc2_hit1(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc2_hit2(/*section_id=*/1, /*document_id=*/2, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc2_hit3(/*section_id=*/2, /*document_id=*/2, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
// Doc 3
Hit doc3_hit0(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc3_hit1(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
// Create terms
// Foo
@@ -398,11 +409,12 @@ TEST_F(
LiteIndexFetchHits_sortAtIndexing_unsortedHitsExceedingSortAtIndexThreshold) {
// Set up LiteIndex and TermIdCodec
std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
- // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs.
- LiteIndex::Options options(lite_index_file_name,
- /*hit_buffer_want_merge_bytes=*/1024 * 1024,
- /*hit_buffer_sort_at_indexing=*/true,
- /*hit_buffer_sort_threshold_bytes=*/64);
+ // The unsorted tail can contain a max of 8 TermIdHitPairs.
+ LiteIndex::Options options(
+ lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/true,
+ /*hit_buffer_sort_threshold_bytes=*/sizeof(TermIdHitPair) * 8);
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index,
LiteIndex::Create(options, &icing_filesystem_));
ICING_ASSERT_OK_AND_ASSIGN(
@@ -414,49 +426,49 @@ TEST_F(
// Create 4 hits for each of docs 0-4 -- 20 in total
// Doc 0
Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc0_hit1(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc0_hit2(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc0_hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
// Doc 1
Hit doc1_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc1_hit1(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc1_hit2(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc1_hit3(/*section_id=*/2, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
// Doc 2
Hit doc2_hit0(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc2_hit1(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc2_hit2(/*section_id=*/1, /*document_id=*/2, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc2_hit3(/*section_id=*/2, /*document_id=*/2, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
// Doc 3
Hit doc3_hit0(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc3_hit1(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc3_hit2(/*section_id=*/1, /*document_id=*/3, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc3_hit3(/*section_id=*/2, /*document_id=*/3, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
// Doc 4
Hit doc4_hit0(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc4_hit1(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc4_hit2(/*section_id=*/1, /*document_id=*/4, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc4_hit3(/*section_id=*/2, /*document_id=*/4, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
// Create terms
// Foo
@@ -590,11 +602,12 @@ TEST_F(
TEST_F(LiteIndexTest, LiteIndexIterator) {
// Set up LiteIndex and TermIdCodec
std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
- // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs.
- LiteIndex::Options options(lite_index_file_name,
- /*hit_buffer_want_merge_bytes=*/1024 * 1024,
- /*hit_buffer_sort_at_indexing=*/true,
- /*hit_buffer_sort_threshold_bytes=*/64);
+ // The unsorted tail can contain a max of 8 TermIdHitPairs.
+ LiteIndex::Options options(
+ lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/true,
+ /*hit_buffer_sort_threshold_bytes=*/sizeof(TermIdHitPair) * 8);
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index,
LiteIndex::Create(options, &icing_filesystem_));
ICING_ASSERT_OK_AND_ASSIGN(
@@ -610,9 +623,9 @@ TEST_F(LiteIndexTest, LiteIndexIterator) {
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/3,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc0_hit1(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/5,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
SectionIdMask doc0_section_id_mask = 0b11;
std::unordered_map<SectionId, Hit::TermFrequency>
expected_section_ids_tf_map0 = {{0, 3}, {1, 5}};
@@ -620,9 +633,9 @@ TEST_F(LiteIndexTest, LiteIndexIterator) {
ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit1));
Hit doc1_hit1(/*section_id=*/1, /*document_id=*/1, /*term_frequency=*/7,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc1_hit2(/*section_id=*/2, /*document_id=*/1, /*term_frequency=*/11,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
SectionIdMask doc1_section_id_mask = 0b110;
std::unordered_map<SectionId, Hit::TermFrequency>
expected_section_ids_tf_map1 = {{1, 7}, {2, 11}};
@@ -658,11 +671,12 @@ TEST_F(LiteIndexTest, LiteIndexIterator) {
TEST_F(LiteIndexTest, LiteIndexIterator_sortAtIndexingDisabled) {
// Set up LiteIndex and TermIdCodec
std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
- // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs.
- LiteIndex::Options options(lite_index_file_name,
- /*hit_buffer_want_merge_bytes=*/1024 * 1024,
- /*hit_buffer_sort_at_indexing=*/false,
- /*hit_buffer_sort_threshold_bytes=*/64);
+ // The unsorted tail can contain a max of 8 TermIdHitPairs.
+ LiteIndex::Options options(
+ lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/false,
+ /*hit_buffer_sort_threshold_bytes=*/sizeof(TermIdHitPair) * 8);
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index,
LiteIndex::Create(options, &icing_filesystem_));
ICING_ASSERT_OK_AND_ASSIGN(
@@ -678,9 +692,9 @@ TEST_F(LiteIndexTest, LiteIndexIterator_sortAtIndexingDisabled) {
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/3,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc0_hit1(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/5,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
SectionIdMask doc0_section_id_mask = 0b11;
std::unordered_map<SectionId, Hit::TermFrequency>
expected_section_ids_tf_map0 = {{0, 3}, {1, 5}};
@@ -688,9 +702,9 @@ TEST_F(LiteIndexTest, LiteIndexIterator_sortAtIndexingDisabled) {
ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit1));
Hit doc1_hit1(/*section_id=*/1, /*document_id=*/1, /*term_frequency=*/7,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit doc1_hit2(/*section_id=*/2, /*document_id=*/1, /*term_frequency=*/11,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
SectionIdMask doc1_section_id_mask = 0b110;
std::unordered_map<SectionId, Hit::TermFrequency>
expected_section_ids_tf_map1 = {{1, 7}, {2, 11}};
@@ -726,11 +740,12 @@ TEST_F(LiteIndexTest, LiteIndexIterator_sortAtIndexingDisabled) {
TEST_F(LiteIndexTest, LiteIndexHitBufferSize) {
// Set up LiteIndex and TermIdCodec
std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
- // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs.
- LiteIndex::Options options(lite_index_file_name,
- /*hit_buffer_want_merge_bytes=*/1024 * 1024,
- /*hit_buffer_sort_at_indexing=*/true,
- /*hit_buffer_sort_threshold_bytes=*/64);
+ // The unsorted tail can contain a max of 8 TermIdHitPairs.
+ LiteIndex::Options options(
+ lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/true,
+ /*hit_buffer_sort_threshold_bytes=*/sizeof(TermIdHitPair) * 8);
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index,
LiteIndex::Create(options, &icing_filesystem_));
ICING_ASSERT_OK_AND_ASSIGN(
@@ -746,9 +761,9 @@ TEST_F(LiteIndexTest, LiteIndexHitBufferSize) {
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/3,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit hit1(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/5,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, hit0));
ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, hit1));
diff --git a/icing/index/lite/lite-index_thread-safety_test.cc b/icing/index/lite/lite-index_thread-safety_test.cc
index 53aa6cd..a73ca28 100644
--- a/icing/index/lite/lite-index_thread-safety_test.cc
+++ b/icing/index/lite/lite-index_thread-safety_test.cc
@@ -13,15 +13,25 @@
// limitations under the License.
#include <array>
+#include <cstdint>
+#include <memory>
#include <string>
+#include <string_view>
#include <thread>
#include <vector>
#include "gmock/gmock.h"
#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/hit/hit.h"
#include "icing/index/lite/lite-index.h"
#include "icing/index/term-id-codec.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/namespace-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
@@ -109,9 +119,11 @@ TEST_F(LiteIndexThreadSafetyTest, SimultaneousFetchHits_singleTerm) {
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0,
- Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
Hit doc_hit1(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId1,
- Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0));
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1));
@@ -155,9 +167,11 @@ TEST_F(LiteIndexThreadSafetyTest, SimultaneousFetchHits_multipleTerms) {
ICING_ASSERT_OK_AND_ASSIGN(uint32_t term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0,
- Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
Hit doc_hit1(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId1,
- Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit0));
ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit1));
}
@@ -208,7 +222,8 @@ TEST_F(LiteIndexThreadSafetyTest, SimultaneousAddHitAndFetchHits_singleTerm) {
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0,
- Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0));
// Create kNumThreads threads. Every even-numbered thread calls FetchHits and
@@ -228,7 +243,8 @@ TEST_F(LiteIndexThreadSafetyTest, SimultaneousAddHitAndFetchHits_singleTerm) {
} else {
// Odd-numbered thread calls AddHit.
Hit doc_hit(/*section_id=*/thread_id / 2, /*document_id=*/kDocumentId0,
- Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit));
}
};
@@ -273,7 +289,8 @@ TEST_F(LiteIndexThreadSafetyTest,
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0,
- Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0));
// Create kNumThreads threads. Every even-numbered thread calls FetchHits and
@@ -302,7 +319,8 @@ TEST_F(LiteIndexThreadSafetyTest,
// Odd-numbered thread calls AddHit.
// AddHit to section 0 of a new doc.
Hit doc_hit(/*section_id=*/kSectionId0, /*document_id=*/thread_id / 2,
- Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit));
}
};
@@ -335,9 +353,11 @@ TEST_F(LiteIndexThreadSafetyTest, ManyAddHitAndOneFetchHits_multipleTerms) {
ICING_ASSERT_OK_AND_ASSIGN(uint32_t term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0,
- Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
Hit doc_hit1(/*section_id=*/kSectionId1, /*document_id=*/kDocumentId0,
- Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit0));
ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit1));
}
@@ -370,7 +390,7 @@ TEST_F(LiteIndexThreadSafetyTest, ManyAddHitAndOneFetchHits_multipleTerms) {
// AddHit to section (thread_id % 5 + 1) of doc 0.
Hit doc_hit(/*section_id=*/thread_id % 5 + 1,
/*document_id=*/kDocumentId0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit));
}
};
diff --git a/icing/index/lite/term-id-hit-pair.h b/icing/index/lite/term-id-hit-pair.h
index 82bd010..760aa76 100644
--- a/icing/index/lite/term-id-hit-pair.h
+++ b/icing/index/lite/term-id-hit-pair.h
@@ -15,25 +15,22 @@
#ifndef ICING_INDEX_TERM_ID_HIT_PAIR_H_
#define ICING_INDEX_TERM_ID_HIT_PAIR_H_
+#include <array>
#include <cstdint>
-#include <limits>
-#include <memory>
-#include <string>
-#include <vector>
#include "icing/index/hit/hit.h"
-#include "icing/util/bit-util.h"
namespace icing {
namespace lib {
class TermIdHitPair {
public:
- // Layout bits: 24 termid + 32 hit value + 8 hit term frequency.
- using Value = uint64_t;
+ // Layout bits: 24 termid + 32 hit value + 8 hit flags + 8 hit term frequency.
+ using Value = std::array<uint8_t, 9>;
static constexpr int kTermIdBits = 24;
static constexpr int kHitValueBits = sizeof(Hit::Value) * 8;
+ static constexpr int kHitFlagsBits = sizeof(Hit::Flags) * 8;
static constexpr int kHitTermFrequencyBits = sizeof(Hit::TermFrequency) * 8;
static const Value kInvalidValue;
@@ -41,33 +38,48 @@ class TermIdHitPair {
explicit TermIdHitPair(Value v = kInvalidValue) : value_(v) {}
TermIdHitPair(uint32_t term_id, const Hit& hit) {
- static_assert(kTermIdBits + kHitValueBits + kHitTermFrequencyBits <=
- sizeof(Value) * 8,
- "TermIdHitPairTooBig");
-
- value_ = 0;
- // Term id goes into the most significant bits because it takes
- // precedent in sorts.
- bit_util::BitfieldSet(term_id, kHitValueBits + kHitTermFrequencyBits,
- kTermIdBits, &value_);
- bit_util::BitfieldSet(hit.value(), kHitTermFrequencyBits, kHitValueBits,
- &value_);
- bit_util::BitfieldSet(hit.term_frequency(), 0, kHitTermFrequencyBits,
- &value_);
+ static_assert(
+ kTermIdBits + kHitValueBits + kHitFlagsBits + kHitTermFrequencyBits <=
+ sizeof(Value) * 8,
+ "TermIdHitPairTooBig");
+
+ // Set term id. Term id takes 3 bytes and goes into value_[0:2] (most
+ // significant bits) because it takes precedence in sorts.
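+ // Bytes are stored most-significant first so that byte-wise comparison of
+ // value_ preserves the ordering of the old uint64_t packing.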
+ value_[0] = static_cast<uint8_t>((term_id >> 16) & 0xff);
+ value_[1] = static_cast<uint8_t>((term_id >> 8) & 0xff);
+ value_[2] = static_cast<uint8_t>((term_id >> 0) & 0xff);
+
+ // Set hit value. Hit value takes 4 bytes and goes into value_[3:6]
+ value_[3] = static_cast<uint8_t>((hit.value() >> 24) & 0xff);
+ value_[4] = static_cast<uint8_t>((hit.value() >> 16) & 0xff);
+ value_[5] = static_cast<uint8_t>((hit.value() >> 8) & 0xff);
+ value_[6] = static_cast<uint8_t>((hit.value() >> 0) & 0xff);
+
+ // Set flags in value_[7].
+ value_[7] = hit.flags();
+
+ // Set term frequency in value_[8].
+ value_[8] = hit.term_frequency();
}
uint32_t term_id() const {
- return bit_util::BitfieldGet(value_, kHitValueBits + kHitTermFrequencyBits,
- kTermIdBits);
+ return (static_cast<uint32_t>(value_[0]) << 16) |
+ (static_cast<uint32_t>(value_[1]) << 8) |
+ (static_cast<uint32_t>(value_[2]) << 0);
}
Hit hit() const {
- return Hit(
- bit_util::BitfieldGet(value_, kHitTermFrequencyBits, kHitValueBits),
- bit_util::BitfieldGet(value_, 0, kHitTermFrequencyBits));
+ Hit::Value hit_value = (static_cast<uint32_t>(value_[3]) << 24) |
+ (static_cast<uint32_t>(value_[4]) << 16) |
+ (static_cast<uint32_t>(value_[5]) << 8) |
+ (static_cast<uint32_t>(value_[6]) << 0);
+ Hit::Flags hit_flags = value_[7];
+ Hit::TermFrequency term_frequency = value_[8];
+
+ return Hit(hit_value, hit_flags, term_frequency);
}
- Value value() const { return value_; }
+ const Value& value() const { return value_; }
bool operator==(const TermIdHitPair& rhs) const {
return value_ == rhs.value_;
diff --git a/icing/index/lite/term-id-hit-pair_test.cc b/icing/index/lite/term-id-hit-pair_test.cc
new file mode 100644
index 0000000..28855b4
--- /dev/null
+++ b/icing/index/lite/term-id-hit-pair_test.cc
@@ -0,0 +1,95 @@
+// Copyright (C) 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/lite/term-id-hit-pair.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/index/hit/hit.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+
+static constexpr DocumentId kSomeDocumentId = 24;
+static constexpr SectionId kSomeSectionId = 5;
+static constexpr Hit::TermFrequency kSomeTermFrequency = 57;
+static constexpr uint32_t kSomeTermId = 129;
+static constexpr uint32_t kSomeSmallerTermId = 1;
+static constexpr uint32_t kSomeLargerTermId = 0b101010101111111100000001;
+
+TEST(TermIdHitPairTest, Accessors) {
+ Hit hit1(kSomeSectionId, kSomeDocumentId, kSomeTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ Hit hit2(kSomeSectionId, kSomeDocumentId, kSomeTermFrequency,
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
+ Hit invalid_hit(Hit::kInvalidValue);
+
+ TermIdHitPair term_id_hit_pair_1(kSomeTermId, hit1);
+ EXPECT_THAT(term_id_hit_pair_1.term_id(), Eq(kSomeTermId));
+ EXPECT_THAT(term_id_hit_pair_1.hit(), Eq(hit1));
+
+ TermIdHitPair term_id_hit_pair_2(kSomeLargerTermId, hit2);
+ EXPECT_THAT(term_id_hit_pair_2.term_id(), Eq(kSomeLargerTermId));
+ EXPECT_THAT(term_id_hit_pair_2.hit(), Eq(hit2));
+
+ TermIdHitPair term_id_hit_pair_3(kSomeTermId, invalid_hit);
+ EXPECT_THAT(term_id_hit_pair_3.term_id(), Eq(kSomeTermId));
+ EXPECT_THAT(term_id_hit_pair_3.hit(), Eq(invalid_hit));
+}
+
+TEST(TermIdHitPairTest, Comparison) {
+ Hit hit(kSomeSectionId, kSomeDocumentId, kSomeTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ Hit smaller_hit(/*section_id=*/1, /*document_id=*/100, /*term_frequency=*/1,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+
+ TermIdHitPair term_id_hit_pair(kSomeTermId, hit);
+ TermIdHitPair term_id_hit_pair_equal(kSomeTermId, hit);
+ TermIdHitPair term_id_hit_pair_smaller_hit(kSomeTermId, smaller_hit);
+ TermIdHitPair term_id_hit_pair_smaller_term_id(kSomeSmallerTermId, hit);
+ TermIdHitPair term_id_hit_pair_larger_term_id(kSomeLargerTermId, hit);
+ TermIdHitPair term_id_hit_pair_smaller_term_id_and_hit(kSomeSmallerTermId,
+ smaller_hit);
+
+ std::vector<TermIdHitPair> term_id_hit_pairs{
+ term_id_hit_pair,
+ term_id_hit_pair_equal,
+ term_id_hit_pair_smaller_hit,
+ term_id_hit_pair_smaller_term_id,
+ term_id_hit_pair_larger_term_id,
+ term_id_hit_pair_smaller_term_id_and_hit};
+ std::sort(term_id_hit_pairs.begin(), term_id_hit_pairs.end());
+ EXPECT_THAT(term_id_hit_pairs,
+ ElementsAre(term_id_hit_pair_smaller_term_id_and_hit,
+ term_id_hit_pair_smaller_term_id,
+ term_id_hit_pair_smaller_hit, term_id_hit_pair_equal,
+ term_id_hit_pair, term_id_hit_pair_larger_term_id));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/main-index-merger.cc b/icing/index/main/main-index-merger.cc
index c26a6d7..cc130c2 100644
--- a/icing/index/main/main-index-merger.cc
+++ b/icing/index/main/main-index-merger.cc
@@ -14,14 +14,20 @@
#include "icing/index/main/main-index-merger.h"
+#include <algorithm>
#include <cstdint>
#include <cstring>
-#include <memory>
#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
-#include "icing/file/posting_list/index-block.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/lite/lite-index.h"
#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/index/main/main-index.h"
#include "icing/index/term-id-codec.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/util/logging.h"
diff --git a/icing/index/main/main-index-merger_test.cc b/icing/index/main/main-index-merger_test.cc
index 37e14fc..333e338 100644
--- a/icing/index/main/main-index-merger_test.cc
+++ b/icing/index/main/main-index-merger_test.cc
@@ -13,19 +13,23 @@
// limitations under the License.
#include "icing/index/main/main-index-merger.h"
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
-#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/filesystem.h"
-#include "icing/index/iterator/doc-hit-info-iterator.h"
-#include "icing/index/main/doc-hit-info-iterator-term-main.h"
-#include "icing/index/main/main-index-merger.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/lite/lite-index.h"
+#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/main/main-index.h"
#include "icing/index/term-id-codec.h"
-#include "icing/index/term-property-id.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
-#include "icing/schema/section.h"
#include "icing/store/namespace-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
@@ -89,10 +93,10 @@ TEST_F(MainIndexMergerTest, TranslateTermNotAdded) {
term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
// 2. Build up a fake LexiconMergeOutputs
@@ -128,10 +132,10 @@ TEST_F(MainIndexMergerTest, PrefixExpansion) {
term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
// 2. Build up a fake LexiconMergeOutputs
@@ -191,11 +195,11 @@ TEST_F(MainIndexMergerTest, DedupePrefixAndExactWithDifferentTermFrequencies) {
term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0,
- Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/true);
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/true,
+ /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit));
// 2. Build up a fake LexiconMergeOutputs
@@ -255,10 +259,10 @@ TEST_F(MainIndexMergerTest, DedupeWithExactSameTermFrequencies) {
term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit));
// The prefix hit should take the sum, 57 + 57 = 114, as its term_frequency.
Hit prefix_foo_doc0_hit(/*section_id=*/0, /*document_id=*/0,
@@ -320,11 +324,11 @@ TEST_F(MainIndexMergerTest, DedupePrefixExpansion) {
Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0,
/*term_frequency=*/Hit::kMaxTermFrequency,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
Hit fool_doc0_hit(/*section_id=*/0, /*document_id=*/0,
Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, fool_doc0_hit));
// 2. Build up a fake LexiconMergeOutputs
diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc
index aae60c6..85ee4dc 100644
--- a/icing/index/main/main-index.cc
+++ b/icing/index/main/main-index.cc
@@ -651,7 +651,7 @@ libtextclassifier3::Status MainIndex::AddPrefixBackfillHits(
ICING_ASSIGN_OR_RETURN(tmp, backfill_accessor->GetNextHitsBatch());
}
- Hit last_added_hit;
+ Hit last_added_hit(Hit::kInvalidValue);
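+ // An invalid hit marks that nothing has been added to the new posting list
+ // yet.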
// The hits in backfill_hits are in the reverse order of how they were added.
// Iterate in reverse to add them to this new posting list in the correct
// order.
diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc
index fa96e6c..db9dbe2 100644
--- a/icing/index/main/main-index_test.cc
+++ b/icing/index/main/main-index_test.cc
@@ -14,23 +14,33 @@
#include "icing/index/main/main-index.h"
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
-#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/hit/hit.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/lite/lite-index.h"
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/main/doc-hit-info-iterator-term-main.h"
#include "icing/index/main/main-index-merger.h"
#include "icing/index/term-id-codec.h"
-#include "icing/index/term-property-id.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mock-filesystem.h"
#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
#include "icing/store/namespace-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
@@ -152,7 +162,7 @@ TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsValidAccessor) {
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
// 2. Create the main index. It should have no entries in its lexicon.
@@ -178,7 +188,7 @@ TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsNotFound) {
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
// 2. Create the main index. It should have no entries in its lexicon.
@@ -217,7 +227,7 @@ TEST_F(MainIndexTest, MainIndexGetAccessorForExactReturnsValidAccessor) {
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
// 2. Create the main index. It should have no entries in its lexicon.
@@ -254,18 +264,18 @@ TEST_F(MainIndexTest, MergeIndexToEmpty) {
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit));
ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit));
Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit));
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit));
ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit));
@@ -332,18 +342,18 @@ TEST_F(MainIndexTest, MergeIndexToPreexisting) {
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit));
ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit));
Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit));
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit));
ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit));
@@ -387,14 +397,14 @@ TEST_F(MainIndexTest, MergeIndexToPreexisting) {
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc3_hit(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc3_hit));
ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc3_hit));
ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc3_hit));
ICING_ASSERT_OK(lite_index_->AddHit(fall_term_id, doc3_hit));
Hit doc4_hit(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc4_hit));
ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc4_hit));
@@ -449,15 +459,15 @@ TEST_F(MainIndexTest, ExactRetrievedInPrefixSearch) {
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit));
Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc2_hit));
// 2. Create the main index. It should have no entries in its lexicon.
@@ -500,15 +510,15 @@ TEST_F(MainIndexTest, PrefixNotRetrievedInExactSearch) {
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit));
Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc2_hit));
// 2. Create the main index. It should have no entries in its lexicon.
@@ -554,17 +564,17 @@ TEST_F(MainIndexTest,
document_id % Hit::kMaxTermFrequency + 1);
Hit doc_hit0(
/*section_id=*/0, /*document_id=*/document_id, term_frequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit0));
Hit doc_hit1(
/*section_id=*/1, /*document_id=*/document_id, term_frequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit1));
Hit doc_hit2(
/*section_id=*/2, /*document_id=*/document_id, term_frequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit2));
}
@@ -619,7 +629,7 @@ TEST_F(MainIndexTest, MergeIndexBackfilling) {
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/true);
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit));
// 2. Create the main index. It should have no entries in its lexicon.
@@ -648,7 +658,7 @@ TEST_F(MainIndexTest, MergeIndexBackfilling) {
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit));
// 5. Merge the index. The main index should now contain "fool", "foot"
@@ -682,7 +692,7 @@ TEST_F(MainIndexTest, OneHitInTheFirstPageForTwoPagesMainIndex) {
uint32_t num_docs = 2038;
for (DocumentId document_id = 0; document_id < num_docs; ++document_id) {
Hit doc_hit(section_id, document_id, Hit::kDefaultTermFrequency,
- /*is_in_prefix_section=*/false);
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit));
}
diff --git a/icing/index/main/posting-list-hit-accessor_test.cc b/icing/index/main/posting-list-hit-accessor_test.cc
index 1127814..c2460ff 100644
--- a/icing/index/main/posting-list-hit-accessor_test.cc
+++ b/icing/index/main/posting-list-hit-accessor_test.cc
@@ -15,14 +15,19 @@
#include "icing/index/main/posting-list-hit-accessor.h"
#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/file/filesystem.h"
#include "icing/file/posting_list/flash-index-storage.h"
-#include "icing/file/posting_list/index-block.h"
+#include "icing/file/posting_list/posting-list-accessor.h"
+#include "icing/file/posting_list/posting-list-common.h"
#include "icing/file/posting_list/posting-list-identifier.h"
-#include "icing/file/posting_list/posting-list-used.h"
#include "icing/index/hit/hit.h"
#include "icing/index/main/posting-list-hit-serializer.h"
#include "icing/testing/common-matchers.h"
@@ -102,7 +107,8 @@ TEST_F(PostingListHitAccessorTest, PreexistingPLKeepOnSameBlock) {
PostingListHitAccessor::Create(flash_index_storage_.get(),
serializer_.get()));
// Add a single hit. This will fit in a min-sized posting list.
- Hit hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency);
+ Hit hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(pl_accessor->PrependHit(hit1));
PostingListAccessor::FinalizeResult result1 =
std::move(*pl_accessor).Finalize();
@@ -139,12 +145,12 @@ TEST_F(PostingListHitAccessorTest, PreexistingPLReallocateToLargerPL) {
std::unique_ptr<PostingListHitAccessor> pl_accessor,
PostingListHitAccessor::Create(flash_index_storage_.get(),
serializer_.get()));
- // The smallest posting list size is 15 bytes. The first four hits will be
- // compressed to one byte each and will be able to fit in the 5 byte padded
- // region. The last hit will fit in one of the special hits. The posting list
- // will be ALMOST_FULL and can fit at most 2 more hits.
+ // Use a small posting list of 30 bytes. The first 17 hits will be compressed
+ // to one byte each and will be able to fit in the 18-byte padded region. The
+ // last hit will fit in one of the special hits. The posting list will be
+ // ALMOST_FULL and can fit at most 2 more hits.
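+ // (30 bytes = two 6-byte special hits plus the 18-byte padded region;
+ // sizeof(Hit) grows to 6 bytes with the flags field.)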
std::vector<Hit> hits1 =
- CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ CreateHits(/*num_hits=*/18, /*desired_byte_length=*/1);
for (const Hit& hit : hits1) {
ICING_ASSERT_OK(pl_accessor->PrependHit(hit));
}
@@ -160,10 +166,9 @@ TEST_F(PostingListHitAccessorTest, PreexistingPLReallocateToLargerPL) {
pl_accessor,
PostingListHitAccessor::CreateFromExisting(
flash_index_storage_.get(), serializer_.get(), result1.id));
- // The current posting list can fit at most 2 more hits. Adding 12 more hits
- // should result in these hits being moved to a larger posting list.
+ // The current posting list can fit at most 2 more hits.
std::vector<Hit> hits2 = CreateHits(
- /*start_docid=*/hits1.back().document_id() + 1, /*num_hits=*/12,
+ /*last_hit=*/hits1.back(), /*num_hits=*/2,
/*desired_byte_length=*/1);
for (const Hit& hit : hits2) {
@@ -172,18 +177,36 @@ TEST_F(PostingListHitAccessorTest, PreexistingPLReallocateToLargerPL) {
PostingListAccessor::FinalizeResult result2 =
std::move(*pl_accessor).Finalize();
ICING_EXPECT_OK(result2.status);
+ // The 2 hits should still fit on the first block.
+ EXPECT_THAT(result1.id.block_index(), Eq(1));
+ EXPECT_THAT(result1.id.posting_list_index(), Eq(0));
+
+ // Add one more hit.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor,
+ PostingListHitAccessor::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), result2.id));
+ // The current posting list should be FULL. Adding more hits should result in
+ // these hits being moved to a larger posting list.
+ Hit single_hit =
+ CreateHit(/*last_hit=*/hits2.back(), /*desired_byte_length=*/1);
+ ICING_ASSERT_OK(pl_accessor->PrependHit(single_hit));
+ PostingListAccessor::FinalizeResult result3 =
+ std::move(*pl_accessor).Finalize();
+ ICING_EXPECT_OK(result3.status);
// Should have been allocated to the second (new) block because the posting
// list should have grown beyond the size that the first block maintains.
- EXPECT_THAT(result2.id.block_index(), Eq(2));
- EXPECT_THAT(result2.id.posting_list_index(), Eq(0));
+ EXPECT_THAT(result3.id.block_index(), Eq(2));
+ EXPECT_THAT(result3.id.posting_list_index(), Eq(0));
- // The posting list at result2.id should hold all of the hits that have been
+ // The posting list at result3.id should hold all of the hits that have been
// added.
for (const Hit& hit : hits2) {
hits1.push_back(hit);
}
+ hits1.push_back(single_hit);
ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
- flash_index_storage_->GetPostingList(result2.id));
+ flash_index_storage_->GetPostingList(result3.id));
EXPECT_THAT(serializer_->GetHits(&pl_holder.posting_list),
IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
}
@@ -307,7 +330,7 @@ TEST_F(PostingListHitAccessorTest, InvalidHitReturnsInvalidArgument) {
std::unique_ptr<PostingListHitAccessor> pl_accessor,
PostingListHitAccessor::Create(flash_index_storage_.get(),
serializer_.get()));
- Hit invalid_hit;
+ Hit invalid_hit(Hit::kInvalidValue);
EXPECT_THAT(pl_accessor->PrependHit(invalid_hit),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
@@ -317,14 +340,17 @@ TEST_F(PostingListHitAccessorTest, HitsNotDecreasingReturnsInvalidArgument) {
std::unique_ptr<PostingListHitAccessor> pl_accessor,
PostingListHitAccessor::Create(flash_index_storage_.get(),
serializer_.get()));
- Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultTermFrequency);
+ Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(pl_accessor->PrependHit(hit1));
- Hit hit2(/*section_id=*/6, /*document_id=*/1, Hit::kDefaultTermFrequency);
+ Hit hit2(/*section_id=*/6, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
EXPECT_THAT(pl_accessor->PrependHit(hit2),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- Hit hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency);
+ Hit hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
EXPECT_THAT(pl_accessor->PrependHit(hit3),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
@@ -345,7 +371,8 @@ TEST_F(PostingListHitAccessorTest, PreexistingPostingListNoHitsAdded) {
std::unique_ptr<PostingListHitAccessor> pl_accessor,
PostingListHitAccessor::Create(flash_index_storage_.get(),
serializer_.get()));
- Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultTermFrequency);
+ Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(pl_accessor->PrependHit(hit1));
PostingListAccessor::FinalizeResult result1 =
std::move(*pl_accessor).Finalize();
diff --git a/icing/index/main/posting-list-hit-serializer.cc b/icing/index/main/posting-list-hit-serializer.cc
index e14a0c0..88c0754 100644
--- a/icing/index/main/posting-list-hit-serializer.cc
+++ b/icing/index/main/posting-list-hit-serializer.cc
@@ -19,8 +19,11 @@
#include <limits>
#include <vector>
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/index/hit/hit.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/index/icing-bit-util.h"
#include "icing/util/logging.h"
@@ -35,6 +38,10 @@ uint32_t GetTermFrequencyByteSize(const Hit& hit) {
return hit.has_term_frequency() ? sizeof(Hit::TermFrequency) : 0;
}
+uint32_t GetFlagsByteSize(const Hit& hit) {
+ return hit.has_flags() ? sizeof(Hit::Flags) : 0;
+}
+
} // namespace
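For illustration, a minimal standalone sketch of how these two helpers shape a hit's encoded footprint: the value is always serialized, while the flags and term-frequency bytes each cost one byte only when present (and a term frequency implies a flags byte, per ConsumeFlagsAndTermFrequencyIfPresent below). HitFields and EncodedHitSize are hypothetical names for this sketch, not Icing API.

#include <cstdint>
#include <iostream>

// Hypothetical stand-in for the queries Hit itself answers.
struct HitFields {
  bool has_flags;           // any flag bit set (e.g. prefix-hit)?
  bool has_term_frequency;  // implies has_flags in the real format
};

// Mirrors GetFlagsByteSize/GetTermFrequencyByteSize: one byte per optional
// field that is present, on top of the always-present 4-byte value.
uint32_t EncodedHitSize(const HitFields& hit) {
  uint32_t size = 4;                      // sizeof(Hit::Value)
  if (hit.has_flags) size += 1;           // sizeof(Hit::Flags)
  if (hit.has_term_frequency) size += 1;  // sizeof(Hit::TermFrequency)
  return size;
}

int main() {
  std::cout << EncodedHitSize({false, false}) << "\n";  // 4: bare value
  std::cout << EncodedHitSize({true, false}) << "\n";   // 5: value + flags
  std::cout << EncodedHitSize({true, true}) << "\n";    // 6: full hit
  return 0;
}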
uint32_t PostingListHitSerializer::GetBytesUsed(
@@ -55,14 +62,45 @@ uint32_t PostingListHitSerializer::GetMinPostingListSizeToFit(
return posting_list_used->size_in_bytes();
}
- // In NOT_FULL status BytesUsed contains no special hits. The minimum sized
- // posting list that would be guaranteed to fit these hits would be
- // ALMOST_FULL, with kInvalidHit in special_hit(0), the uncompressed Hit in
- // special_hit(1) and the n compressed hits in the compressed region.
- // BytesUsed contains one uncompressed Hit and n compressed hits. Therefore,
- // fitting these hits into a posting list would require BytesUsed plus one
- // extra hit.
- return GetBytesUsed(posting_list_used) + sizeof(Hit);
+ // Edge case: if number of hits <= 2, and posting_list_used is NOT_FULL, we
+ // should return kMinPostingListSize as we know that two hits are able to fit
+ // in the 2 special hit positions of a min-sized posting list.
+ // - Checking this scenario requires deserializing posting_list_used, which is
+ // an expensive operation. However, this is only possible when
+ // GetBytesUsed(posting_list_used) <=
+ // sizeof(uncompressed hit) + max(delta encoded value size) +
+ // (sizeof(Hit) - sizeof(Hit::Value)), so we don't need to do this when the
+ // size exceeds this number.
+ if (GetBytesUsed(posting_list_used) <=
+ 2 * sizeof(Hit) + 5 - sizeof(Hit::Value)) {
+ // Check if we're able to get more than 2 hits from posting_list_used
+ std::vector<Hit> hits;
+ libtextclassifier3::Status status =
+ GetHitsInternal(posting_list_used, /*limit=*/3, /*pop=*/false, &hits);
+ if (status.ok() && hits.size() <= 2) {
+ return GetMinPostingListSize();
+ }
+ }
+
+ // - In NOT_FULL status, BytesUsed contains no special hits. For a posting
+ // list in the NOT_FULL state with n hits, we would have n-1 compressed hits
+ // and 1 uncompressed hit.
+ // - The minimum sized posting list that would be guaranteed to fit these hits
+ // would be FULL, but calculating the size required for the FULL posting
+ // list would require deserializing the last two added hits, so instead we
+ // will calculate the size of an ALMOST_FULL posting list to fit.
+ // - An ALMOST_FULL posting list would have kInvalidHit in special_hit(0), the
+ // full uncompressed Hit in special_hit(1), and the n-1 compressed hits in
+ // the compressed region.
+ // - Currently BytesUsed contains one uncompressed Hit and n-1 compressed
+ // hits. However, it's possible that the uncompressed Hit is not a full hit,
+ // but rather only the Hit::Value (this is the case if
+ // !hit.has_term_frequency()).
+ // - Therefore, fitting these hits into a posting list would require
+ // BytesUsed + one extra full hit + the byte difference between a full hit
+ // and Hit::Value, i.e.:
+ // BytesUsed + sizeof(Hit) + (sizeof(Hit) - sizeof(Hit::Value)).
+ return GetBytesUsed(posting_list_used) + 2 * sizeof(Hit) - sizeof(Hit::Value);
}
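To make the bounds above concrete, here is a worked version of the arithmetic as a standalone sketch. The sizes (sizeof(Hit) == 6, sizeof(Hit::Value) == 4, maximum varint delta length 5) are the ones implied elsewhere in this change; the constant names are illustrative, not Icing API.

#include <cassert>
#include <cstdint>

int main() {
  constexpr uint32_t kHitSize = 6;       // value (4) + flags (1) + tf (1)
  constexpr uint32_t kHitValueSize = 4;  // sizeof(Hit::Value)
  constexpr uint32_t kMaxDeltaLen = 5;   // longest varint for a 32-bit delta
  constexpr uint32_t kMinPostingListSize = 2 * kHitSize;
  assert(kMinPostingListSize == 12);  // two hits always fit here

  // Threshold below which the list *might* hold only two hits: one
  // uncompressed hit, one max-length delta, plus the compressed hit's
  // optional flags/tf bytes (6 + 5 + 2 == 13).
  constexpr uint32_t kTwoHitCheckThreshold =
      2 * kHitSize + kMaxDeltaLen - kHitValueSize;
  assert(kTwoHitCheckThreshold == 13);

  // General case: BytesUsed holds one uncompressed hit (possibly only its
  // value) plus n-1 deltas; an ALMOST_FULL fit needs one extra full hit plus
  // the bytes that upgrade a bare Hit::Value to a full hit.
  uint32_t bytes_used = 10;  // e.g. a 4-byte value and six 1-byte deltas
  assert(bytes_used + 2 * kHitSize - kHitValueSize == 18);
  return 0;
}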
void PostingListHitSerializer::Clear(PostingListUsed* posting_list_used) const {
@@ -160,27 +198,37 @@ libtextclassifier3::Status PostingListHitSerializer::PrependHitToAlmostFull(
// in the padded area and put new hit at the special position 1.
// Calling ValueOrDie is safe here because 1 < kNumSpecialData.
Hit cur = GetSpecialHit(posting_list_used, /*index=*/1).ValueOrDie();
- if (cur.value() <= hit.value()) {
+ if (cur < hit || cur == hit) {
return absl_ports::InvalidArgumentError(
"Hit being prepended must be strictly less than the most recent Hit");
}
- uint64_t delta = cur.value() - hit.value();
uint8_t delta_buf[VarInt::kMaxEncodedLen64];
- size_t delta_len = VarInt::Encode(delta, delta_buf);
+ size_t delta_len =
+ EncodeNextHitValue(/*next_hit_value=*/hit.value(),
+ /*curr_hit_value=*/cur.value(), delta_buf);
+ uint32_t cur_flags_bytes = GetFlagsByteSize(cur);
uint32_t cur_term_frequency_bytes = GetTermFrequencyByteSize(cur);
uint32_t pad_end = GetPadEnd(posting_list_used,
/*offset=*/kSpecialHitsSize);
- if (pad_end >= kSpecialHitsSize + delta_len + cur_term_frequency_bytes) {
- // Pad area has enough space for delta and term_frequency of existing hit
- // (cur). Write delta at pad_end - delta_len - cur_term_frequency_bytes.
+ if (pad_end >= kSpecialHitsSize + delta_len + cur_flags_bytes +
+ cur_term_frequency_bytes) {
+ // Pad area has enough space for delta, flags and term_frequency of existing
+ // hit (cur). Write delta at pad_end - delta_len - cur_flags_bytes -
+ // cur_term_frequency_bytes.
uint8_t* delta_offset = posting_list_used->posting_list_buffer() + pad_end -
- delta_len - cur_term_frequency_bytes;
+ delta_len - cur_flags_bytes -
+ cur_term_frequency_bytes;
memcpy(delta_offset, delta_buf, delta_len);
- // Now copy term_frequency.
+
+ // Now copy flags.
+ Hit::Flags flags = cur.flags();
+ uint8_t* flags_offset = delta_offset + delta_len;
+ memcpy(flags_offset, &flags, cur_flags_bytes);
+ // Copy term_frequency.
Hit::TermFrequency term_frequency = cur.term_frequency();
- uint8_t* term_frequency_offset = delta_offset + delta_len;
+ uint8_t* term_frequency_offset = flags_offset + cur_flags_bytes;
memcpy(term_frequency_offset, &term_frequency, cur_term_frequency_bytes);
// Now first hit is the new hit, at special position 1. Safe to ignore the
@@ -231,23 +279,36 @@ libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull(
return absl_ports::FailedPreconditionError(
"Posting list is in an invalid state.");
}
+
+ // Retrieve the last added (cur) hit's value and flags and compare to the hit
+ // we're adding.
Hit::Value cur_value;
- memcpy(&cur_value, posting_list_used->posting_list_buffer() + offset,
- sizeof(Hit::Value));
- if (cur_value <= hit.value()) {
+ uint8_t* cur_value_offset = posting_list_used->posting_list_buffer() + offset;
+ memcpy(&cur_value, cur_value_offset, sizeof(Hit::Value));
+ Hit::Flags cur_flags = Hit::kNoEnabledFlags;
+ if (GetFlagsByteSize(Hit(cur_value)) > 0) {
+ uint8_t* cur_flags_offset = cur_value_offset + sizeof(Hit::Value);
+ memcpy(&cur_flags, cur_flags_offset, sizeof(Hit::Flags));
+ }
+ // Term-frequency is not used for hit comparison so it's ok to pass in the
+ // default term-frequency here.
+ Hit cur(cur_value, cur_flags, Hit::kDefaultTermFrequency);
+ if (cur < hit || cur == hit) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Hit %d being prepended must be strictly less than the most recent "
- "Hit %d",
- hit.value(), cur_value));
+ "Hit (value=%d, flags=%d) being prepended must be strictly less than "
+ "the most recent Hit (value=%d, flags=%d)",
+ hit.value(), hit.flags(), cur_value, cur_flags));
}
- uint64_t delta = cur_value - hit.value();
uint8_t delta_buf[VarInt::kMaxEncodedLen64];
- size_t delta_len = VarInt::Encode(delta, delta_buf);
+ size_t delta_len =
+ EncodeNextHitValue(/*next_hit_value=*/hit.value(),
+ /*curr_hit_value=*/cur.value(), delta_buf);
+ uint32_t hit_flags_bytes = GetFlagsByteSize(hit);
uint32_t hit_term_frequency_bytes = GetTermFrequencyByteSize(hit);
// offset now points to one past the end of the first hit.
offset += sizeof(Hit::Value);
- if (kSpecialHitsSize + sizeof(Hit::Value) + delta_len +
+ if (kSpecialHitsSize + sizeof(Hit::Value) + delta_len + hit_flags_bytes +
hit_term_frequency_bytes <=
offset) {
// Enough space for delta in compressed area.
@@ -257,9 +318,9 @@ libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull(
memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf,
delta_len);
- // Prepend new hit with (possibly) its term_frequency. We know that there is
- // room for 'hit' because of the if statement above, so calling ValueOrDie
- // is safe.
+ // Prepend new hit with (possibly) its flags and term_frequency. We know
+ // that there is room for 'hit' because of the if statement above, so
+ // calling ValueOrDie is safe.
offset =
PrependHitUncompressed(posting_list_used, hit, offset).ValueOrDie();
// offset is guaranteed to be valid here. So it's safe to ignore the return
@@ -295,13 +356,13 @@ libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull(
// (i.e. varint delta encoding expanded required storage). We
// move first hit to special position 1 and put new hit in
// special position 0.
- Hit cur(cur_value);
- // offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes.
+
+ // Offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes.
// Therefore, offset must be less than kSpecialHitsSize + 5. Since posting
- // list size must be divisible by sizeof(Hit) (5), it is guaranteed that
+ // list size must be divisible by sizeof(Hit) (6), it is guaranteed that
// offset < size_in_bytes, so it is safe to ignore the return value here.
- ICING_RETURN_IF_ERROR(
- ConsumeTermFrequencyIfPresent(posting_list_used, &cur, &offset));
+ ICING_RETURN_IF_ERROR(ConsumeFlagsAndTermFrequencyIfPresent(
+ posting_list_used, &cur, &offset));
// Safe to ignore the return value of PadToEnd because offset must be less
// than posting_list_used->size_in_bytes(). Otherwise, this function
// already would have returned FAILED_PRECONDITION.
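For illustration, a self-contained sketch of the strict-ordering check used above: read the most recent hit's value, read its flags byte only when present, then reject any incoming hit that is not strictly smaller. MiniHit, its (value, flags) comparison, and the has-flags bit position are assumptions for this sketch, not Hit's real bit layout.

#include <cassert>
#include <cstdint>
#include <cstring>

struct MiniHit {
  uint32_t value;
  uint8_t flags;
  bool operator<(const MiniHit& other) const {
    return value != other.value ? value < other.value : flags < other.flags;
  }
  bool operator==(const MiniHit& other) const {
    return value == other.value && flags == other.flags;
  }
};

// Reads the uncompressed hit at the front of a buffer; the flags byte is
// present only when the (assumed) has-flags bit of the value is set.
MiniHit ReadFrontHit(const uint8_t* buf) {
  MiniHit hit{0, 0};
  memcpy(&hit.value, buf, sizeof(hit.value));
  if (hit.value & 1) {  // assumed has-flags bit
    memcpy(&hit.flags, buf + sizeof(hit.value), sizeof(hit.flags));
  }
  return hit;
}

int main() {
  uint8_t buf[5] = {0};
  uint32_t value = (42 << 1) | 1;  // assumed layout: payload | has-flags bit
  uint8_t flags = 0b11;
  memcpy(buf, &value, sizeof(value));
  memcpy(buf + sizeof(value), &flags, sizeof(flags));

  MiniHit cur = ReadFrontHit(buf);
  MiniHit incoming{value, 0b01};  // same value, smaller flags: still valid
  // Mirrors the rejection condition above: error iff cur < hit or cur == hit.
  assert(!(cur < incoming || cur == incoming));
  return 0;
}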
@@ -463,19 +524,19 @@ libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal(
offset += sizeof(Hit::Value);
} else {
// Now we have delta encoded subsequent hits. Decode and push.
- uint64_t delta;
- offset += VarInt::Decode(
- posting_list_used->posting_list_buffer() + offset, &delta);
- val += delta;
+ DecodedHitInfo decoded_hit_info = DecodeNextHitValue(
+ posting_list_used->posting_list_buffer() + offset, val);
+ offset += decoded_hit_info.encoded_size;
+ val = decoded_hit_info.hit_value;
}
Hit hit(val);
libtextclassifier3::Status status =
- ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset);
+ ConsumeFlagsAndTermFrequencyIfPresent(posting_list_used, &hit, &offset);
if (!status.ok()) {
// This posting list has been corrupted somehow. The first hit of the
- // posting list claims to have a term frequency, but there's no more room
- // in the posting list for that term frequency to exist. Return an empty
- // vector and zero to indicate no hits retrieved.
+ // posting list claims to have a term frequency or flag, but there's no
+ // more room in the posting list for that term frequency or flag to exist.
+ // Return an empty vector and zero to indicate no hits retrieved.
if (out != nullptr) {
out->clear();
}
@@ -497,10 +558,10 @@ libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal(
// In the compressed area. Pop and reconstruct. offset/val is
// the last traversed hit, which we must discard. So move one
// more forward.
- uint64_t delta;
- offset += VarInt::Decode(
- posting_list_used->posting_list_buffer() + offset, &delta);
- val += delta;
+ DecodedHitInfo decoded_hit_info = DecodeNextHitValue(
+ posting_list_used->posting_list_buffer() + offset, val);
+ offset += decoded_hit_info.encoded_size;
+ val = decoded_hit_info.hit_value;
// Now val is the first hit of the new posting list.
if (kSpecialHitsSize + sizeof(Hit::Value) <= offset) {
@@ -509,17 +570,18 @@ libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal(
memcpy(mutable_posting_list_used->posting_list_buffer() + offset, &val,
sizeof(Hit::Value));
} else {
- // val won't fit in compressed area. Also see if there is a
+ // val won't fit in compressed area. Also see if there is a flag or
// term_frequency.
Hit hit(val);
libtextclassifier3::Status status =
- ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset);
+ ConsumeFlagsAndTermFrequencyIfPresent(posting_list_used, &hit,
+ &offset);
if (!status.ok()) {
// This posting list has been corrupted somehow. The first hit of
- // the posting list claims to have a term frequency, but there's no
- // more room in the posting list for that term frequency to exist.
- // Return an empty vector and zero to indicate no hits retrieved. Do
- // not pop anything.
+ // the posting list claims to have a term frequency or flag, but
+ // there's no more room in the posting list for that term frequency or
+ // flag to exist. Return an empty vector and zero to indicate no hits
+ // retrieved. Do not pop anything.
if (out != nullptr) {
out->clear();
}
@@ -569,7 +631,7 @@ libtextclassifier3::StatusOr<Hit> PostingListHitSerializer::GetSpecialHit(
return absl_ports::InvalidArgumentError(
"Special hits only exist at indices 0 and 1");
}
- Hit val;
+ Hit val(Hit::kInvalidValue);
memcpy(&val, posting_list_used->posting_list_buffer() + index * sizeof(val),
sizeof(val));
return val;
@@ -653,11 +715,11 @@ bool PostingListHitSerializer::SetStartByteOffset(
// not_full state. Safe to ignore the return value because 0 and 1 are both
// < kNumSpecialData.
SetSpecialHit(posting_list_used, /*index=*/0, Hit(offset));
- SetSpecialHit(posting_list_used, /*index=*/1, Hit());
+ SetSpecialHit(posting_list_used, /*index=*/1, Hit(Hit::kInvalidValue));
} else if (offset == sizeof(Hit)) {
// almost_full state. Safe to ignore the return value because 1 is both <
// kNumSpecialData.
- SetSpecialHit(posting_list_used, /*index=*/0, Hit());
+ SetSpecialHit(posting_list_used, /*index=*/0, Hit(Hit::kInvalidValue));
}
// Nothing to do for the FULL state - the offset isn't actually stored
// anywhere and both special hits hold valid hits.
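A minimal sketch of the state encoding these branches imply, assuming sizeof(Hit) == 6; the enum and function names here are stand-ins, not Icing's.

#include <cassert>
#include <cstdint>

enum class PlState { kEmptyOrNotFull, kAlmostFull, kFull };

constexpr uint32_t kHitSize = 6;
constexpr uint32_t kSpecialHitsSize = 2 * kHitSize;

PlState ClassifyFromStartOffset(uint32_t offset) {
  if (offset >= kSpecialHitsSize) {
    // Both special slots are bookkeeping: slot 0 stores the offset and
    // slot 1 holds an invalid hit.
    return PlState::kEmptyOrNotFull;
  }
  if (offset == kHitSize) {
    // Slot 0 holds an invalid hit; slot 1 holds the newest uncompressed hit.
    return PlState::kAlmostFull;
  }
  // offset == 0: both special slots hold real hits and no offset is stored.
  return PlState::kFull;
}

int main() {
  assert(ClassifyFromStartOffset(24) == PlState::kEmptyOrNotFull);
  assert(ClassifyFromStartOffset(kHitSize) == PlState::kAlmostFull);
  assert(ClassifyFromStartOffset(0) == PlState::kFull);
  return 0;
}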
@@ -667,46 +729,72 @@ bool PostingListHitSerializer::SetStartByteOffset(
libtextclassifier3::StatusOr<uint32_t>
PostingListHitSerializer::PrependHitUncompressed(
PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const {
- if (hit.has_term_frequency()) {
- if (offset < kSpecialHitsSize + sizeof(Hit)) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Not enough room to prepend Hit at offset %d.", offset));
- }
- offset -= sizeof(Hit);
- memcpy(posting_list_used->posting_list_buffer() + offset, &hit,
- sizeof(Hit));
- } else {
- if (offset < kSpecialHitsSize + sizeof(Hit::Value)) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Not enough room to prepend Hit::Value at offset %d.", offset));
- }
- offset -= sizeof(Hit::Value);
- Hit::Value val = hit.value();
- memcpy(posting_list_used->posting_list_buffer() + offset, &val,
- sizeof(Hit::Value));
+ uint32_t hit_bytes_to_prepend = sizeof(Hit::Value) + GetFlagsByteSize(hit) +
+ GetTermFrequencyByteSize(hit);
+
+ if (offset < kSpecialHitsSize + hit_bytes_to_prepend) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Not enough room to prepend Hit at offset %d.", offset));
}
+
+ if (hit.has_term_frequency()) {
+ offset -= sizeof(Hit::TermFrequency);
+ Hit::TermFrequency term_frequency = hit.term_frequency();
+ memcpy(posting_list_used->posting_list_buffer() + offset, &term_frequency,
+ sizeof(Hit::TermFrequency));
+ }
+ if (hit.has_flags()) {
+ offset -= sizeof(Hit::Flags);
+ Hit::Flags flags = hit.flags();
+ memcpy(posting_list_used->posting_list_buffer() + offset, &flags,
+ sizeof(Hit::Flags));
+ }
+ offset -= sizeof(Hit::Value);
+ Hit::Value val = hit.value();
+ memcpy(posting_list_used->posting_list_buffer() + offset, &val,
+ sizeof(Hit::Value));
return offset;
}
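For illustration, a standalone sketch of the back-to-front write order above: term frequency lands at the highest address, then flags, then the always-present value. The fixed one-byte field sizes and the function shape are assumptions matching this change, not the real API.

#include <cassert>
#include <cstdint>
#include <cstring>

uint32_t PrependUncompressed(uint8_t* buf, uint32_t offset, uint32_t value,
                             bool has_flags, uint8_t flags, bool has_tf,
                             uint8_t tf) {
  if (has_tf) {  // term-frequency byte, only when present
    offset -= 1;
    memcpy(buf + offset, &tf, 1);
  }
  if (has_flags) {  // flags byte, only when present
    offset -= 1;
    memcpy(buf + offset, &flags, 1);
  }
  offset -= sizeof(uint32_t);  // the value is always serialized
  memcpy(buf + offset, &value, sizeof(uint32_t));
  return offset;  // new start offset; the value sits at the front
}

int main() {
  uint8_t buf[32] = {0};
  uint32_t offset =
      PrependUncompressed(buf, /*offset=*/32, /*value=*/1234,
                          /*has_flags=*/true, /*flags=*/0b11,
                          /*has_tf=*/true, /*tf=*/57);
  assert(offset == 32 - 6);         // a full hit consumes 6 bytes
  assert(buf[offset + 4] == 0b11);  // flags immediately after the value
  assert(buf[offset + 5] == 57);    // term frequency last
  return 0;
}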
libtextclassifier3::Status
-PostingListHitSerializer::ConsumeTermFrequencyIfPresent(
+PostingListHitSerializer::ConsumeFlagsAndTermFrequencyIfPresent(
const PostingListUsed* posting_list_used, Hit* hit,
uint32_t* offset) const {
- if (!hit->has_term_frequency()) {
- // No term frequency to consume. Everything is fine.
+ if (!hit->has_flags()) {
+ // No flags (and by extension, no term-frequency) to consume. Everything is
+ // fine.
return libtextclassifier3::Status::OK;
}
- if (*offset + sizeof(Hit::TermFrequency) >
- posting_list_used->size_in_bytes()) {
+
+ // Consume flags
+ Hit::Flags flags;
+ if (*offset + sizeof(Hit::Flags) > posting_list_used->size_in_bytes()) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "offset %d must not point past the end of the posting list of size %d.",
+ "offset %d must not point past the end of the posting list of size "
+ "%d.",
*offset, posting_list_used->size_in_bytes()));
}
- Hit::TermFrequency term_frequency;
- memcpy(&term_frequency, posting_list_used->posting_list_buffer() + *offset,
- sizeof(Hit::TermFrequency));
- *hit = Hit(hit->value(), term_frequency);
- *offset += sizeof(Hit::TermFrequency);
+ memcpy(&flags, posting_list_used->posting_list_buffer() + *offset,
+ sizeof(Hit::Flags));
+ *hit = Hit(hit->value(), flags, Hit::kDefaultTermFrequency);
+ *offset += sizeof(Hit::Flags);
+
+ if (hit->has_term_frequency()) {
+ // Consume term frequency
+ if (*offset + sizeof(Hit::TermFrequency) >
+ posting_list_used->size_in_bytes()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "offset %d must not point past the end of the posting list of size "
+ "%d.",
+ *offset, posting_list_used->size_in_bytes()));
+ }
+ Hit::TermFrequency term_frequency;
+ memcpy(&term_frequency, posting_list_used->posting_list_buffer() + *offset,
+ sizeof(Hit::TermFrequency));
+ *hit = Hit(hit->value(), flags, term_frequency);
+ *offset += sizeof(Hit::TermFrequency);
+ }
+
return libtextclassifier3::Status::OK;
}
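A self-contained sketch of the consume logic above: the flags byte is read only when the hit advertises flags, and the term-frequency byte only when the just-read flags advertise one, with both reads bounds-checked against the list size. Treating bit 0 of the flags as "has term frequency" is an assumption for this sketch.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <optional>

struct ParsedTail {
  uint8_t flags = 0;
  uint8_t term_frequency = 0;
};

// Returns nullopt on corruption, i.e. when an advertised field would lie
// past the end of the posting list.
std::optional<ParsedTail> ConsumeTail(const uint8_t* buf, uint32_t* offset,
                                      uint32_t size, bool hit_has_flags) {
  ParsedTail tail;
  if (!hit_has_flags) return tail;  // no flags implies no term frequency
  if (*offset + 1 > size) return std::nullopt;
  memcpy(&tail.flags, buf + *offset, 1);
  *offset += 1;
  if (tail.flags & 0x1) {  // assumed has-term-frequency bit
    if (*offset + 1 > size) return std::nullopt;
    memcpy(&tail.term_frequency, buf + *offset, 1);
    *offset += 1;
  }
  return tail;
}

int main() {
  uint8_t buf[2] = {0x1, 57};  // flags byte, then term-frequency byte
  uint32_t offset = 0;
  auto tail = ConsumeTail(buf, &offset, sizeof(buf), /*hit_has_flags=*/true);
  assert(tail.has_value() && tail->flags == 0x1 && tail->term_frequency == 57);
  assert(offset == 2);

  // A truncated list reports corruption instead of reading past the end.
  offset = 1;
  assert(!ConsumeTail(buf, &offset, /*size=*/1, true).has_value());
  return 0;
}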
diff --git a/icing/index/main/posting-list-hit-serializer.h b/icing/index/main/posting-list-hit-serializer.h
index 2986d9c..08c792c 100644
--- a/icing/index/main/posting-list-hit-serializer.h
+++ b/icing/index/main/posting-list-hit-serializer.h
@@ -15,6 +15,7 @@
#ifndef ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
#define ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
+#include <cstddef>
#include <cstdint>
#include <vector>
@@ -23,6 +24,7 @@
#include "icing/file/posting_list/posting-list-common.h"
#include "icing/file/posting_list/posting-list-used.h"
#include "icing/index/hit/hit.h"
+#include "icing/legacy/index/icing-bit-util.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -34,6 +36,52 @@ class PostingListHitSerializer : public PostingListSerializer {
public:
static constexpr uint32_t kSpecialHitsSize = kNumSpecialData * sizeof(Hit);
+ struct DecodedHitInfo {
+ // The decoded hit value.
+ Hit::Value hit_value;
+
+ // Size of the encoded hit in bytes.
+ size_t encoded_size;
+ };
+
+ // Given the current hit value, encodes the next hit value for serialization
+ // in the posting list.
+ //
+ // The encoded value is the varint-encoded delta between next_hit_value and
+ // curr_hit_value.
+ // - We add 1 to this delta so as to avoid getting a delta value of 0.
+ // - This allows us to add duplicate hits with the same value, which is a
+ // valid case if we need to store hits with different flags that belong in
+ // the same section-id/doc-id combo.
+ // - We cannot have an encoded hit delta with a value of 0 as 0 is currently
+ // used for padding the unused region in the posting list.
+ //
+ // REQUIRES: next_hit_value <= curr_hit_value AND
+ // curr_hit_value - next_hit_value <
+ // std::numeric_limits<Hit::Value>::max()
+ //
+ // RETURNS: next_hit_value's encoded length in bytes and writes the encoded
+ // value directly into encoded_buf_out.
+ static size_t EncodeNextHitValue(Hit::Value next_hit_value,
+ Hit::Value curr_hit_value,
+ uint8_t* encoded_buf_out) {
+ uint64_t delta = curr_hit_value - next_hit_value + 1;
+ return VarInt::Encode(delta, encoded_buf_out);
+ }
+
+ // Given the current hit value, decodes the next hit value from an encoded
+ // byte array buffer.
+ //
+ // RETURNS: DecodedHitInfo containing the decoded hit value and the value's
+ // encoded size in bytes.
+ static DecodedHitInfo DecodeNextHitValue(const uint8_t* encoded_buf_in,
+ Hit::Value curr_hit_value) {
+ uint64_t delta;
+ size_t delta_size = VarInt::Decode(encoded_buf_in, &delta);
+ Hit::Value hit_value = curr_hit_value + delta - 1;
+ return {hit_value, delta_size};
+ }
+
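For illustration, a runnable round trip of the delta + 1 scheme with stand-in varint helpers (Icing's real ones are VarInt::Encode/Decode from icing-bit-util.h). Note how a duplicate hit value still serializes as a non-zero byte, and how decoding walks from the newer (smaller) value back to the older one.

#include <cassert>
#include <cstddef>
#include <cstdint>

// Illustrative little-endian base-128 varint helpers.
size_t VarIntEncode(uint64_t value, uint8_t* buf) {
  size_t i = 0;
  for (; value >= 0x80; value >>= 7) {
    buf[i++] = static_cast<uint8_t>(value) | 0x80;
  }
  buf[i++] = static_cast<uint8_t>(value);
  return i;
}

size_t VarIntDecode(const uint8_t* buf, uint64_t* value) {
  *value = 0;
  size_t i = 0;
  for (int shift = 0;; shift += 7) {
    *value |= static_cast<uint64_t>(buf[i] & 0x7F) << shift;
    if (!(buf[i++] & 0x80)) break;
  }
  return i;
}

int main() {
  uint8_t buf[10];
  uint32_t curr = 805306368;  // hit already at the front of the list
  uint32_t next = curr;       // duplicate value being prepended

  // Encode delta + 1, so a duplicate never serializes as 0 (0 bytes are
  // reserved for padding the unused region of the posting list).
  size_t len = VarIntEncode(static_cast<uint64_t>(curr - next) + 1, buf);
  assert(len == 1 && buf[0] == 1);

  // Decode: recover the older (larger) value from the newer one.
  uint64_t delta;
  VarIntDecode(buf, &delta);
  assert(next + delta - 1 == curr);

  // A normal prepend: the value drops by 100, the wire stores 101.
  next = curr - 100;
  len = VarIntEncode(static_cast<uint64_t>(curr - next) + 1, buf);
  assert(len == 1 && buf[0] == 101);
  return 0;
}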
uint32_t GetDataTypeBytes() const override { return sizeof(Hit); }
uint32_t GetMinPostingListSize() const override {
@@ -299,15 +347,19 @@ class PostingListHitSerializer : public PostingListSerializer {
PostingListUsed* posting_list_used, const Hit& hit,
uint32_t offset) const;
- // If hit has a term frequency, consumes the term frequency at offset, updates
- // hit to include the term frequency and updates offset to reflect that the
- // term frequency has been consumed.
+ // If hit has flags and/or a term frequency field, consumes the flags and/or
+ // term frequency at offset, updates hit to include the flags and/or term
+ // frequency, and updates offset to reflect that the flags and/or term
+ // frequency fields have been consumed.
//
// RETURNS:
// - OK, if successful
- // - INVALID_ARGUMENT if hit has a term frequency and offset +
- // sizeof(Hit::TermFrequency) >= posting_list_used->size_in_bytes()
- libtextclassifier3::Status ConsumeTermFrequencyIfPresent(
+ // - INVALID_ARGUMENT if hit has flags and/or a term frequency field and
+ // offset + sizeof(Hit's flags) + sizeof(Hit's tf) >=
+ // posting_list_used->size_in_bytes(),
+ // i.e. the posting list is not large enough to consume the hit's flags
+ // and term frequency fields
+ libtextclassifier3::Status ConsumeFlagsAndTermFrequencyIfPresent(
const PostingListUsed* posting_list_used, Hit* hit,
uint32_t* offset) const;
};
diff --git a/icing/index/main/posting-list-hit-serializer_test.cc b/icing/index/main/posting-list-hit-serializer_test.cc
index 7f0b945..ea135ef 100644
--- a/icing/index/main/posting-list-hit-serializer_test.cc
+++ b/icing/index/main/posting-list-hit-serializer_test.cc
@@ -14,22 +14,32 @@
#include "icing/index/main/posting-list-hit-serializer.h"
+#include <algorithm>
+#include <cstddef>
#include <cstdint>
#include <deque>
-#include <memory>
+#include <iterator>
+#include <limits>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/index/hit/hit.h"
+#include "icing/legacy/index/icing-bit-util.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/hit-test-utils.h"
using testing::ElementsAre;
using testing::ElementsAreArray;
using testing::Eq;
+using testing::Gt;
using testing::IsEmpty;
+using testing::IsFalse;
+using testing::IsTrue;
using testing::Le;
using testing::Lt;
@@ -58,103 +68,217 @@ TEST(PostingListHitSerializerTest, PostingListUsedPrependHitNotFull) {
PostingListUsed::CreateFromUnitializedRegion(&serializer, kHitsSize));
// Make used.
- Hit hit0(/*section_id=*/0, 0, /*term_frequency=*/56);
+ Hit hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/56,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit0));
- // Size = sizeof(uncompressed hit0)
- int expected_size = sizeof(Hit);
- EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ // Size = sizeof(uncompressed hit0::Value)
+ // + sizeof(hit0::Flags)
+ // + sizeof(hit0::TermFrequency)
+ int expected_size =
+ sizeof(Hit::Value) + sizeof(Hit::Flags) + sizeof(Hit::TermFrequency);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit0)));
- Hit hit1(/*section_id=*/0, 1, Hit::kDefaultTermFrequency);
+ Hit hit1(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ uint8_t delta_buf[VarInt::kMaxEncodedLen64];
+ size_t delta_len = PostingListHitSerializer::EncodeNextHitValue(
+ /*next_hit_value=*/hit1.value(),
+ /*curr_hit_value=*/hit0.value(), delta_buf);
ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit1));
- // Size = sizeof(uncompressed hit1)
- // + sizeof(hit0-hit1) + sizeof(hit0::term_frequency)
- expected_size += 2 + sizeof(Hit::TermFrequency);
- EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ // Size = sizeof(uncompressed hit1::Value)
+ // + sizeof(hit0-hit1)
+ // + sizeof(hit0::Flags)
+ // + sizeof(hit0::TermFrequency)
+ expected_size += delta_len;
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
EXPECT_THAT(serializer.GetHits(&pl_used),
IsOkAndHolds(ElementsAre(hit1, hit0)));
- Hit hit2(/*section_id=*/0, 2, /*term_frequency=*/56);
+ Hit hit2(/*section_id=*/0, /*document_id=*/2, /*term_frequency=*/56,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ delta_len = PostingListHitSerializer::EncodeNextHitValue(
+ /*next_hit_value=*/hit2.value(),
+ /*curr_hit_value=*/hit1.value(), delta_buf);
ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit2));
- // Size = sizeof(uncompressed hit2)
+ // Size = sizeof(uncompressed hit2::Value) + sizeof(hit2::Flags)
+ // + sizeof(hit2::TermFrequency)
// + sizeof(hit1-hit2)
- // + sizeof(hit0-hit1) + sizeof(hit0::term_frequency)
- expected_size += 2;
- EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ // + sizeof(hit0-hit1)
+ // + sizeof(hit0::flags)
+ // + sizeof(hit0::term_frequency)
+ expected_size += delta_len + sizeof(Hit::Flags) + sizeof(Hit::TermFrequency);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
EXPECT_THAT(serializer.GetHits(&pl_used),
IsOkAndHolds(ElementsAre(hit2, hit1, hit0)));
- Hit hit3(/*section_id=*/0, 3, Hit::kDefaultTermFrequency);
+ Hit hit3(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ delta_len = PostingListHitSerializer::EncodeNextHitValue(
+ /*next_hit_value=*/hit3.value(),
+ /*curr_hit_value=*/hit2.value(), delta_buf);
ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit3));
- // Size = sizeof(uncompressed hit3)
- // + sizeof(hit2-hit3) + sizeof(hit2::term_frequency)
+ // Size = sizeof(uncompressed hit3::Value)
+ // + sizeof(hit2-hit3) + sizeof(hit2::Flags)
+ // + sizeof(hit2::TermFrequency)
// + sizeof(hit1-hit2)
- // + sizeof(hit0-hit1) + sizeof(hit0::term_frequency)
- expected_size += 2 + sizeof(Hit::TermFrequency);
- EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ // + sizeof(hit0-hit1)
+ // + sizeof(hit0::flags)
+ // + sizeof(hit0::term_frequency)
+ expected_size += delta_len;
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit3, hit2, hit1, hit0)));
+}
+
+TEST(PostingListHitSerializerTest,
+ PostingListUsedPrependHitAlmostFull_withFlags) {
+ PostingListHitSerializer serializer;
+
+ // Size = 24
+ int pl_size = 2 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
+
+ // Fill up the compressed region.
+ // Transitions:
+ // Adding hit0: EMPTY -> NOT_FULL
+ // Adding hit1: NOT_FULL -> NOT_FULL
+ // Adding hit2: NOT_FULL -> NOT_FULL
+ Hit hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/3);
+ Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2, /*term_frequency=*/57,
+ /*is_in_prefix_section=*/true,
+ /*is_prefix_hit=*/true);
+ EXPECT_THAT(hit2.has_flags(), IsTrue());
+ EXPECT_THAT(hit2.has_term_frequency(), IsTrue());
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit0));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit1));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit2));
+ // Size used will be 4 (hit2::Value) + 1 (hit2::Flags) + 1
+ // (hit2::TermFrequency) + 2 (hit1-hit2) + 3 (hit0-hit1) = 11 bytes
+ int expected_size = sizeof(Hit::Value) + sizeof(Hit::Flags) +
+ sizeof(Hit::TermFrequency) + 2 + 3;
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit2, hit1, hit0)));
+
+ // Add one more hit to transition NOT_FULL -> ALMOST_FULL
+ Hit hit3 =
+ CreateHit(hit2, /*desired_byte_length=*/3, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ EXPECT_THAT(hit3.has_flags(), IsFalse());
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit3));
+ // Storing them in the compressed region requires 4 (hit3::Value) + 3
+ // (hit2-hit3) + 1 (hit2::Flags) + 1 (hit2::TermFrequency) + 2 (hit1-hit2) + 3
+ // (hit0-hit1) = 14 bytes, but there are only 12 bytes in the compressed
+ // region. So instead, the posting list will transition to ALMOST_FULL.
+ // The in-use compressed region will actually shrink from 11 bytes to 10 bytes
+ // because the uncompressed version of hit2 will be overwritten with the
+ // compressed delta of hit2. hit3 will be written to one of the special hits.
+ // Because we're in ALMOST_FULL, the expected size is the size of the pl minus
+ // the one hit used to mark the posting list as ALMOST_FULL.
+ expected_size = pl_size - sizeof(Hit);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
EXPECT_THAT(serializer.GetHits(&pl_used),
IsOkAndHolds(ElementsAre(hit3, hit2, hit1, hit0)));
+
+ // Add one more hit to transition ALMOST_FULL -> ALMOST_FULL
+ Hit hit4 = CreateHit(hit3, /*desired_byte_length=*/2);
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit4));
+ // There are currently 10 bytes in use in the compressed region. hit3 will
+ // have a 2-byte delta, which fits in the compressed region. Hit3 will be
+ // moved from the special hit to the compressed region (which will have 12
+ // bytes in use after adding hit3). Hit4 will be placed in one of the special
+ // hits and the posting list will remain in ALMOST_FULL.
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit4, hit3, hit2, hit1, hit0)));
+
+ // Add one more hit to transition ALMOST_FULL -> FULL
+ Hit hit5 = CreateHit(hit4, /*desired_byte_length=*/2);
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit5));
+ // There are currently 12 bytes in use in the compressed region. hit4 will
+ // have a 2-byte delta which will not fit in the compressed region. So hit4
+ // will remain in one of the special hits and hit5 will occupy the other,
+ // making the posting list FULL.
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(pl_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit5, hit4, hit3, hit2, hit1, hit0)));
+
+ // The posting list is FULL. Adding another hit should fail.
+ Hit hit6 = CreateHit(hit5, /*desired_byte_length=*/1);
+ EXPECT_THAT(serializer.PrependHit(&pl_used, hit6),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
}
TEST(PostingListHitSerializerTest, PostingListUsedPrependHitAlmostFull) {
PostingListHitSerializer serializer;
- int size = 2 * serializer.GetMinPostingListSize();
+ // Size = 24
+ int pl_size = 2 * serializer.GetMinPostingListSize();
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
// Fill up the compressed region.
// Transitions:
// Adding hit0: EMPTY -> NOT_FULL
// Adding hit1: NOT_FULL -> NOT_FULL
// Adding hit2: NOT_FULL -> NOT_FULL
- Hit hit0(/*section_id=*/0, 0, Hit::kDefaultTermFrequency);
- Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/2);
- Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2);
+ Hit hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/3);
+ Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/3);
ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit0));
ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit1));
ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit2));
- // Size used will be 2+2+4=8 bytes
- int expected_size = sizeof(Hit::Value) + 2 + 2;
- EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ // Size used will be 4 (hit2::Value) + 3 (hit1-hit2) + 3 (hit0-hit1)
+ // = 10 bytes
+ int expected_size = sizeof(Hit::Value) + 3 + 3;
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
EXPECT_THAT(serializer.GetHits(&pl_used),
IsOkAndHolds(ElementsAre(hit2, hit1, hit0)));
// Add one more hit to transition NOT_FULL -> ALMOST_FULL
Hit hit3 = CreateHit(hit2, /*desired_byte_length=*/3);
ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit3));
- // Compressed region would be 2+2+3+4=11 bytes, but the compressed region is
- // only 10 bytes. So instead, the posting list will transition to ALMOST_FULL.
- // The in-use compressed region will actually shrink from 8 bytes to 7 bytes
+ // Storing them in the compressed region requires 4 (hit3::Value) + 3
+ // (hit2-hit3) + 3 (hit1-hit2) + 3 (hit0-hit1) = 13 bytes, but there are only
+ // 12 bytes in the compressed region. So instead, the posting list will
+ // transition to ALMOST_FULL.
+ // The in-use compressed region will actually shrink from 10 bytes to 9 bytes
// because the uncompressed version of hit2 will be overwritten with the
// compressed delta of hit2. hit3 will be written to one of the special hits.
// Because we're in ALMOST_FULL, the expected size is the size of the pl minus
// the one hit used to mark the posting list as ALMOST_FULL.
- expected_size = size - sizeof(Hit);
- EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ expected_size = pl_size - sizeof(Hit);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
EXPECT_THAT(serializer.GetHits(&pl_used),
IsOkAndHolds(ElementsAre(hit3, hit2, hit1, hit0)));
// Add one more hit to transition ALMOST_FULL -> ALMOST_FULL
Hit hit4 = CreateHit(hit3, /*desired_byte_length=*/2);
ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit4));
- // There are currently 7 bytes in use in the compressed region. hit3 will have
- // a 2-byte delta. That delta will fit in the compressed region (which will
- // now have 9 bytes in use), hit4 will be placed in one of the special hits
- // and the posting list will remain in ALMOST_FULL.
- EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ // There are currently 9 bytes in use in the compressed region. Hit3 will
+ // have a 2-byte delta, which fits in the compressed region. Hit3 will be
+ // moved from the special hit to the compressed region (which will have 11
+ // bytes in use after adding hit3). Hit4 will be placed in one of the special
+ // hits and the posting list will remain in ALMOST_FULL.
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
EXPECT_THAT(serializer.GetHits(&pl_used),
IsOkAndHolds(ElementsAre(hit4, hit3, hit2, hit1, hit0)));
// Add one more hit to transition ALMOST_FULL -> FULL
Hit hit5 = CreateHit(hit4, /*desired_byte_length=*/2);
ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit5));
- // There are currently 9 bytes in use in the compressed region. hit4 will have
- // a 2-byte delta which will not fit in the compressed region. So hit4 will
- // remain in one of the special hits and hit5 will occupy the other, making
- // the posting list FULL.
- EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(size));
+ // There are currently 11 bytes in use in the compressed region. Hit4 will
+ // have a 2-byte delta which will not fit in the compressed region. So hit4
+ // will remain in one of the special hits and hit5 will occupy the other,
+ // making the posting list FULL.
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(pl_size));
EXPECT_THAT(serializer.GetHits(&pl_used),
IsOkAndHolds(ElementsAre(hit5, hit4, hit3, hit2, hit1, hit0)));
@@ -164,9 +288,59 @@ TEST(PostingListHitSerializerTest, PostingListUsedPrependHitAlmostFull) {
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
}
+TEST(PostingListHitSerializerTest, PrependHitsWithSameValue) {
+ PostingListHitSerializer serializer;
+
+ // Size = 24
+ int pl_size = 2 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
+
+ // Fill up the compressed region.
+ Hit hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/3);
+ Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2, /*term_frequency=*/57,
+ /*is_in_prefix_section=*/true,
+ /*is_prefix_hit=*/true);
+ // Create hit3 with the same value but different flags as hit2 (hit3_flags
+ // is set to have all currently-defined flags enabled)
+ Hit::Flags hit3_flags = 0;
+ for (int i = 0; i < Hit::kNumFlagsInFlagsField; ++i) {
+ hit3_flags |= (1 << i);
+ }
+ Hit hit3(hit2.value(), /*flags=*/hit3_flags,
+ /*term_frequency=*/hit2.term_frequency());
+
+ // hit3 is larger than hit2 (its flag value is larger), and so needs to be
+ // prepended first
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit0));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit1));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit3));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit2));
+ // Posting list is now ALMOST_FULL
+ // ----------------------
+ // 23-21 delta(Hit #0)
+ // 20-19 delta(Hit #1)
+ // 18 term-frequency(Hit #2)
+ // 17 flags(Hit #2)
+ // 16 delta(Hit #2) = 0, stored as 0 + 1 = 1
+ // 15-12 <unused padding = 0>
+ // 11-6 Hit #3
+ // 5-0 kSpecialHit
+ // ----------------------
+ int bytes_used = pl_size - sizeof(Hit);
+
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(bytes_used));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit2, hit3, hit1, hit0)));
+}
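The byte diagram above can be sanity-checked with a few lines of arithmetic, assuming sizeof(Hit) == 6 as elsewhere in this change; this is an illustrative check, not test code.

#include <cassert>
#include <cstdint>

int main() {
  constexpr uint32_t kHitSize = 6;  // value (4) + flags (1) + tf (1)
  constexpr uint32_t kPlSize = 24;  // 2 * min posting list size

  // Compressed-region bytes from the diagram: delta(hit0) = 3,
  // delta(hit1) = 2, tf(hit2) = 1, flags(hit2) = 1, and delta(hit2) = 1,
  // because even a duplicate value costs one wire byte (delta 0 + 1).
  constexpr uint32_t kCompressedInUse = 3 + 2 + 1 + 1 + 1;
  static_assert(kCompressedInUse == 8, "bytes 23-16 of the diagram");

  // ALMOST_FULL reports everything except the single special hit that marks
  // the state: 24 - 6 = 18, the bytes_used the test expects.
  assert(kPlSize - kHitSize == 18);
  return 0;
}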
+
TEST(PostingListHitSerializerTest, PostingListUsedMinSize) {
PostingListHitSerializer serializer;
+ // Min size = 12
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used,
PostingListUsed::CreateFromUnitializedRegion(
@@ -176,7 +350,7 @@ TEST(PostingListHitSerializerTest, PostingListUsedMinSize) {
EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty()));
// Add a hit, PL should shift to ALMOST_FULL state
- Hit hit0(/*section_id=*/0, 0, /*term_frequency=*/0,
+ Hit hit0(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/0,
/*is_in_prefix_section=*/false,
/*is_prefix_hit=*/true);
ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit0));
@@ -185,10 +359,10 @@ TEST(PostingListHitSerializerTest, PostingListUsedMinSize) {
EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit0)));
- // Add the smallest hit possible - no term_frequency and a delta of 1. PL
- // should shift to FULL state.
- Hit hit1(/*section_id=*/0, 0, /*term_frequency=*/0,
- /*is_in_prefix_section=*/true,
+ // Add the smallest hit possible - no term_frequency, non-prefix hit and a
+ // delta of 0b10. PL should shift to FULL state.
+ Hit hit1(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/0,
+ /*is_in_prefix_section=*/false,
/*is_prefix_hit=*/false);
ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit1));
// Size = sizeof(uncompressed hit1) + sizeof(uncompressed hit0)
@@ -198,7 +372,7 @@ TEST(PostingListHitSerializerTest, PostingListUsedMinSize) {
IsOkAndHolds(ElementsAre(hit1, hit0)));
// Try to add the smallest hit possible. Should fail
- Hit hit2(/*section_id=*/0, 0, /*term_frequency=*/0,
+ Hit hit2(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/0,
/*is_in_prefix_section=*/false,
/*is_prefix_hit=*/false);
EXPECT_THAT(serializer.PrependHit(&pl_used, hit2),
@@ -212,14 +386,17 @@ TEST(PostingListHitSerializerTest,
PostingListPrependHitArrayMinSizePostingList) {
PostingListHitSerializer serializer;
- // Min Size = 10
- int size = serializer.GetMinPostingListSize();
+ // Min Size = 12
+ int pl_size = serializer.GetMinPostingListSize();
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
std::vector<HitElt> hits_in;
- hits_in.emplace_back(Hit(1, 0, Hit::kDefaultTermFrequency));
+ hits_in.emplace_back(Hit(/*section_id=*/1, /*document_id=*/0,
+ Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false));
hits_in.emplace_back(
CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
hits_in.emplace_back(
@@ -235,7 +412,7 @@ TEST(PostingListHitSerializerTest,
ICING_ASSERT_OK_AND_ASSIGN(
uint32_t num_can_prepend,
(serializer.PrependHitArray<HitElt, HitElt::get_hit>(
- &pl_used, &hits_in[0], hits_in.size(), false)));
+ &pl_used, &hits_in[0], hits_in.size(), /*keep_prepended=*/false)));
EXPECT_THAT(num_can_prepend, Eq(2));
int can_fit_hits = num_can_prepend;
@@ -243,10 +420,11 @@ TEST(PostingListHitSerializerTest,
// problem, transitioning the PL from EMPTY -> ALMOST_FULL -> FULL
const HitElt *hits_in_ptr = hits_in.data() + (hits_in.size() - 2);
ICING_ASSERT_OK_AND_ASSIGN(
- num_can_prepend, (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
- &pl_used, hits_in_ptr, can_fit_hits, false)));
+ num_can_prepend,
+ (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, hits_in_ptr, can_fit_hits, /*keep_prepended=*/false)));
EXPECT_THAT(num_can_prepend, Eq(can_fit_hits));
- EXPECT_THAT(size, Eq(serializer.GetBytesUsed(&pl_used)));
+ EXPECT_THAT(pl_size, Eq(serializer.GetBytesUsed(&pl_used)));
std::deque<Hit> hits_pushed;
std::transform(hits_in.rbegin(),
hits_in.rend() - hits_in.size() + can_fit_hits,
@@ -258,14 +436,17 @@ TEST(PostingListHitSerializerTest,
TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) {
PostingListHitSerializer serializer;
- // Size = 30
- int size = 3 * serializer.GetMinPostingListSize();
+ // Size = 36
+ int pl_size = 3 * serializer.GetMinPostingListSize();
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
std::vector<HitElt> hits_in;
- hits_in.emplace_back(Hit(1, 0, Hit::kDefaultTermFrequency));
+ hits_in.emplace_back(Hit(/*section_id=*/1, /*document_id=*/0,
+ Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false));
hits_in.emplace_back(
CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
hits_in.emplace_back(
@@ -278,14 +459,14 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) {
// The last hit is uncompressed and the four before it should only take one
// byte. Total use = 8 bytes.
// ----------------------
- // 29 delta(Hit #1)
- // 28 delta(Hit #2)
- // 27 delta(Hit #3)
- // 26 delta(Hit #4)
- // 25-22 Hit #5
- // 21-10 <unused>
- // 9-5 kSpecialHit
- // 4-0 Offset=22
+ // 35 delta(Hit #0)
+ // 34 delta(Hit #1)
+ // 33 delta(Hit #2)
+ // 32 delta(Hit #3)
+ // 31-28 Hit #4
+ // 27-12 <unused>
+ // 11-6 kSpecialHit
+ // 5-0 Offset=28
// ----------------------
int byte_size = sizeof(Hit::Value) + hits_in.size() - 1;
@@ -294,7 +475,7 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) {
ICING_ASSERT_OK_AND_ASSIGN(
uint32_t num_could_fit,
(serializer.PrependHitArray<HitElt, HitElt::get_hit>(
- &pl_used, &hits_in[0], hits_in.size(), false)));
+ &pl_used, &hits_in[0], hits_in.size(), /*keep_prepended=*/false)));
EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used)));
std::deque<Hit> hits_pushed;
@@ -316,31 +497,35 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) {
CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/3));
hits_in.emplace_back(
CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/3));
std::reverse(hits_in.begin(), hits_in.end());
- // Size increased by the deltas of these hits (1+2+1+2+3+2) = 11 bytes
+ // Size increased by the deltas of these hits (1+2+1+2+3+2+3) = 14 bytes
// ----------------------
- // 29 delta(Hit #1)
- // 28 delta(Hit #2)
- // 27 delta(Hit #3)
- // 26 delta(Hit #4)
- // 25 delta(Hit #5)
- // 24-23 delta(Hit #6)
- // 22 delta(Hit #7)
- // 21-20 delta(Hit #8)
- // 19-17 delta(Hit #9)
- // 16-15 delta(Hit #10)
- // 14-11 Hit #11
- // 10 <unused>
- // 9-5 kSpecialHit
- // 4-0 Offset=11
+ // 35 delta(Hit #0)
+ // 34 delta(Hit #1)
+ // 33 delta(Hit #2)
+ // 32 delta(Hit #3)
+ // 31 delta(Hit #4)
+ // 30-29 delta(Hit #5)
+ // 28 delta(Hit #6)
+ // 27-26 delta(Hit #7)
+ // 25-23 delta(Hit #8)
+ // 22-21 delta(Hit #9)
+ // 20-18 delta(Hit #10)
+ // 17-14 Hit #11
+ // 13-12 <unused>
+ // 11-6 kSpecialHit
+ // 5-0 Offset=14
// ----------------------
- byte_size += 11;
+ byte_size += 14;
- // Add these 6 hits. The PL is currently in the NOT_FULL state and should
+ // Add these 7 hits. The PL is currently in the NOT_FULL state and should
// remain in the NOT_FULL state.
ICING_ASSERT_OK_AND_ASSIGN(
- num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
- &pl_used, &hits_in[0], hits_in.size(), false)));
+ num_could_fit,
+ (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hits_in[0], hits_in.size(), /*keep_prepended=*/false)));
EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used)));
// All hits from hits_in were added.
@@ -353,29 +538,32 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) {
hits_in.clear();
hits_in.emplace_back(first_hit);
// ----------------------
- // 29 delta(Hit #1)
- // 28 delta(Hit #2)
- // 27 delta(Hit #3)
- // 26 delta(Hit #4)
- // 25 delta(Hit #5)
- // 24-23 delta(Hit #6)
- // 22 delta(Hit #7)
- // 21-20 delta(Hit #8)
- // 19-17 delta(Hit #9)
- // 16-15 delta(Hit #10)
- // 14-12 delta(Hit #11)
- // 11-10 <unused>
- // 9-5 Hit #12
- // 4-0 kSpecialHit
+ // 35 delta(Hit #0)
+ // 34 delta(Hit #1)
+ // 33 delta(Hit #2)
+ // 32 delta(Hit #3)
+ // 31 delta(Hit #4)
+ // 30-29 delta(Hit #5)
+ // 28 delta(Hit #6)
+ // 27-26 delta(Hit #7)
+ // 25-23 delta(Hit #8)
+ // 22-21 delta(Hit #9)
+ // 20-18 delta(Hit #10)
+ // 17-15 delta(Hit #11)
+ // 14-12 <unused>
+ // 11-6 Hit #12
+ // 5-0 kSpecialHit
// ----------------------
- byte_size = 25;
+ byte_size = 30; // 36 - 6
// Add this 1 hit. The PL is currently in the NOT_FULL state and should
// transition to the ALMOST_FULL state - even though there is still some
- // unused space.
+ // unused space. This is because the unused space (3 bytes) is less than
+ // the size of an uncompressed Hit.
ICING_ASSERT_OK_AND_ASSIGN(
- num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
- &pl_used, &hits_in[0], hits_in.size(), false)));
+ num_could_fit,
+ (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hits_in[0], hits_in.size(), /*keep_prepended=*/false)));
EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used)));
// All hits from hits_in were added.
@@ -388,37 +576,40 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) {
hits_in.clear();
hits_in.emplace_back(first_hit);
hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2));
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/3));
std::reverse(hits_in.begin(), hits_in.end());
// ----------------------
- // 29 delta(Hit #1)
- // 28 delta(Hit #2)
- // 27 delta(Hit #3)
- // 26 delta(Hit #4)
- // 25 delta(Hit #5)
- // 24-23 delta(Hit #6)
- // 22 delta(Hit #7)
- // 21-20 delta(Hit #8)
- // 19-17 delta(Hit #9)
- // 16-15 delta(Hit #10)
- // 14-12 delta(Hit #11)
- // 11 delta(Hit #12)
- // 10 <unused>
- // 9-5 Hit #13
- // 4-0 Hit #14
+ // 35 delta(Hit #0)
+ // 34 delta(Hit #1)
+ // 33 delta(Hit #2)
+ // 32 delta(Hit #3)
+ // 31 delta(Hit #4)
+ // 30-29 delta(Hit #5)
+ // 28 delta(Hit #6)
+ // 27-26 delta(Hit #7)
+ // 25-23 delta(Hit #8)
+ // 22-21 delta(Hit #9)
+ // 20-18 delta(Hit #10)
+ // 17-15 delta(Hit #11)
+ // 14 delta(Hit #12)
+ // 13-12 <unused>
+ // 11-6 Hit #13
+ // 5-0 Hit #14
// ----------------------
- // Add these 2 hits. The PL is currently in the ALMOST_FULL state. Adding the
- // first hit should keep the PL in ALMOST_FULL because the delta between Hit
- // #12 and Hit #13 (1 byte) can fit in the unused area (2 bytes). Adding the
- // second hit should tranisition to the FULL state because the delta between
- // Hit #13 and Hit #14 (2 bytes) is larger than the remaining unused area
- // (1 byte).
+ // Add these 2 hits.
+ // - The PL is currently in the ALMOST_FULL state. Adding the first hit should
+ // keep the PL in ALMOST_FULL because the delta between Hit #12 and Hit #13
+ // (1 byte) can fit in the unused area (3 bytes).
+ // - Adding the second hit should transition to the FULL state because the
+ // delta between Hit #13 and Hit #14 (3 bytes) is larger than the remaining
+ // unused area (2 bytes).
ICING_ASSERT_OK_AND_ASSIGN(
- num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
- &pl_used, &hits_in[0], hits_in.size(), false)));
+ num_could_fit,
+ (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hits_in[0], hits_in.size(), /*keep_prepended=*/false)));
EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
- EXPECT_THAT(size, Eq(serializer.GetBytesUsed(&pl_used)));
+ EXPECT_THAT(pl_size, Eq(serializer.GetBytesUsed(&pl_used)));
// All hits from hits_in were added.
std::transform(hits_in.rbegin(), hits_in.rend(),
std::front_inserter(hits_pushed), HitElt::get_hit);
@@ -431,9 +622,9 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayTooManyHits) {
static constexpr int kNumHits = 128;
static constexpr int kDeltaSize = 1;
- static constexpr int kTermFrequencySize = 1;
static constexpr size_t kHitsSize =
- ((kNumHits * (kDeltaSize + kTermFrequencySize)) / 5) * 5;
+ ((kNumHits - 2) * kDeltaSize + (2 * sizeof(Hit))) / sizeof(Hit) *
+ sizeof(Hit);
// Create an array with one too many hits
std::vector<Hit> hits_in_too_many =
@@ -442,18 +633,21 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayTooManyHits) {
for (const Hit &hit : hits_in_too_many) {
hit_elts_in_too_many.emplace_back(hit);
}
+ // Reverse so that hits are inserted in descending order
+ std::reverse(hit_elts_in_too_many.begin(), hit_elts_in_too_many.end());
+
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used,
PostingListUsed::CreateFromUnitializedRegion(
&serializer, serializer.GetMinPostingListSize()));
-
// PrependHitArray should fail because hit_elts_in_too_many is far too large
// for the minimum size pl.
ICING_ASSERT_OK_AND_ASSIGN(
uint32_t num_could_fit,
(serializer.PrependHitArray<HitElt, HitElt::get_hit>(
&pl_used, &hit_elts_in_too_many[0], hit_elts_in_too_many.size(),
- false)));
+ /*keep_prepended=*/false)));
+ ASSERT_THAT(num_could_fit, Eq(2));
ASSERT_THAT(num_could_fit, Lt(hit_elts_in_too_many.size()));
ASSERT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0));
ASSERT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty()));
@@ -464,10 +658,11 @@ TEST(PostingListHitSerializerTest, PostingListPrependHitArrayTooManyHits) {
// PrependHitArray should fail because hit_elts_in_too_many is one hit too
// large for this pl.
ICING_ASSERT_OK_AND_ASSIGN(
- num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
- &pl_used, &hit_elts_in_too_many[0],
- hit_elts_in_too_many.size(), false)));
- ASSERT_THAT(num_could_fit, Lt(hit_elts_in_too_many.size()));
+ num_could_fit,
+ (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hit_elts_in_too_many[0], hit_elts_in_too_many.size(),
+ /*keep_prepended=*/false)));
+ ASSERT_THAT(num_could_fit, Eq(hit_elts_in_too_many.size() - 1));
ASSERT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0));
ASSERT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty()));
}
@@ -476,16 +671,37 @@ TEST(PostingListHitSerializerTest,
PostingListStatusJumpFromNotFullToFullAndBack) {
PostingListHitSerializer serializer;
+ // Size = 18
const uint32_t pl_size = 3 * sizeof(Hit);
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl,
PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
- ICING_ASSERT_OK(serializer.PrependHit(&pl, Hit(Hit::kInvalidValue - 1, 0)));
+
+ Hit max_valued_hit(kMaxSectionId, kMinDocumentId, Hit::kMaxTermFrequency,
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
+ ICING_ASSERT_OK(serializer.PrependHit(&pl, max_valued_hit));
uint32_t bytes_used = serializer.GetBytesUsed(&pl);
+ ASSERT_THAT(bytes_used, Eq(sizeof(Hit::Value) + sizeof(Hit::Flags) +
+ sizeof(Hit::TermFrequency)));
// Status not full.
ASSERT_THAT(bytes_used,
Le(pl_size - PostingListHitSerializer::kSpecialHitsSize));
- ICING_ASSERT_OK(serializer.PrependHit(&pl, Hit(Hit::kInvalidValue >> 2, 0)));
+
+ Hit min_valued_hit(kMinSectionId, kMaxDocumentId, Hit::kMaxTermFrequency,
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
+ uint8_t delta_buf[VarInt::kMaxEncodedLen64];
+ size_t delta_len = PostingListHitSerializer::EncodeNextHitValue(
+ /*next_hit_value=*/min_valued_hit.value(),
+ /*curr_hit_value=*/max_valued_hit.value(), delta_buf);
+ // The compressed region available is pl_size - kSpecialHitsSize = 6 bytes.
+ // We also need to fit max_valued_hit's flags and term-frequency fields,
+ // which each take 1 byte.
+ // So we'll jump directly to FULL if the varint-encoded delta of the 2 hits >
+ // 6 - sizeof(Hit::Flags) - sizeof(Hit::TermFrequency) = 4.
+ ASSERT_THAT(delta_len, Gt(4));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl, min_valued_hit));
// Status should jump to full directly.
ASSERT_THAT(serializer.GetBytesUsed(&pl), Eq(pl_size));
ICING_ASSERT_OK(serializer.PopFrontHits(&pl, 1));
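For intuition on the Gt(4) assertion above: a base-128 varint spends one byte
per started group of 7 payload bits, so a delta spanning most of the 32-bit
hit-value range needs 5 bytes and cannot fit in the 4 compressed bytes that
remain once the flags and term-frequency bytes are reserved. A minimal
standalone sketch of that length rule (an illustration of varint encoding in
general, not Icing's EncodeNextHitValue implementation):

    #include <cstddef>
    #include <cstdint>

    // Returns the number of bytes a base-128 varint needs for value:
    // one byte per started group of 7 payload bits.
    size_t VarIntLength(uint64_t value) {
      size_t len = 1;
      while (value >>= 7) ++len;
      return len;
    }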
@@ -501,11 +717,12 @@ TEST(PostingListHitSerializerTest, DeltaOverflow) {
PostingListUsed pl,
PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
+ static const Hit::Value kMaxHitValue = std::numeric_limits<Hit::Value>::max();
static const Hit::Value kOverflow[4] = {
- Hit::kInvalidValue >> 2,
- (Hit::kInvalidValue >> 2) * 2,
- (Hit::kInvalidValue >> 2) * 3,
- Hit::kInvalidValue - 1,
+ kMaxHitValue >> 2,
+ (kMaxHitValue >> 2) * 2,
+ (kMaxHitValue >> 2) * 3,
+ kMaxHitValue - 1,
};
// Fit at least 4 ordinary values.
@@ -516,22 +733,245 @@ TEST(PostingListHitSerializerTest, DeltaOverflow) {
// Cannot fit 4 overflow values.
ICING_ASSERT_OK_AND_ASSIGN(
pl, PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
- ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[3])));
- ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[2])));
+ Hit::Flags has_term_frequency_flags = 0b1;
+ ICING_EXPECT_OK(serializer.PrependHit(
+ &pl, Hit(/*value=*/kOverflow[3], has_term_frequency_flags,
+ /*term_frequency=*/8)));
+ ICING_EXPECT_OK(serializer.PrependHit(
+ &pl, Hit(/*value=*/kOverflow[2], has_term_frequency_flags,
+ /*term_frequency=*/8)));
// Can fit only one more.
- ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[1])));
- EXPECT_THAT(serializer.PrependHit(&pl, Hit(kOverflow[0])),
+ ICING_EXPECT_OK(serializer.PrependHit(
+ &pl, Hit(/*value=*/kOverflow[1], has_term_frequency_flags,
+ /*term_frequency=*/8)));
+ EXPECT_THAT(serializer.PrependHit(
+ &pl, Hit(/*value=*/kOverflow[0], has_term_frequency_flags,
+ /*term_frequency=*/8)),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
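+ // Editorial sketch (hedged): the tests above rely on bit 0 of Hit::Flags
+ // meaning "this hit carries an explicit term-frequency byte"; that bit
+ // assignment is inferred from this file, not a documented layout.
+ TEST(PostingListHitSerializerTest, EditorialFlagsConventionSketch) {
+   constexpr Hit::Flags kHasTermFrequencyFlag = 0b1;
+   Hit flagged(/*value=*/100, kHasTermFrequencyFlag, /*term_frequency=*/8);
+   EXPECT_THAT(flagged.term_frequency(), Eq(8));
+   Hit plain(/*value=*/100);  // no flags byte, default term frequency
+   EXPECT_THAT(plain.term_frequency(), Eq(Hit::kDefaultTermFrequency));
+ }
+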
+TEST(PostingListHitSerializerTest, GetMinPostingListToFitForNotFullPL) {
+ PostingListHitSerializer serializer;
+
+ // Size = 24
+ int pl_size = 2 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
+ // Create and add some hits to make pl_used NOT_FULL
+ std::vector<Hit> hits_in =
+ CreateHits(/*num_hits=*/7, /*desired_byte_length=*/1);
+ for (const Hit &hit : hits_in) {
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit));
+ }
+ // ----------------------
+ // 23 delta(Hit #0)
+ // 22 delta(Hit #1)
+ // 21 delta(Hit #2)
+ // 20 delta(Hit #3)
+ // 19 delta(Hit #4)
+ // 18 delta(Hit #5)
+ // 17-14 Hit #6
+ // 13-12 <unused>
+ // 11-6 kSpecialHit
+ // 5-0 Offset=14
+ // ----------------------
+ int bytes_used = 10;
+
+ // Check that all hits have been inserted
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(bytes_used));
+ std::deque<Hit> hits_pushed(hits_in.rbegin(), hits_in.rend());
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
+
+ // Get the min size to fit for the hits in pl_used. Moving the hits into a
+ // posting list of this min size should make it ALMOST_FULL; from the layout
+ // below, that size is 18.
+ // ----------------------
+ // 17 delta(Hit #0)
+ // 16 delta(Hit #1)
+ // 15 delta(Hit #2)
+ // 14 delta(Hit #3)
+ // 13 delta(Hit #4)
+ // 12 delta(Hit #5)
+ // 11-6 Hit #6
+ // 5-0 kSpecialHit
+ // ----------------------
+ int expected_min_size = 18;
+ uint32_t min_size_to_fit = serializer.GetMinPostingListSizeToFit(&pl_used);
+ EXPECT_THAT(min_size_to_fit, Eq(expected_min_size));
+
+ // Also check that this min-size-to-fit posting list actually does fit all
+ // the hits and can only fit one more hit in the ALMOST_FULL state.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed min_size_to_fit_pl,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, min_size_to_fit));
+ for (const Hit &hit : hits_in) {
+ ICING_ASSERT_OK(serializer.PrependHit(&min_size_to_fit_pl, hit));
+ }
+
+ // Adding another hit to the min-size-to-fit posting list should succeed
+ Hit hit = CreateHit(hits_in.back(), /*desired_byte_length=*/1);
+ ICING_ASSERT_OK(serializer.PrependHit(&min_size_to_fit_pl, hit));
+ // Adding any other hits should fail with RESOURCE_EXHAUSTED error.
+ EXPECT_THAT(serializer.PrependHit(&min_size_to_fit_pl,
+ CreateHit(hit, /*desired_byte_length=*/1)),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+
+ // Check that all hits have been inserted and the min-fit posting list is now
+ // FULL.
+ EXPECT_THAT(serializer.GetBytesUsed(&min_size_to_fit_pl),
+ Eq(min_size_to_fit));
+ hits_pushed.emplace_front(hit);
+ EXPECT_THAT(serializer.GetHits(&min_size_to_fit_pl),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
+}
+
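+ // Editorial arithmetic check of the sizes asserted above, assuming (per the
+ // layout comments) sizeof(Hit) == 6 and 1-byte deltas:
+ //   NOT_FULL:    6 deltas + 4-byte uncompressed hit value    = 10 bytes
+ //   ALMOST_FULL: 6 deltas + 6-byte hit slot + 6-byte special = 18 bytes
+ static_assert(6 * 1 + 4 == 10, "bytes_used in the NOT_FULL layout");
+ static_assert(6 * 1 + 6 + 6 == 18, "min size to fit (ALMOST_FULL layout)");
+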
+TEST(PostingListHitSerializerTest, GetMinPostingListToFitForTwoHits) {
+ PostingListHitSerializer serializer;
+
+ // Size = 36
+ int pl_size = 3 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
+
+ // Create and add 2 hits
+ Hit first_hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/5,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+ std::vector<Hit> hits_in =
+ CreateHits(first_hit, /*num_hits=*/2, /*desired_byte_length=*/4);
+ for (const Hit &hit : hits_in) {
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit));
+ }
+ // ----------------------
+ // 35 term-frequency(Hit #0)
+ // 34 flags(Hit #0)
+ // 33-30 delta(Hit #0)
+ // 29 term-frequency(Hit #1)
+ // 28 flags(Hit #1)
+ // 27-24 Hit #1
+ // 23-12 <unused>
+ // 11-6 kSpecialHit
+ // 5-0 Offset=24
+ // ----------------------
+ int bytes_used = 12;
+
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(bytes_used));
+ std::deque<Hit> hits_pushed(hits_in.rbegin(), hits_in.rend());
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
+
+ // GetMinPostingListSizeToFit should return the min posting list size.
+ EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used),
+ Eq(serializer.GetMinPostingListSize()));
+}
+
+TEST(PostingListHitSerializerTest, GetMinPostingListToFitForThreeSmallHits) {
+ PostingListHitSerializer serializer;
+
+ // Size = 24
+ int pl_size = 2 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
+ // Create and add 3 small hits whose total size falls in the range where
+ // GetMinPostingListSizeToFit must check whether the PL holds only 2 hits.
+ std::vector<Hit> hits_in =
+ CreateHits(/*num_hits=*/3, /*desired_byte_length=*/1);
+ for (const Hit &hit : hits_in) {
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit));
+ }
+ // ----------------------
+ // 23 delta(Hit #0)
+ // 22 delta(Hit #1)
+ // 21-18 Hit #2
+ // 17-12 <unused>
+ // 11-6 kSpecialHit
+ // 5-0 Offset=18
+ // ----------------------
+ int bytes_used = 6;
+
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(bytes_used));
+ std::deque<Hit> hits_pushed(hits_in.rbegin(), hits_in.rend());
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
+
+ // Get the min size to fit for the hits in pl_used. Moving the hits into a
+ // posting list of this min size should make it ALMOST_FULL; from the layout
+ // below, that size is 14, which is larger than the min posting list size.
+ // ----------------------
+ // 13 delta(Hit #0)
+ // 12 delta(Hit #1)
+ // 11-6 Hit #2
+ // 5-0 kSpecialHit
+ // ----------------------
+ int expected_min_size = 14;
+
+ EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used),
+ Gt(serializer.GetMinPostingListSize()));
+ EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used),
+ Eq(expected_min_size));
+}
+
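+ // Editorial arithmetic for the test above, under the same sizeof(Hit) == 6
+ // assumption: the ALMOST_FULL form needs 2 deltas + one 6-byte hit slot +
+ // one 6-byte special-hit slot = 14 bytes, while GetMinPostingListSize() is
+ // two special-hit slots = 12 bytes, hence the Gt() expectation.
+ static_assert(2 * 1 + 6 + 6 == 14, "min size to fit for 3 one-byte-delta hits");
+ static_assert(2 * 6 == 12, "min posting list size: two special-hit slots");
+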
+TEST(PostingListHitSerializerTest,
+ GetMinPostingListToFitForAlmostFullAndFullPLReturnsSameSize) {
+ PostingListHitSerializer serializer;
+
+ // Size = 24
+ int pl_size = 2 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
+ // Create and add some hits to make pl_used ALMOST_FULL
+ std::vector<Hit> hits_in =
+ CreateHits(/*num_hits=*/7, /*desired_byte_length=*/2);
+ for (const Hit &hit : hits_in) {
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit));
+ }
+ // ----------------------
+ // 23-22 delta(Hit #0)
+ // 21-20 delta(Hit #1)
+ // 19-18 delta(Hit #2)
+ // 17-16 delta(Hit #3)
+ // 15-14 delta(Hit #4)
+ // 13-12 delta(Hit #5)
+ // 11-6 Hit #6
+ // 5-0 kSpecialHit
+ // ----------------------
+ int bytes_used = 18;
+
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(bytes_used));
+ std::deque<Hit> hits_pushed(hits_in.rbegin(), hits_in.rend());
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
+
+ // GetMinPostingListSizeToFit should return the same size as pl_used.
+ uint32_t min_size_to_fit = serializer.GetMinPostingListSizeToFit(&pl_used);
+ EXPECT_THAT(min_size_to_fit, Eq(pl_size));
+
+ // Add another hit to make the posting list FULL
+ Hit hit = CreateHit(hits_in.back(), /*desired_byte_length=*/1);
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(pl_size));
+ hits_pushed.emplace_front(hit);
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
+
+ // GetMinPostingListSizeToFit should still be the same size as pl_used.
+ min_size_to_fit = serializer.GetMinPostingListSizeToFit(&pl_used);
+ EXPECT_THAT(min_size_to_fit, Eq(pl_size));
}
TEST(PostingListHitSerializerTest, MoveFrom) {
PostingListHitSerializer serializer;
- int size = 3 * serializer.GetMinPostingListSize();
+ int pl_size = 3 * serializer.GetMinPostingListSize();
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used1,
- PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
std::vector<Hit> hits1 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
for (const Hit &hit : hits1) {
@@ -540,7 +980,7 @@ TEST(PostingListHitSerializerTest, MoveFrom) {
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used2,
- PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
std::vector<Hit> hits2 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2);
for (const Hit &hit : hits2) {
@@ -556,10 +996,10 @@ TEST(PostingListHitSerializerTest, MoveFrom) {
TEST(PostingListHitSerializerTest, MoveFromNullArgumentReturnsInvalidArgument) {
PostingListHitSerializer serializer;
- int size = 3 * serializer.GetMinPostingListSize();
+ int pl_size = 3 * serializer.GetMinPostingListSize();
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used1,
- PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
std::vector<Hit> hits = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
for (const Hit &hit : hits) {
ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit));
@@ -575,10 +1015,10 @@ TEST(PostingListHitSerializerTest,
MoveFromInvalidPostingListReturnsInvalidArgument) {
PostingListHitSerializer serializer;
- int size = 3 * serializer.GetMinPostingListSize();
+ int pl_size = 3 * serializer.GetMinPostingListSize();
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used1,
- PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
std::vector<Hit> hits1 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
for (const Hit &hit : hits1) {
@@ -587,7 +1027,7 @@ TEST(PostingListHitSerializerTest,
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used2,
- PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
std::vector<Hit> hits2 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2);
for (const Hit &hit : hits2) {
@@ -595,7 +1035,7 @@ TEST(PostingListHitSerializerTest,
}
// Write invalid hits to the beginning of pl_used1 to make it invalid.
- Hit invalid_hit;
+ Hit invalid_hit(Hit::kInvalidValue);
Hit *first_hit = reinterpret_cast<Hit *>(pl_used1.posting_list_buffer());
*first_hit = invalid_hit;
++first_hit;
@@ -610,10 +1050,10 @@ TEST(PostingListHitSerializerTest,
MoveToInvalidPostingListReturnsFailedPrecondition) {
PostingListHitSerializer serializer;
- int size = 3 * serializer.GetMinPostingListSize();
+ int pl_size = 3 * serializer.GetMinPostingListSize();
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used1,
- PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
std::vector<Hit> hits1 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
for (const Hit &hit : hits1) {
@@ -622,7 +1062,7 @@ TEST(PostingListHitSerializerTest,
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used2,
- PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
std::vector<Hit> hits2 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2);
for (const Hit &hit : hits2) {
@@ -630,7 +1070,7 @@ TEST(PostingListHitSerializerTest,
}
// Write invalid hits to the beginning of pl_used2 to make it invalid.
- Hit invalid_hit;
+ Hit invalid_hit(Hit::kInvalidValue);
Hit *first_hit = reinterpret_cast<Hit *>(pl_used2.posting_list_buffer());
*first_hit = invalid_hit;
++first_hit;
@@ -644,10 +1084,10 @@ TEST(PostingListHitSerializerTest,
TEST(PostingListHitSerializerTest, MoveToPostingListTooSmall) {
PostingListHitSerializer serializer;
- int size = 3 * serializer.GetMinPostingListSize();
+ int pl_size = 3 * serializer.GetMinPostingListSize();
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used1,
- PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
std::vector<Hit> hits1 =
CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
for (const Hit &hit : hits1) {
@@ -672,30 +1112,36 @@ TEST(PostingListHitSerializerTest, MoveToPostingListTooSmall) {
IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
}
-TEST(PostingListHitSerializerTest, PopHitsWithScores) {
+TEST(PostingListHitSerializerTest, PopHitsWithTermFrequenciesAndFlags) {
PostingListHitSerializer serializer;
- int size = 2 * serializer.GetMinPostingListSize();
+ // Size = 24
+ int pl_size = 2 * serializer.GetMinPostingListSize();
ICING_ASSERT_OK_AND_ASSIGN(
PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
- // This posting list is 20-bytes. Create four hits that will have deltas of
- // two bytes each and all of whom will have a non-default score. This posting
- // list will be almost_full.
+ // This posting list is 24 bytes. Create four hits that will have deltas of
+ // two bytes each, all of which will have a non-default term frequency. This
+ // posting list will be almost_full.
//
// ----------------------
- // 19 score(Hit #0)
- // 18-17 delta(Hit #0)
- // 16 score(Hit #1)
- // 15-14 delta(Hit #1)
- // 13 score(Hit #2)
- // 12-11 delta(Hit #2)
- // 10 <unused>
- // 9-5 Hit #3
- // 4-0 kInvalidHitVal
+ // 23 term-frequency(Hit #0)
+ // 22 flags(Hit #0)
+ // 21-20 delta(Hit #0)
+ // 19 term-frequency(Hit #1)
+ // 18 flags(Hit #1)
+ // 17-16 delta(Hit #1)
+ // 15 term-frequency(Hit #2)
+ // 14 flags(Hit #2)
+ // 13-12 delta(Hit #2)
+ // 11-6 Hit #3
+ // 5-0 kInvalidHit
// ----------------------
- Hit hit0(/*section_id=*/0, /*document_id=*/0, /*score=*/5);
+ int bytes_used = 18;
+
+ Hit hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/5,
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/2);
Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2);
Hit hit3 = CreateHit(hit2, /*desired_byte_length=*/2);
@@ -707,22 +1153,26 @@ TEST(PostingListHitSerializerTest, PopHitsWithScores) {
ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> hits_out,
serializer.GetHits(&pl_used));
EXPECT_THAT(hits_out, ElementsAre(hit3, hit2, hit1, hit0));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(bytes_used));
// Now, pop the last hit. The posting list should contain the first three
// hits.
//
// ----------------------
- // 19 score(Hit #0)
- // 18-17 delta(Hit #0)
- // 16 score(Hit #1)
- // 15-14 delta(Hit #1)
- // 13-10 <unused>
- // 9-5 Hit #2
- // 4-0 kInvalidHitVal
+ // 23 term-frequency(Hit #0)
+ // 22 flags(Hit #0)
+ // 21-20 delta(Hit #0)
+ // 19 term-frequency(Hit #1)
+ // 18 flags(Hit #1)
+ // 17-16 delta(Hit #1)
+ // 15-12 <unused>
+ // 11-6 Hit #2
+ // 5-0 kInvalidHit
// ----------------------
ICING_ASSERT_OK(serializer.PopFrontHits(&pl_used, 1));
ICING_ASSERT_OK_AND_ASSIGN(hits_out, serializer.GetHits(&pl_used));
EXPECT_THAT(hits_out, ElementsAre(hit2, hit1, hit0));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(bytes_used));
}
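+ // Editorial note (hedged reading of the two Eq(bytes_used) checks above): in
+ // the ALMOST_FULL state, GetBytesUsed reports everything past the single
+ // remaining special-hit slot, so the count stays at pl_size - 6 = 18 even
+ // though bytes 15-12 sit unused after the pop.
+ static_assert(24 - 6 == 18, "ALMOST_FULL bytes used = pl_size - one slot");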
} // namespace
diff --git a/icing/query/query-processor.cc b/icing/query/query-processor.cc
index 74e51cf..bc0917b 100644
--- a/icing/query/query-processor.cc
+++ b/icing/query/query-processor.cc
@@ -172,8 +172,10 @@ libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseSearch(
search_spec.enabled_features().end());
for (const Feature feature : results.features_in_use) {
if (enabled_features.find(feature) == enabled_features.end()) {
- return absl_ports::InvalidArgumentError(
- absl_ports::StrCat("Attempted use of unenabled feature ", feature));
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "Attempted use of unenabled feature ", feature,
+ ". Please make sure that you have explicitly set all advanced query "
+ "features used in this query as enabled in the SearchSpec."));
}
}
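A hedged caller-side sketch of what the expanded error message asks for: every
advanced-query feature a query relies on must be listed in the SearchSpec
before the search is issued. The feature-name string and helper below are
assumptions for illustration; real callers should use the feature constants
Icing exports rather than raw strings.

    #include <string>
    #include <utility>

    #include "icing/proto/search.pb.h"

    // Hypothetical helper: builds a spec whose query uses advanced syntax
    // and explicitly enables the feature it depends on.
    icing::lib::SearchSpecProto BuildAdvancedQuerySpec(std::string query) {
      icing::lib::SearchSpecProto search_spec;
      search_spec.set_query(std::move(query));
      // Feature name assumed for illustration only.
      search_spec.add_enabled_features("LIST_FILTER_QUERY_LANGUAGE");
      return search_spec;
    }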
diff --git a/icing/testing/hit-test-utils.cc b/icing/testing/hit-test-utils.cc
index 7ad8a64..2fd3ac8 100644
--- a/icing/testing/hit-test-utils.cc
+++ b/icing/testing/hit-test-utils.cc
@@ -14,29 +14,44 @@
#include "icing/testing/hit-test-utils.h"
+#include <cstdint>
+#include <vector>
+
+#include "icing/index/hit/hit.h"
+#include "icing/index/main/posting-list-hit-serializer.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
namespace icing {
namespace lib {
// Returns a hit that has a delta of desired_byte_length from last_hit.
-Hit CreateHit(Hit last_hit, int desired_byte_length) {
- Hit hit = (last_hit.section_id() == kMinSectionId)
- ? Hit(kMaxSectionId, last_hit.document_id() + 1,
- last_hit.term_frequency())
- : Hit(last_hit.section_id() - 1, last_hit.document_id(),
- last_hit.term_frequency());
+Hit CreateHit(const Hit& last_hit, int desired_byte_length) {
+ return CreateHit(last_hit, desired_byte_length, last_hit.term_frequency(),
+ /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
+}
+
+// Returns a hit that has a delta of desired_byte_length from last_hit, with
+// the desired term_frequency and flags.
+Hit CreateHit(const Hit& last_hit, int desired_byte_length,
+ Hit::TermFrequency term_frequency, bool is_in_prefix_section,
+ bool is_prefix_hit) {
+ Hit hit = last_hit;
uint8_t buf[5];
- while (VarInt::Encode(last_hit.value() - hit.value(), buf) <
- desired_byte_length) {
+ do {
hit = (hit.section_id() == kMinSectionId)
- ? Hit(kMaxSectionId, hit.document_id() + 1, hit.term_frequency())
- : Hit(hit.section_id() - 1, hit.document_id(),
- hit.term_frequency());
- }
+ ? Hit(kMaxSectionId, hit.document_id() + 1, term_frequency,
+ is_in_prefix_section, is_prefix_hit)
+ : Hit(hit.section_id() - 1, hit.document_id(), term_frequency,
+ is_in_prefix_section, is_prefix_hit);
+ } while (PostingListHitSerializer::EncodeNextHitValue(
+ /*next_hit_value=*/hit.value(),
+ /*curr_hit_value=*/last_hit.value(), buf) < desired_byte_length);
return hit;
}
// Returns a vector of num_hits Hits with the first hit starting at start_docid
-// and with 1-byte deltas.
+// and with deltas of the desired byte length.
std::vector<Hit> CreateHits(DocumentId start_docid, int num_hits,
int desired_byte_length) {
std::vector<Hit> hits;
@@ -44,13 +59,30 @@ std::vector<Hit> CreateHits(DocumentId start_docid, int num_hits,
return hits;
}
hits.push_back(Hit(/*section_id=*/1, /*document_id=*/start_docid,
- Hit::kDefaultTermFrequency));
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false));
while (hits.size() < num_hits) {
hits.push_back(CreateHit(hits.back(), desired_byte_length));
}
return hits;
}
+// Returns a vector of num_hits Hits with the first hit being the desired byte
+// length from last_hit, and with deltas of the same desired byte length.
+std::vector<Hit> CreateHits(const Hit& last_hit, int num_hits,
+ int desired_byte_length) {
+ std::vector<Hit> hits;
+ if (num_hits < 1) {
+ return hits;
+ }
+ hits.reserve(num_hits);
+ for (int i = 0; i < num_hits; ++i) {
+ hits.push_back(
+ CreateHit(hits.empty() ? last_hit : hits.back(), desired_byte_length));
+ }
+ return hits;
+}
+
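+// Hedged usage sketch for the new overload (hypothetical snippet, not part of
+// this change): extend an existing run of hits from its last element with
+// wider deltas.
+//
+//   std::vector<Hit> first = CreateHits(/*num_hits=*/3, /*desired_byte_length=*/1);
+//   std::vector<Hit> more =
+//       CreateHits(first.back(), /*num_hits=*/2, /*desired_byte_length=*/2);
+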
std::vector<Hit> CreateHits(int num_hits, int desired_byte_length) {
return CreateHits(/*start_docid=*/0, num_hits, desired_byte_length);
}
diff --git a/icing/testing/hit-test-utils.h b/icing/testing/hit-test-utils.h
index e236ec0..2953c5c 100644
--- a/icing/testing/hit-test-utils.h
+++ b/icing/testing/hit-test-utils.h
@@ -18,21 +18,30 @@
#include <vector>
#include "icing/index/hit/hit.h"
-#include "icing/legacy/index/icing-bit-util.h"
-#include "icing/schema/section.h"
#include "icing/store/document-id.h"
namespace icing {
namespace lib {
// Returns a hit that has a delta of desired_byte_length from last_hit.
-Hit CreateHit(Hit last_hit, int desired_byte_length);
+Hit CreateHit(const Hit& last_hit, int desired_byte_length);
+
+// Returns a hit that has a delta of desired_byte_length from last_hit, with
+// the desired term_frequency and flags.
+Hit CreateHit(const Hit& last_hit, int desired_byte_length,
+ Hit::TermFrequency term_frequency, bool is_in_prefix_section,
+ bool is_prefix_hit);
// Returns a vector of num_hits Hits with the first hit starting at start_docid
// and with desired_byte_length deltas.
std::vector<Hit> CreateHits(DocumentId start_docid, int num_hits,
int desired_byte_length);
+// Returns a vector of num_hits Hits with the first hit being the desired byte
+// length from last_hit, and with deltas of the same desired byte length.
+std::vector<Hit> CreateHits(const Hit& last_hit, int num_hits,
+ int desired_byte_length);
+
// Returns a vector of num_hits Hits with the first hit starting at 0 and each
// with desired_byte_length deltas.
std::vector<Hit> CreateHits(int num_hits, int desired_byte_length);
diff --git a/java/src/com/google/android/icing/IcingSearchEngineInterface.java b/java/src/com/google/android/icing/IcingSearchEngineInterface.java
index 0bc58f1..67f60ed 100644
--- a/java/src/com/google/android/icing/IcingSearchEngineInterface.java
+++ b/java/src/com/google/android/icing/IcingSearchEngineInterface.java
@@ -32,7 +32,7 @@ import com.google.android.icing.proto.SuggestionSpecProto;
import com.google.android.icing.proto.UsageReport;
import java.io.Closeable;
-/** A common user-facing interface to expose the funcationalities provided by Icing Library. */
+/** A common user-facing interface to expose the functionalities provided by Icing Library. */
public interface IcingSearchEngineInterface extends Closeable {
/**
* Initializes the current IcingSearchEngine implementation.
diff --git a/proto/icing/proto/schema.proto b/proto/icing/proto/schema.proto
index c716dba..78e1588 100644
--- a/proto/icing/proto/schema.proto
+++ b/proto/icing/proto/schema.proto
@@ -34,7 +34,7 @@ option objc_class_prefix = "ICNG";
// TODO(cassiewang) Define a sample proto file that can be used by tests and for
// documentation.
//
-// Next tag: 7
+// Next tag: 8
message SchemaTypeConfigProto {
// REQUIRED: Named type that uniquely identifies the structured, logical
// schema being defined.
@@ -43,6 +43,13 @@ message SchemaTypeConfigProto {
// in http://schema.org. Eg: DigitalDocument, Message, Person, etc.
optional string schema_type = 1;
+ // OPTIONAL: A natural language description of the SchemaTypeConfigProto.
+ //
+ // This string is not used by Icing in any way. It simply exists to allow
+ // users to store semantic information about the SchemaTypeConfigProto for
+ // future retrieval.
+ optional string description = 7;
+
// List of all properties that are supported by Documents of this type.
// A Document should never have properties that are not listed here.
//
@@ -208,7 +215,7 @@ message JoinableConfig {
// Describes the schema of a single property of Documents that belong to a
// specific SchemaTypeConfigProto. These can be considered as a rich, structured
// type for each property of Documents accepted by IcingSearchEngine.
-// Next tag: 9
+// Next tag: 10
message PropertyConfigProto {
// REQUIRED: Name that uniquely identifies a property within a Document of
// a specific SchemaTypeConfigProto.
@@ -219,6 +226,13 @@ message PropertyConfigProto {
// Eg: 'address' for http://schema.org/Place.
optional string property_name = 1;
+ // OPTIONAL: A natural language description of the property.
+ //
+ // This string is not used by Icing in any way. It simply exists to allow
+ // users to store semantic information about the PropertyConfigProto for
+ // future retrieval.
+ optional string description = 9;
+
// REQUIRED: Physical data-types of the contents of the property.
message DataType {
enum Code {
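A hedged C++ sketch of how a client might populate the two new description
fields when building a schema. Setter and adder names follow standard proto
codegen for the messages shown here; the properties field is assumed from the
surrounding SchemaTypeConfigProto definition.

    #include "icing/proto/schema.pb.h"

    icing::lib::SchemaTypeConfigProto MakeMessageType() {
      icing::lib::SchemaTypeConfigProto type_config;
      type_config.set_schema_type("Message");
      type_config.set_description("A message sent between two users.");
      icing::lib::PropertyConfigProto* body = type_config.add_properties();
      body->set_property_name("body");
      body->set_description("The text content of the message.");
      return type_config;
    }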
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index a36a007..c22a1e8 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=596096845)
+set(synced_AOSP_CL_number=613977851)