aboutsummaryrefslogtreecommitdiff
path: root/icing/index
diff options
context:
space:
mode:
Diffstat (limited to 'icing/index')
-rw-r--r--icing/index/index-processor.cc20
-rw-r--r--icing/index/index-processor.h2
-rw-r--r--icing/index/index-processor_benchmark.cc2
-rw-r--r--icing/index/index-processor_test.cc131
-rw-r--r--icing/index/index.cc61
-rw-r--r--icing/index/index.h27
-rw-r--r--icing/index/index_test.cc391
-rw-r--r--icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc10
-rw-r--r--icing/index/lite/doc-hit-info-iterator-term-lite.cc5
-rw-r--r--icing/index/lite/lite-index.cc47
-rw-r--r--icing/index/lite/lite-index.h12
-rw-r--r--icing/index/lite/lite-index_test.cc110
-rw-r--r--icing/index/main/flash-index-storage.h1
-rw-r--r--icing/index/main/main-index.cc86
-rw-r--r--icing/index/main/main-index.h17
-rw-r--r--icing/index/main/main-index_test.cc28
16 files changed, 575 insertions, 375 deletions
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc
index 1aae732..207c033 100644
--- a/icing/index/index-processor.cc
+++ b/icing/index/index-processor.cc
@@ -73,9 +73,23 @@ libtextclassifier3::Status IndexProcessor::IndexDocument(
section.metadata.term_match_type, /*namespace_id=*/0);
for (std::string_view token : section.token_sequence) {
++num_tokens;
- std::string term = normalizer_.NormalizeTerm(token);
- // Add this term to Hit buffer.
- status = editor.BufferTerm(term.c_str());
+
+ switch (section.metadata.tokenizer) {
+ case StringIndexingConfig::TokenizerType::VERBATIM:
+ // data() is safe to use here because a token created from the
+ // VERBATIM tokenizer is the entire string value. The character at
+ // data() + token.length() is guaranteed to be a null char.
+ status = editor.BufferTerm(token.data());
+ break;
+ case StringIndexingConfig::TokenizerType::NONE:
+ ICING_LOG(WARNING)
+ << "Unexpected TokenizerType::NONE found when indexing document.";
+ [[fallthrough]];
+ case StringIndexingConfig::TokenizerType::PLAIN:
+ std::string normalized_term = normalizer_.NormalizeTerm(token);
+ status = editor.BufferTerm(normalized_term.c_str());
+ }
+
if (!status.ok()) {
// We've encountered a failure. Bail out. We'll mark this doc as deleted
// and signal a failure to the client.
diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h
index c4b77b5..269e41c 100644
--- a/icing/index/index-processor.h
+++ b/icing/index/index-processor.h
@@ -69,8 +69,6 @@ class IndexProcessor {
IndexProcessor(const Normalizer* normalizer, Index* index, const Clock* clock)
: normalizer_(*normalizer), index_(index), clock_(*clock) {}
- std::string NormalizeToken(const Token& token);
-
const Normalizer& normalizer_;
Index* const index_;
const Clock& clock_;
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 6e072c7..1aad7d0 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -16,7 +16,6 @@
#include "gmock/gmock.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/index-processor.h"
#include "icing/index/index.h"
#include "icing/legacy/core/icing-string-util.h"
@@ -24,6 +23,7 @@
#include "icing/schema/schema-util.h"
#include "icing/schema/section-manager.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index 449bc3e..bd310de 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -30,7 +30,6 @@
#include "icing/absl_ports/str_join.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
@@ -49,6 +48,7 @@
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/random-string.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -90,6 +90,8 @@ constexpr std::string_view kRepeatedProperty = "repeated";
constexpr std::string_view kSubProperty = "submessage";
constexpr std::string_view kNestedType = "NestedType";
constexpr std::string_view kNestedProperty = "nested";
+constexpr std::string_view kExactVerbatimProperty = "verbatimExact";
+constexpr std::string_view kPrefixedVerbatimProperty = "verbatimPrefixed";
constexpr DocumentId kDocumentId0 = 0;
constexpr DocumentId kDocumentId1 = 1;
@@ -98,6 +100,8 @@ constexpr SectionId kExactSectionId = 0;
constexpr SectionId kPrefixedSectionId = 1;
constexpr SectionId kRepeatedSectionId = 2;
constexpr SectionId kNestedSectionId = 3;
+constexpr SectionId kExactVerbatimSectionId = 4;
+constexpr SectionId kPrefixedVerbatimSectionId = 5;
using Cardinality = PropertyConfigProto::Cardinality;
using DataType = PropertyConfigProto::DataType;
@@ -106,21 +110,23 @@ using ::testing::Eq;
using ::testing::IsEmpty;
using ::testing::Test;
-constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
- PropertyConfigProto_DataType_Code_STRING;
-constexpr PropertyConfigProto_DataType_Code TYPE_BYTES =
- PropertyConfigProto_DataType_Code_BYTES;
+constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
+ PropertyConfigProto::DataType::STRING;
+constexpr PropertyConfigProto::DataType::Code TYPE_BYTES =
+ PropertyConfigProto::DataType::BYTES;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
- PropertyConfigProto_Cardinality_Code_OPTIONAL;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
- PropertyConfigProto_Cardinality_Code_REPEATED;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
+ PropertyConfigProto::Cardinality::REPEATED;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
- StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM =
+ StringIndexingConfig::TokenizerType::VERBATIM;
-constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
-constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
+constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
class IndexProcessorTest : public Test {
protected:
@@ -180,6 +186,16 @@ class IndexProcessorTest : public Test {
.SetCardinality(CARDINALITY_REPEATED))
.AddProperty(
PropertyConfigBuilder()
+ .SetName(kExactVerbatimProperty)
+ .SetDataTypeString(MATCH_EXACT, TOKENIZER_VERBATIM)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPrefixedVerbatimProperty)
+ .SetDataTypeString(MATCH_PREFIX, TOKENIZER_VERBATIM)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(
+ PropertyConfigBuilder()
.SetName(kSubProperty)
.SetDataTypeDocument(
kNestedType, /*index_nested_properties=*/true)
@@ -797,6 +813,95 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) {
EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
}
+TEST_F(IndexProcessorTest, ExactVerbatimProperty) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactVerbatimProperty),
+ "Hello, world!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_tokens(), 1);
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("Hello, world!", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ std::vector<DocHitInfo> hits = GetHits(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{
+ {kExactVerbatimSectionId, 1}};
+
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expectedMap)));
+}
+
+TEST_F(IndexProcessorTest, PrefixVerbatimProperty) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPrefixedVerbatimProperty),
+ "Hello, world!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_tokens(), 1);
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ // We expect to match the document we indexed as "Hello, w" is a prefix
+ // of "Hello, world!"
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("Hello, w", kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ std::vector<DocHitInfo> hits = GetHits(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{
+ {kPrefixedVerbatimSectionId, 1}};
+
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expectedMap)));
+}
+
+TEST_F(IndexProcessorTest, VerbatimPropertyDoesntMatchSubToken) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPrefixedVerbatimProperty),
+ "Hello, world!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_tokens(), 1);
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("world", kSectionIdMaskAll, TermMatchType::PREFIX));
+ std::vector<DocHitInfo> hits = GetHits(std::move(itr));
+
+ // We should not have hits for term "world" as the index processor should
+ // create a sole token "Hello, world!" for the document.
+ EXPECT_THAT(hits, IsEmpty());
+}
+
} // namespace
} // namespace lib
diff --git a/icing/index/index.cc b/icing/index/index.cc
index 1bdab21..02ba699 100644
--- a/icing/index/index.cc
+++ b/icing/index/index.cc
@@ -71,24 +71,6 @@ IcingDynamicTrie::Options GetMainLexiconOptions() {
return IcingDynamicTrie::Options();
}
-// Helper function to check if a term is in the given namespaces.
-// TODO(tjbarron): Implement a method PropertyReadersAll.HasAnyProperty().
-bool IsTermInNamespaces(
- const IcingDynamicTrie::PropertyReadersAll& property_reader,
- uint32_t value_index, const std::vector<NamespaceId>& namespace_ids) {
- if (namespace_ids.empty()) {
- return true;
- }
- for (NamespaceId namespace_id : namespace_ids) {
- if (property_reader.HasProperty(GetNamespacePropertyId(namespace_id),
- value_index)) {
- return true;
- }
- }
-
- return false;
-}
-
enum class MergeAction { kTakeLiteTerm, kTakeMainTerm, kMergeTerms };
// Merge the TermMetadata from lite index and main index. If the term exists in
@@ -137,7 +119,7 @@ std::vector<TermMetadata> MergeAndRankTermMetadatas(
int total_est_hit_count =
lite_term_itr->hit_count + main_term_itr->hit_count;
PushToTermHeap(TermMetadata(std::move(lite_term_itr->content),
- total_est_hit_count),
+ total_est_hit_count),
num_to_return, merged_term_metadata_heap);
++lite_term_itr;
++main_term_itr;
@@ -228,32 +210,26 @@ Index::GetIterator(const std::string& term, SectionIdMask section_id_mask,
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
Index::FindLiteTermsByPrefix(const std::string& prefix,
- const std::vector<NamespaceId>& namespace_ids) {
+ const NamespaceChecker* namespace_checker) {
// Finds all the terms that start with the given prefix in the lexicon.
IcingDynamicTrie::Iterator term_iterator(lite_index_->lexicon(),
prefix.c_str());
- // A property reader to help check if a term has some property.
- IcingDynamicTrie::PropertyReadersAll property_reader(lite_index_->lexicon());
-
std::vector<TermMetadata> term_metadata_list;
while (term_iterator.IsValid()) {
uint32_t term_value_index = term_iterator.GetValueIndex();
- // Skips the terms that don't exist in the given namespaces. We won't skip
- // any terms if namespace_ids is empty.
- if (!IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) {
- term_iterator.Advance();
- continue;
- }
-
ICING_ASSIGN_OR_RETURN(
uint32_t term_id,
term_id_codec_->EncodeTvi(term_value_index, TviType::LITE),
absl_ports::InternalError("Failed to access terms in lexicon."));
-
- term_metadata_list.emplace_back(term_iterator.GetKey(),
- lite_index_->CountHits(term_id));
+ ICING_ASSIGN_OR_RETURN(int hit_count,
+ lite_index_->CountHits(term_id, namespace_checker));
+ if (hit_count > 0) {
+ // At least one document in the given namespace has this term.
+ term_metadata_list.push_back(
+ TermMetadata(term_iterator.GetKey(), hit_count));
+ }
term_iterator.Advance();
}
@@ -261,21 +237,20 @@ Index::FindLiteTermsByPrefix(const std::string& prefix,
}
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
-Index::FindTermsByPrefix(const std::string& prefix,
- const std::vector<NamespaceId>& namespace_ids,
- int num_to_return) {
+Index::FindTermsByPrefix(const std::string& prefix, int num_to_return,
+ TermMatchType::Code term_match_type,
+ const NamespaceChecker* namespace_checker) {
std::vector<TermMetadata> term_metadata_list;
if (num_to_return <= 0) {
return term_metadata_list;
}
-
// Get results from the LiteIndex.
ICING_ASSIGN_OR_RETURN(std::vector<TermMetadata> lite_term_metadata_list,
- FindLiteTermsByPrefix(prefix, namespace_ids));
+ FindLiteTermsByPrefix(prefix, namespace_checker));
// Append results from the MainIndex.
ICING_ASSIGN_OR_RETURN(std::vector<TermMetadata> main_term_metadata_list,
- main_index_->FindTermsByPrefix(prefix, namespace_ids));
-
+ main_index_->FindTermsByPrefix(prefix, term_match_type,
+ namespace_checker));
return MergeAndRankTermMetadatas(std::move(lite_term_metadata_list),
std::move(main_term_metadata_list),
num_to_return);
@@ -284,11 +259,7 @@ Index::FindTermsByPrefix(const std::string& prefix,
IndexStorageInfoProto Index::GetStorageInfo() const {
IndexStorageInfoProto storage_info;
int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str());
- if (directory_size != Filesystem::kBadFileSize) {
- storage_info.set_index_size(directory_size);
- } else {
- storage_info.set_index_size(-1);
- }
+ storage_info.set_index_size(Filesystem::SanitizeFileSize(directory_size));
storage_info = lite_index_->GetStorageInfo(std::move(storage_info));
return main_index_->GetStorageInfo(std::move(storage_info));
}
diff --git a/icing/index/index.h b/icing/index/index.h
index 693cf04..5c53349 100644
--- a/icing/index/index.h
+++ b/icing/index/index.h
@@ -32,10 +32,12 @@
#include "icing/index/term-id-codec.h"
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
+#include "icing/store/namespace-checker.h"
#include "icing/store/namespace-id.h"
#include "icing/util/crc32.h"
@@ -142,9 +144,14 @@ class Index {
// index.
// verbosity > 0, more detailed debug information including raw postings
// lists.
- void GetDebugInfo(int verbosity, std::string* out) const {
- lite_index_->GetDebugInfo(verbosity, out);
- main_index_->GetDebugInfo(verbosity, out);
+ IndexDebugInfoProto GetDebugInfo(int verbosity) const {
+ IndexDebugInfoProto debug_info;
+ *debug_info.mutable_index_storage_info() = GetStorageInfo();
+ *debug_info.mutable_lite_index_info() =
+ lite_index_->GetDebugInfo(verbosity);
+ *debug_info.mutable_main_index_info() =
+ main_index_->GetDebugInfo(verbosity);
+ return debug_info;
}
// Returns the byte size of the all the elements held in the index. This
@@ -181,17 +188,17 @@ class Index {
TermMatchType::Code term_match_type);
// Finds terms with the given prefix in the given namespaces. If
- // 'namespace_ids' is empty, returns results from all the namespaces. The
- // input prefix must be normalized, otherwise inaccurate results may be
- // returned. Results are not sorted specifically and are in their original
- // order. Number of results are no more than 'num_to_return'.
+ // 'namespace_ids' is empty, returns results from all the namespaces. Results
+ // are sorted in decreasing order of hit count. Number of results are no more
+ // than 'num_to_return'.
//
// Returns:
// A list of TermMetadata on success
// INTERNAL_ERROR if failed to access term data.
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
- const std::string& prefix, const std::vector<NamespaceId>& namespace_ids,
- int num_to_return);
+ const std::string& prefix, int num_to_return,
+ TermMatchType::Code term_match_type,
+ const NamespaceChecker* namespace_checker);
// A class that can be used to add hits to the index.
//
@@ -267,7 +274,7 @@ class Index {
filesystem_(filesystem) {}
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindLiteTermsByPrefix(
- const std::string& prefix, const std::vector<NamespaceId>& namespace_ids);
+ const std::string& prefix, const NamespaceChecker* namespace_checker);
std::unique_ptr<LiteIndex> lite_index_;
std::unique_ptr<MainIndex> main_index_;
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index 00d5ad6..8355c01 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -31,10 +31,12 @@
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
+#include "icing/testing/always-true-namespace-checker-impl.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/random-string.h"
#include "icing/testing/tmp-directory.h"
@@ -89,22 +91,9 @@ constexpr DocumentId kDocumentId5 = 5;
constexpr DocumentId kDocumentId6 = 6;
constexpr DocumentId kDocumentId7 = 7;
constexpr DocumentId kDocumentId8 = 8;
-constexpr DocumentId kDocumentId9 = 9;
-constexpr DocumentId kDocumentId10 = 10;
-constexpr DocumentId kDocumentId11 = 11;
-constexpr DocumentId kDocumentId12 = 12;
constexpr SectionId kSectionId2 = 2;
constexpr SectionId kSectionId3 = 3;
-// The value returned by IndexBlock::ApproximateFullPostingListHitsForBlock(
-// GetBlockSize(),
-// GetPostingListIndexBits(posting_list_utils::min_posting_list_size()));
-constexpr int kMinSizePlApproxHits = 3;
-// The value returned by IndexBlock::ApproximateFullPostingListHitsForBlock(
-// GetBlockSize(),
-// GetPostingListIndexBits(2 * posting_list_utils::min_posting_list_size()));
-constexpr int kSecondSmallestPlApproxHits = 7;
-
std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
std::vector<DocHitInfo> infos;
while (iterator->Advance().ok()) {
@@ -920,148 +909,82 @@ TEST_F(IndexTest, InvalidHitBufferSize) {
TEST_F(IndexTest, FindTermByPrefixShouldReturnEmpty) {
Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
TermMatchType::PREFIX, /*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
- /*num_to_return=*/0),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*num_to_return=*/0,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(IsEmpty()));
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
- /*num_to_return=*/-1),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo",
+ /*num_to_return=*/-1,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(IsEmpty()));
ICING_ASSERT_OK(index_->Merge());
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
- /*num_to_return=*/0),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo",
+ /*num_to_return=*/0,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(IsEmpty()));
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
- /*num_to_return=*/-1),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo",
+ /*num_to_return=*/-1,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(IsEmpty()));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectResult) {
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
// "b" should only match "bar" but not "foo".
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1))));
ICING_ASSERT_OK(index_->Merge());
// "b" should only match "bar" but not "foo".
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("bar", kMinSizePlApproxHits))));
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1))));
}
TEST_F(IndexTest, FindTermByPrefixShouldRespectNumToReturn) {
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("fo"), IsOk());
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
// We have 3 results but only 2 should be returned.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/2),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/2,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(SizeIs(2)));
ICING_ASSERT_OK(index_->Merge());
// We have 3 results but only 2 should be returned.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/2),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/2,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(SizeIs(2)));
}
-TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInOneNamespace) {
- Index::Editor edit1 =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit1.BufferTerm("fo"), IsOk());
- EXPECT_THAT(edit1.BufferTerm("foo"), IsOk());
- EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
-
- Index::Editor edit2 =
- index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/1);
- EXPECT_THAT(edit2.BufferTerm("fool"), IsOk());
- EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
-
- // namespace with id 0 has 2 results.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1),
- EqualsTermMetadata("foo", 1))));
- // namespace with id 1 has 1 result.
- EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fool", 1))));
-
- ICING_ASSERT_OK(index_->Merge());
-
- // namespace with id 0 has 2 results.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("fo", kMinSizePlApproxHits),
- EqualsTermMetadata("foo", kMinSizePlApproxHits))));
- // namespace with id 1 has 1 result.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("fool", kMinSizePlApproxHits))));
-}
-
-TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) {
- Index::Editor edit1 =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit1.BufferTerm("fo"), IsOk());
- EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
-
- Index::Editor edit2 =
- index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/1);
- EXPECT_THAT(edit2.BufferTerm("foo"), IsOk());
- EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
-
- Index::Editor edit3 =
- index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/2);
- EXPECT_THAT(edit3.BufferTerm("fool"), IsOk());
- EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk());
-
- // Should return "foo" and "fool" which are in namespaces with ids 1 and 2.
- EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
- EqualsTermMetadata("fool", 1))));
-
- ICING_ASSERT_OK(index_->Merge());
-
- EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("foo", kMinSizePlApproxHits),
- EqualsTermMetadata("fool", kMinSizePlApproxHits))));
-}
-
TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) {
Index::Editor edit1 =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit1.BufferTerm("fo"), IsOk());
EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
@@ -1078,8 +1001,9 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) {
EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk());
// Should return "fo", "foo" and "fool" across all namespaces.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{},
- /*num_to_return=*/10),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(UnorderedElementsAre(
EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1),
EqualsTermMetadata("fool", 1))));
@@ -1087,18 +1011,19 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) {
ICING_ASSERT_OK(index_->Merge());
// Should return "fo", "foo" and "fool" across all namespaces.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{},
- /*num_to_return=*/10),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("fo", kMinSizePlApproxHits),
- EqualsTermMetadata("foo", kMinSizePlApproxHits),
- EqualsTermMetadata("fool", kMinSizePlApproxHits))));
+ EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) {
Index::Editor edit1 =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit1.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit1.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
@@ -1110,20 +1035,19 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) {
EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
// 'foo' has 1 hit, 'fool' has 2 hits.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2),
EqualsTermMetadata("foo", 1))));
ICING_ASSERT_OK(index_->Merge());
- // foo's one hit should fit on a min-sized pl, fool's two hits should also fit
- // on a min-sized pl.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("foo", kMinSizePlApproxHits),
- EqualsTermMetadata("fool", kMinSizePlApproxHits))));
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2),
+ EqualsTermMetadata("foo", 1))));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) {
@@ -1132,6 +1056,7 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) {
Index::Editor edit1 =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit1.BufferTerm("term-one"), IsOk());
EXPECT_THAT(edit1.BufferTerm("term-two"), IsOk());
EXPECT_THAT(edit1.BufferTerm("term-three"), IsOk());
@@ -1181,8 +1106,9 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) {
EXPECT_THAT(edit6.IndexAllBufferedTerms(), IsOk());
// verify the order in lite index is correct.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-six", 6),
EqualsTermMetadata("term-five", 5),
EqualsTermMetadata("term-four", 4),
@@ -1192,93 +1118,97 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) {
ICING_ASSERT_OK(index_->Merge());
- // Since most of term has same approx hit count, we don't verify order in the
- // main index.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("term-six", kSecondSmallestPlApproxHits),
- EqualsTermMetadata("term-five", kSecondSmallestPlApproxHits),
- EqualsTermMetadata("term-four", kMinSizePlApproxHits),
- EqualsTermMetadata("term-three", kMinSizePlApproxHits),
- EqualsTermMetadata("term-two", kMinSizePlApproxHits),
- EqualsTermMetadata("term-one", kMinSizePlApproxHits))));
-
- // keep push terms to the lite index. For term 1-4, since they has same hit
- // count kMinSizePlApproxHits, we will push 4 term-one, 3 term-two, 2
- // term-three and one term-four to make them in reverse order. And for term
- // 5 & 6, we will push 2 term-five and one term-six.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-six", 6),
+ EqualsTermMetadata("term-five", 5),
+ EqualsTermMetadata("term-four", 4),
+ EqualsTermMetadata("term-three", 3),
+ EqualsTermMetadata("term-two", 2),
+ EqualsTermMetadata("term-one", 1))));
+
+ // Keep pushing terms to the lite index. We will add 2 documents to term-five,
+ // term-three and term-one. The output order should be 5-6-3-4-1-2.
Index::Editor edit7 =
index_->Edit(kDocumentId7, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
EXPECT_THAT(edit7.BufferTerm("term-one"), IsOk());
- EXPECT_THAT(edit7.BufferTerm("term-two"), IsOk());
EXPECT_THAT(edit7.BufferTerm("term-three"), IsOk());
- EXPECT_THAT(edit7.BufferTerm("term-four"), IsOk());
+ EXPECT_THAT(edit7.BufferTerm("term-five"), IsOk());
EXPECT_THAT(edit7.IndexAllBufferedTerms(), IsOk());
Index::Editor edit8 =
index_->Edit(kDocumentId8, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
EXPECT_THAT(edit8.BufferTerm("term-one"), IsOk());
- EXPECT_THAT(edit8.BufferTerm("term-two"), IsOk());
EXPECT_THAT(edit8.BufferTerm("term-three"), IsOk());
+ EXPECT_THAT(edit8.BufferTerm("term-five"), IsOk());
EXPECT_THAT(edit8.IndexAllBufferedTerms(), IsOk());
- Index::Editor edit9 =
- index_->Edit(kDocumentId9, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit9.BufferTerm("term-one"), IsOk());
- EXPECT_THAT(edit9.BufferTerm("term-two"), IsOk());
- EXPECT_THAT(edit9.IndexAllBufferedTerms(), IsOk());
+ // verify the combination of lite index and main index is in correct order.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(/*prefix=*/"t", /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(ElementsAre(
+ EqualsTermMetadata("term-five", 7), EqualsTermMetadata("term-six", 6),
+ EqualsTermMetadata("term-three", 5),
+ EqualsTermMetadata("term-four", 4), EqualsTermMetadata("term-one", 3),
+ EqualsTermMetadata("term-two", 2))));
- Index::Editor edit10 =
- index_->Edit(kDocumentId10, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit10.BufferTerm("term-one"), IsOk());
- EXPECT_THAT(edit10.IndexAllBufferedTerms(), IsOk());
+ // Get the first three terms.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t",
+ /*num_to_return=*/3,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-five", 7),
+ EqualsTermMetadata("term-six", 6),
+ EqualsTermMetadata("term-three", 5))));
+}
- Index::Editor edit11 =
- index_->Edit(kDocumentId11, kSectionId2, TermMatchType::EXACT_ONLY,
+TEST_F(IndexTest, FindTermByPrefix_InTermMatchTypePrefix_ShouldReturnInOrder) {
+ Index::Editor edit1 =
+ index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX,
/*namespace_id=*/0);
- EXPECT_THAT(edit11.BufferTerm("term-five"), IsOk());
- EXPECT_THAT(edit11.BufferTerm("term-six"), IsOk());
- EXPECT_THAT(edit11.IndexAllBufferedTerms(), IsOk());
+ AlwaysTrueNamespaceCheckerImpl impl;
+ EXPECT_THAT(edit1.BufferTerm("fo"), IsOk());
+ EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
- Index::Editor edit12 =
- index_->Edit(kDocumentId12, kSectionId2, TermMatchType::EXACT_ONLY,
+ Index::Editor edit2 =
+ index_->Edit(kDocumentId2, kSectionId2, TermMatchType::PREFIX,
/*namespace_id=*/0);
- EXPECT_THAT(edit12.BufferTerm("term-five"), IsOk());
- EXPECT_THAT(edit12.IndexAllBufferedTerms(), IsOk());
+ EXPECT_THAT(edit2.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
- // verify the combination of lite index and main index is in correct order.
- EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(ElementsAre(
- EqualsTermMetadata("term-five",
- kSecondSmallestPlApproxHits + 2), // 9
- EqualsTermMetadata("term-six", kSecondSmallestPlApproxHits + 1), // 8
- EqualsTermMetadata("term-one", kMinSizePlApproxHits + 4), // 7
- EqualsTermMetadata("term-two", kMinSizePlApproxHits + 3), // 6
- EqualsTermMetadata("term-three", kMinSizePlApproxHits + 2), // 5
- EqualsTermMetadata("term-four", kMinSizePlApproxHits + 1)))); // 4
+ Index::Editor edit3 =
+ index_->Edit(kDocumentId3, kSectionId2, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit3.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk());
- // Get the first three terms.
- EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0},
- /*num_to_return=*/3),
- IsOkAndHolds(ElementsAre(
- EqualsTermMetadata("term-five",
- kSecondSmallestPlApproxHits + 2), // 9
- EqualsTermMetadata("term-six", kSecondSmallestPlApproxHits + 1), // 8
- EqualsTermMetadata("term-one", kMinSizePlApproxHits + 4)))); // 7
+ ICING_ASSERT_OK(index_->Merge());
+ // verify the order in pls is correct
+ // "fo" { {doc0, exact_hit}, {doc2, prefix_hit}, {doc3, prefix_hit} }
+ // "foo" { {doc2, exact_hit}, {doc3, prefix_hit} }
+ // "fool" { {doc3, exact_hit} }
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fo", 3),
+ EqualsTermMetadata("foo", 2),
+ EqualsTermMetadata("fool", 1))));
+ // Find by exact only; every term should have the same hit count.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10,
+ TermMatchType::EXACT_ONLY, &impl),
+ IsOkAndHolds(UnorderedElementsAre(
+ EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
}
-TEST_F(IndexTest, FindTermByPrefixShouldReturnApproximateHitCountForMain) {
+TEST_F(IndexTest, FindTermByPrefixShouldReturnHitCountForMain) {
Index::Editor edit =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
@@ -1313,25 +1243,26 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnApproximateHitCountForMain) {
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
// 'foo' has 1 hit, 'fool' has 8 hits.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 8),
EqualsTermMetadata("foo", 1))));
ICING_ASSERT_OK(index_->Merge());
- // foo's hits should fit on a single pl. fool's hits will need two pls.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("foo", kMinSizePlApproxHits),
- EqualsTermMetadata("fool", kSecondSmallestPlApproxHits))));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 8))));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnCombinedHitCount) {
Index::Editor edit =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
@@ -1343,19 +1274,18 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCombinedHitCount) {
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- // 'foo' has 1 hit in the main index, 'fool' has 1 hit in the main index and
- // 1 hit in the lite index.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(ElementsAre(
- EqualsTermMetadata("fool", kMinSizePlApproxHits + 1),
- EqualsTermMetadata("foo", kMinSizePlApproxHits))));
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2),
+ EqualsTermMetadata("foo", 1))));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsFromBothIndices) {
Index::Editor edit =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
+
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
@@ -1368,10 +1298,10 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsFromBothIndices) {
// 'foo' has 1 hit in the main index, 'fool' has 1 hit in the lite index.
EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(ElementsAre(EqualsTermMetadata("foo", kMinSizePlApproxHits),
- EqualsTermMetadata("fool", 1))));
+ index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
}
TEST_F(IndexTest, GetElementsSize) {
@@ -1465,12 +1395,14 @@ TEST_F(IndexTest, GetDebugInfo) {
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
/*namespace_id=*/0);
+ index_->set_last_added_document_id(kDocumentId1);
ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
ICING_ASSERT_OK(index_->Merge());
edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ index_->set_last_added_document_id(kDocumentId2);
ASSERT_THAT(edit.BufferTerm("footer"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX,
@@ -1478,40 +1410,45 @@ TEST_F(IndexTest, GetDebugInfo) {
ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- std::string out0;
- index_->GetDebugInfo(/*verbosity=*/0, &out0);
- EXPECT_THAT(out0, Not(IsEmpty()));
+ IndexDebugInfoProto out0 = index_->GetDebugInfo(/*verbosity=*/0);
+ EXPECT_FALSE(out0.main_index_info().has_flash_index_storage_info());
+ EXPECT_THAT(out0.main_index_info().last_added_document_id(),
+ Eq(kDocumentId1));
+ EXPECT_THAT(out0.lite_index_info().curr_size(), Eq(2));
+ EXPECT_THAT(out0.lite_index_info().last_added_document_id(),
+ Eq(kDocumentId2));
- std::string out1;
- index_->GetDebugInfo(/*verbosity=*/1, &out1);
- EXPECT_THAT(out1, SizeIs(Gt(out0.size())));
+ IndexDebugInfoProto out1 = index_->GetDebugInfo(/*verbosity=*/1);
+ EXPECT_THAT(out1.main_index_info().flash_index_storage_info(),
+ Not(IsEmpty()));
// Add one more doc to the lite index. Debug strings should change.
edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ index_->set_last_added_document_id(kDocumentId3);
ASSERT_THAT(edit.BufferTerm("far"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- std::string out2;
- index_->GetDebugInfo(/*verbosity=*/0, &out2);
- EXPECT_THAT(out2, Ne(out0));
-
- std::string out3;
- index_->GetDebugInfo(/*verbosity=*/1, &out3);
- EXPECT_THAT(out3, Ne(out1));
+ IndexDebugInfoProto out2 = index_->GetDebugInfo(/*verbosity=*/0);
+ EXPECT_THAT(out2.lite_index_info().curr_size(), Eq(3));
+ EXPECT_THAT(out2.lite_index_info().last_added_document_id(),
+ Eq(kDocumentId3));
// Merge into the main index. Debug strings should change again.
ICING_ASSERT_OK(index_->Merge());
- std::string out4;
- index_->GetDebugInfo(/*verbosity=*/0, &out4);
- EXPECT_THAT(out4, Ne(out0));
- EXPECT_THAT(out4, Ne(out2));
-
- std::string out5;
- index_->GetDebugInfo(/*verbosity=*/1, &out5);
- EXPECT_THAT(out5, Ne(out1));
- EXPECT_THAT(out5, Ne(out3));
+ IndexDebugInfoProto out3 = index_->GetDebugInfo(/*verbosity=*/0);
+ EXPECT_TRUE(out3.has_index_storage_info());
+ EXPECT_THAT(out3.main_index_info().lexicon_info(), Not(IsEmpty()));
+ EXPECT_THAT(out3.main_index_info().last_added_document_id(),
+ Eq(kDocumentId3));
+ EXPECT_THAT(out3.lite_index_info().curr_size(), Eq(0));
+ EXPECT_THAT(out3.lite_index_info().hit_buffer_size(), Gt(0));
+ EXPECT_THAT(out3.lite_index_info().last_added_document_id(),
+ Eq(kInvalidDocumentId));
+ EXPECT_THAT(out3.lite_index_info().searchable_end(), Eq(0));
+ EXPECT_THAT(out3.lite_index_info().index_crc(), Gt(0));
+ EXPECT_THAT(out3.lite_index_info().lexicon_info(), Not(IsEmpty()));
}
TEST_F(IndexTest, BackfillingMultipleTermsSucceeds) {
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
index 43a846b..7c6d924 100644
--- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
@@ -48,13 +48,13 @@ using ::testing::ElementsAreArray;
using ::testing::Eq;
using ::testing::IsEmpty;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
- PropertyConfigProto_Cardinality_Code_OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
- StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
-constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test {
protected:
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
index 08df4fc..f215d63 100644
--- a/icing/index/lite/doc-hit-info-iterator-term-lite.cc
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
@@ -77,7 +77,8 @@ libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
ICING_ASSIGN_OR_RETURN(uint32_t term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
lite_index_->AppendHits(term_id, section_restrict_mask_,
- /*only_from_prefix_sections=*/false, &cached_hits_);
+ /*only_from_prefix_sections=*/false,
+ /*namespace_checker=*/nullptr, &cached_hits_);
cached_hits_idx_ = 0;
return libtextclassifier3::Status::OK;
}
@@ -100,7 +101,7 @@ DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE));
lite_index_->AppendHits(term_id, section_restrict_mask_,
/*only_from_prefix_sections=*/!exact_match,
- &cached_hits_);
+ /*namespace_checker=*/nullptr, &cached_hits_);
++terms_matched;
}
if (terms_matched > 1) {
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
index 9e4ac28..a5c6baf 100644
--- a/icing/index/lite/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -336,9 +336,12 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId(
int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
bool only_from_prefix_sections,
+ const NamespaceChecker* namespace_checker,
std::vector<DocHitInfo>* hits_out) {
int count = 0;
DocumentId last_document_id = kInvalidDocumentId;
+ // Record whether the last document belongs to the given namespaces.
+ bool last_document_in_namespace = false;
for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) {
TermIdHitPair term_id_hit_pair(
hit_buffer_.array_cast<TermIdHitPair>()[idx]);
@@ -355,22 +358,31 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
}
DocumentId document_id = hit.document_id();
if (document_id != last_document_id) {
+ last_document_id = document_id;
+ last_document_in_namespace =
+ namespace_checker == nullptr ||
+ namespace_checker->BelongsToTargetNamespaces(document_id);
+ if (!last_document_in_namespace) {
+ // The document is removed, expired, or does not belong to the target
+ // namespaces.
+ continue;
+ }
++count;
if (hits_out != nullptr) {
hits_out->push_back(DocHitInfo(document_id));
}
- last_document_id = document_id;
}
- if (hits_out != nullptr) {
+ if (hits_out != nullptr && last_document_in_namespace) {
hits_out->back().UpdateSection(hit.section_id(), hit.term_frequency());
}
}
return count;
}
-int LiteIndex::CountHits(uint32_t term_id) {
+libtextclassifier3::StatusOr<int> LiteIndex::CountHits(
+ uint32_t term_id, const NamespaceChecker* namespace_checker) {
return AppendHits(term_id, kSectionIdMaskAll,
- /*only_from_prefix_sections=*/false,
+ /*only_from_prefix_sections=*/false, namespace_checker,
/*hits_out=*/nullptr);
}
@@ -379,15 +391,16 @@ bool LiteIndex::is_full() const {
lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction));
}
-void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const {
- absl_ports::StrAppend(
- out, IcingStringUtil::StringPrintf("Lite Index\nHit buffer %u/%u\n",
- header_->cur_size(),
- options_.hit_buffer_size));
-
- // Lexicon.
- out->append("Lexicon stats:\n");
- lexicon_.GetDebugInfo(verbosity, out);
+IndexDebugInfoProto::LiteIndexDebugInfoProto LiteIndex::GetDebugInfo(
+ int verbosity) {
+ IndexDebugInfoProto::LiteIndexDebugInfoProto res;
+ res.set_curr_size(header_->cur_size());
+ res.set_hit_buffer_size(options_.hit_buffer_size);
+ res.set_last_added_document_id(header_->last_added_docid());
+ res.set_searchable_end(header_->searchable_end());
+ res.set_index_crc(ComputeChecksum().Get());
+ lexicon_.GetDebugInfo(verbosity, res.mutable_lexicon_info());
+ return res;
}
libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const {
@@ -408,12 +421,8 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo(
IndexStorageInfoProto storage_info) const {
int64_t header_and_hit_buffer_file_size =
filesystem_->GetFileSize(hit_buffer_fd_.get());
- if (header_and_hit_buffer_file_size != Filesystem::kBadFileSize) {
- storage_info.set_lite_index_hit_buffer_size(
- header_and_hit_buffer_file_size);
- } else {
- storage_info.set_lite_index_hit_buffer_size(-1);
- }
+ storage_info.set_lite_index_hit_buffer_size(
+ IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size));
int64_t lexicon_disk_usage = lexicon_.GetElementsSize();
if (lexicon_disk_usage != Filesystem::kBadFileSize) {
storage_info.set_lite_index_lexicon_size(lexicon_disk_usage);
diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h
index b134aba..378fc94 100644
--- a/icing/index/lite/lite-index.h
+++ b/icing/index/lite/lite-index.h
@@ -37,10 +37,12 @@
#include "icing/legacy/index/icing-lite-index-header.h"
#include "icing/legacy/index/icing-lite-index-options.h"
#include "icing/legacy/index/icing-mmapper.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
+#include "icing/store/namespace-checker.h"
#include "icing/store/namespace-id.h"
#include "icing/util/bit-util.h"
#include "icing/util/crc32.h"
@@ -140,13 +142,19 @@ class LiteIndex {
// skipping hits in non-prefix sections if only_from_prefix_sections is true,
// to hits_out. If hits_out is nullptr, no hits will be added.
//
+ // Only those hits which belong to the given namespaces will be counted and
+ // appended. A nullptr namespace checker will disable this check.
+ //
// Returns the number of hits that would be added to hits_out.
int AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
bool only_from_prefix_sections,
+ const NamespaceChecker* namespace_checker,
std::vector<DocHitInfo>* hits_out);
// Returns the hit count of the term.
- int CountHits(uint32_t term_id);
+ // Only those hits which belong to the given namespaces will be counted.
+ libtextclassifier3::StatusOr<int> CountHits(
+ uint32_t term_id, const NamespaceChecker* namespace_checker);
// Check if buffer has reached its capacity.
bool is_full() const;
@@ -234,7 +242,7 @@ class LiteIndex {
// Returns debug information for the index in out.
// verbosity <= 0, simplest debug information - size of lexicon, hit buffer
// verbosity > 0, more detailed debug information from the lexicon.
- void GetDebugInfo(int verbosity, std::string* out) const;
+ IndexDebugInfoProto::LiteIndexDebugInfoProto GetDebugInfo(int verbosity);
// Returns the byte size of all the elements held in the index. This excludes
// the size of any internal metadata of the index, e.g. the index's header.
diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc
new file mode 100644
index 0000000..825f830
--- /dev/null
+++ b/icing/index/lite/lite-index_test.cc
@@ -0,0 +1,110 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/lite/lite-index.h"
+
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/schema/section.h"
+#include "icing/store/namespace-checker.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
+
+class AlwaysFalseNamespaceCheckerImpl : public NamespaceChecker {
+ public:
+ bool BelongsToTargetNamespaces(DocumentId document_id) const override {
+ return false;
+ }
+};
+
+class LiteIndexTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ index_dir_ = GetTestTempDir() + "/test_dir";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));
+
+ std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024);
+ ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
+ LiteIndex::Create(options, &icing_filesystem_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+ }
+
+ void TearDown() override {
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
+ }
+
+ std::string index_dir_;
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ std::unique_ptr<LiteIndex> lite_index_;
+ std::unique_ptr<TermIdCodec> term_id_codec_;
+};
+
+constexpr NamespaceId kNamespace0 = 0;
+
+TEST_F(LiteIndexTest, LiteIndexAppendHits) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ Hit doc_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0));
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1));
+
+ std::vector<DocHitInfo> hits1;
+ lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ /*namespace_checker=*/nullptr, &hits1);
+ EXPECT_THAT(hits1, SizeIs(1));
+ EXPECT_THAT(hits1.back().document_id(), Eq(0));
+ // Check that the hits are coming from section 0 and section 1.
+ EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11));
+
+ std::vector<DocHitInfo> hits2;
+ AlwaysFalseNamespaceCheckerImpl always_false_namespace_checker;
+ lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ &always_false_namespace_checker, &hits2);
+ // Check that no hits are returned because they get skipped by the namespace
+ // checker.
+ EXPECT_THAT(hits2, IsEmpty());
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h
index 8d5b50b..6c6fbb8 100644
--- a/icing/index/main/flash-index-storage.h
+++ b/icing/index/main/flash-index-storage.h
@@ -159,6 +159,7 @@ class FlashIndexStorage {
libtextclassifier3::Status Reset();
+ // TODO(b/222349894) Convert the string output to a protocol buffer instead.
void GetDebugInfo(int verbosity, std::string* out) const;
private:
diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc
index b185138..2d6007b 100644
--- a/icing/index/main/main-index.cc
+++ b/icing/index/main/main-index.cc
@@ -133,18 +133,10 @@ libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const {
IndexStorageInfoProto MainIndex::GetStorageInfo(
IndexStorageInfoProto storage_info) const {
- int64_t lexicon_elt_size = main_lexicon_->GetElementsSize();
- if (lexicon_elt_size != IcingFilesystem::kBadFileSize) {
- storage_info.set_main_index_lexicon_size(lexicon_elt_size);
- } else {
- storage_info.set_main_index_lexicon_size(-1);
- }
- int64_t index_elt_size = flash_index_storage_->GetElementsSize();
- if (lexicon_elt_size != IcingFilesystem::kBadFileSize) {
- storage_info.set_main_index_storage_size(index_elt_size);
- } else {
- storage_info.set_main_index_storage_size(-1);
- }
+ storage_info.set_main_index_lexicon_size(
+ IcingFilesystem::SanitizeFileSize(main_lexicon_->GetElementsSize()));
+ storage_info.set_main_index_storage_size(
+ Filesystem::SanitizeFileSize(flash_index_storage_->GetElementsSize()));
storage_info.set_main_index_block_size(flash_index_storage_->block_size());
storage_info.set_num_blocks(flash_index_storage_->num_blocks());
storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction());
@@ -186,7 +178,7 @@ MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) {
if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) {
// Found it, but it doesn't have prefix hits. Exit early. No need to
// retrieve the posting list because there's nothing there for us.
- return libtextclassifier3::Status::OK;
+ return absl_ports::NotFoundError("The term doesn't have any prefix hits.");
}
PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id));
@@ -217,35 +209,45 @@ bool IsTermInNamespaces(
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
MainIndex::FindTermsByPrefix(const std::string& prefix,
- const std::vector<NamespaceId>& namespace_ids) {
+ TermMatchType::Code term_match_type,
+ const NamespaceChecker* namespace_checker) {
// Finds all the terms that start with the given prefix in the lexicon.
IcingDynamicTrie::Iterator term_iterator(*main_lexicon_, prefix.c_str());
- // A property reader to help check if a term has some property.
- IcingDynamicTrie::PropertyReadersAll property_reader(*main_lexicon_);
-
std::vector<TermMetadata> term_metadata_list;
while (term_iterator.IsValid()) {
- uint32_t term_value_index = term_iterator.GetValueIndex();
+ int count = 0;
+ DocumentId last_document_id = kInvalidDocumentId;
- // Skips the terms that don't exist in the given namespaces. We won't skip
- // any terms if namespace_ids is empty.
- if (!IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) {
- term_iterator.Advance();
- continue;
- }
PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
memcpy(&posting_list_id, term_iterator.GetValue(), sizeof(posting_list_id));
- // Getting the actual hit count would require reading the entire posting
- // list chain. We take an approximation to avoid all of those IO ops.
- // Because we are not reading the posting lists, it is impossible to
- // differentiate between single max-size posting lists and chains of
- // max-size posting lists. We assume that the impact on scoring is not
- // significant.
- int approx_hit_count = IndexBlock::ApproximateFullPostingListHitsForBlock(
- flash_index_storage_->block_size(),
- posting_list_id.posting_list_index_bits());
- term_metadata_list.emplace_back(term_iterator.GetKey(), approx_hit_count);
+ ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_id));
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
+ pl_accessor.GetNextHitsBatch());
+ for (const Hit& hit : hits) {
+ DocumentId document_id = hit.document_id();
+ if (document_id != last_document_id) {
+ last_document_id = document_id;
+ if (term_match_type == TermMatchType::EXACT_ONLY &&
+ hit.is_prefix_hit()) {
+ continue;
+ }
+ if (!namespace_checker->BelongsToTargetNamespaces(document_id)) {
+ // The document is removed, expired, or does not belong to the target
+ // namespaces.
+ continue;
+ }
+ // TODO(b/152934343) Add search type in SuggestionSpec to ask user to
+ // input search type, prefix or exact. And make different score strategy
+ // based on that.
+ ++count;
+ }
+ }
+ if (count > 0) {
+ term_metadata_list.push_back(TermMetadata(term_iterator.GetKey(), count));
+ }
term_iterator.Advance();
}
@@ -605,16 +607,22 @@ libtextclassifier3::Status MainIndex::AddPrefixBackfillHits(
return libtextclassifier3::Status::OK;
}
-void MainIndex::GetDebugInfo(int verbosity, std::string* out) const {
+IndexDebugInfoProto::MainIndexDebugInfoProto MainIndex::GetDebugInfo(
+ int verbosity) const {
+ IndexDebugInfoProto::MainIndexDebugInfoProto res;
+
// Lexicon.
- out->append("Main Lexicon stats:\n");
- main_lexicon_->GetDebugInfo(verbosity, out);
+ main_lexicon_->GetDebugInfo(verbosity, res.mutable_lexicon_info());
+
+ res.set_last_added_document_id(last_added_document_id());
if (verbosity <= 0) {
- return;
+ return res;
}
- flash_index_storage_->GetDebugInfo(verbosity, out);
+ flash_index_storage_->GetDebugInfo(verbosity,
+ res.mutable_flash_index_storage_info());
+ return res;
}
} // namespace lib
diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h
index 919a5c5..abb0418 100644
--- a/icing/index/main/main-index.h
+++ b/icing/index/main/main-index.h
@@ -27,7 +27,9 @@
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
+#include "icing/store/namespace-checker.h"
#include "icing/store/namespace-id.h"
#include "icing/util/status-macros.h"
@@ -71,17 +73,17 @@ class MainIndex {
// Finds terms with the given prefix in the given namespaces. If
// 'namespace_ids' is empty, returns results from all the namespaces. The
// input prefix must be normalized, otherwise inaccurate results may be
- // returned. Results are not sorted specifically and are in lexigraphical
- // order. Number of results are no more than 'num_to_return'.
- //
- // The hit count returned with each TermMetadata is an approximation based of
- // posting list size.
+ // returned. If term_match_type is EXACT, only exact hits will be counted; if
+ // it is PREFIX, both prefix and exact hits will be counted. Results are not
+ // specifically sorted and are in lexicographical order. The number of
+ // results is no more than 'num_to_return'.
//
// Returns:
// A list of TermMetadata on success
// INTERNAL_ERROR if failed to access term data.
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
- const std::string& prefix, const std::vector<NamespaceId>& namespace_ids);
+ const std::string& prefix, TermMatchType::Code term_match_type,
+ const NamespaceChecker* namespace_checker);
struct LexiconMergeOutputs {
// Maps from main_lexicon tvi for new branching point to the main_lexicon
@@ -184,7 +186,8 @@ class MainIndex {
// verbosity <= 0, simplest debug information - just the lexicon
// verbosity > 0, more detailed debug information including raw postings
// lists.
- void GetDebugInfo(int verbosity, std::string* out) const;
+ IndexDebugInfoProto::MainIndexDebugInfoProto GetDebugInfo(
+ int verbosity) const;
private:
libtextclassifier3::Status Init(const std::string& index_directory,
diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc
index 74139be..fa83d68 100644
--- a/icing/index/main/main-index_test.cc
+++ b/icing/index/main/main-index_test.cc
@@ -162,6 +162,34 @@ TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsValidAccessor) {
EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"), IsOk());
}
+TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsNotFound) {
+ // 1. Index one doc in the Lite Index:
+ // - Doc0 {"foot" is_in_prefix_section=false}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ // 3. Merge the index. The main index should return NOT_FOUND when we look up
+ // the prefix "foo".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
+ // "foot" was only added as a non-prefix hit, so "foo" must be NOT_FOUND.
+ EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
TEST_F(MainIndexTest, MainIndexGetAccessorForExactTermNotFound) {
// Create the main index. It should have no entries in its lexicon.
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";