diff options
Diffstat (limited to 'icing/index')
-rw-r--r-- | icing/index/index-processor.cc | 20 | ||||
-rw-r--r-- | icing/index/index-processor.h | 2 | ||||
-rw-r--r-- | icing/index/index-processor_benchmark.cc | 2 | ||||
-rw-r--r-- | icing/index/index-processor_test.cc | 131 | ||||
-rw-r--r-- | icing/index/index.cc | 61 | ||||
-rw-r--r-- | icing/index/index.h | 27 | ||||
-rw-r--r-- | icing/index/index_test.cc | 391 | ||||
-rw-r--r-- | icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc | 10 | ||||
-rw-r--r-- | icing/index/lite/doc-hit-info-iterator-term-lite.cc | 5 | ||||
-rw-r--r-- | icing/index/lite/lite-index.cc | 47 | ||||
-rw-r--r-- | icing/index/lite/lite-index.h | 12 | ||||
-rw-r--r-- | icing/index/lite/lite-index_test.cc | 110 | ||||
-rw-r--r-- | icing/index/main/flash-index-storage.h | 1 | ||||
-rw-r--r-- | icing/index/main/main-index.cc | 86 | ||||
-rw-r--r-- | icing/index/main/main-index.h | 17 | ||||
-rw-r--r-- | icing/index/main/main-index_test.cc | 28 |
16 files changed, 575 insertions, 375 deletions
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc index 1aae732..207c033 100644 --- a/icing/index/index-processor.cc +++ b/icing/index/index-processor.cc @@ -73,9 +73,23 @@ libtextclassifier3::Status IndexProcessor::IndexDocument( section.metadata.term_match_type, /*namespace_id=*/0); for (std::string_view token : section.token_sequence) { ++num_tokens; - std::string term = normalizer_.NormalizeTerm(token); - // Add this term to Hit buffer. - status = editor.BufferTerm(term.c_str()); + + switch (section.metadata.tokenizer) { + case StringIndexingConfig::TokenizerType::VERBATIM: + // data() is safe to use here because a token created from the + // VERBATIM tokenizer is the entire string value. The character at + // data() + token.length() is guaranteed to be a null char. + status = editor.BufferTerm(token.data()); + break; + case StringIndexingConfig::TokenizerType::NONE: + ICING_LOG(WARNING) + << "Unexpected TokenizerType::NONE found when indexing document."; + [[fallthrough]]; + case StringIndexingConfig::TokenizerType::PLAIN: + std::string normalized_term = normalizer_.NormalizeTerm(token); + status = editor.BufferTerm(normalized_term.c_str()); + } + if (!status.ok()) { // We've encountered a failure. Bail out. We'll mark this doc as deleted // and signal a failure to the client. 
diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h index c4b77b5..269e41c 100644 --- a/icing/index/index-processor.h +++ b/icing/index/index-processor.h @@ -69,8 +69,6 @@ class IndexProcessor { IndexProcessor(const Normalizer* normalizer, Index* index, const Clock* clock) : normalizer_(*normalizer), index_(index), clock_(*clock) {} - std::string NormalizeToken(const Token& token); - const Normalizer& normalizer_; Index* const index_; const Clock& clock_; diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc index 6e072c7..1aad7d0 100644 --- a/icing/index/index-processor_benchmark.cc +++ b/icing/index/index-processor_benchmark.cc @@ -16,7 +16,6 @@ #include "gmock/gmock.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/index-processor.h" #include "icing/index/index.h" #include "icing/legacy/core/icing-string-util.h" @@ -24,6 +23,7 @@ #include "icing/schema/schema-util.h" #include "icing/schema/section-manager.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index 449bc3e..bd310de 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -30,7 +30,6 @@ #include "icing/absl_ports/str_join.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index.h" #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -49,6 +48,7 @@ #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include 
"icing/testing/icu-data-file-helper.h" #include "icing/testing/random-string.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -90,6 +90,8 @@ constexpr std::string_view kRepeatedProperty = "repeated"; constexpr std::string_view kSubProperty = "submessage"; constexpr std::string_view kNestedType = "NestedType"; constexpr std::string_view kNestedProperty = "nested"; +constexpr std::string_view kExactVerbatimProperty = "verbatimExact"; +constexpr std::string_view kPrefixedVerbatimProperty = "verbatimPrefixed"; constexpr DocumentId kDocumentId0 = 0; constexpr DocumentId kDocumentId1 = 1; @@ -98,6 +100,8 @@ constexpr SectionId kExactSectionId = 0; constexpr SectionId kPrefixedSectionId = 1; constexpr SectionId kRepeatedSectionId = 2; constexpr SectionId kNestedSectionId = 3; +constexpr SectionId kExactVerbatimSectionId = 4; +constexpr SectionId kPrefixedVerbatimSectionId = 5; using Cardinality = PropertyConfigProto::Cardinality; using DataType = PropertyConfigProto::DataType; @@ -106,21 +110,23 @@ using ::testing::Eq; using ::testing::IsEmpty; using ::testing::Test; -constexpr PropertyConfigProto_DataType_Code TYPE_STRING = - PropertyConfigProto_DataType_Code_STRING; -constexpr PropertyConfigProto_DataType_Code TYPE_BYTES = - PropertyConfigProto_DataType_Code_BYTES; +constexpr PropertyConfigProto::DataType::Code TYPE_STRING = + PropertyConfigProto::DataType::STRING; +constexpr PropertyConfigProto::DataType::Code TYPE_BYTES = + PropertyConfigProto::DataType::BYTES; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = - PropertyConfigProto_Cardinality_Code_REPEATED; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = + PropertyConfigProto::Cardinality::REPEATED; 
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM = + StringIndexingConfig::TokenizerType::VERBATIM; -constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; -constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; +constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; +constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; class IndexProcessorTest : public Test { protected: @@ -180,6 +186,16 @@ class IndexProcessorTest : public Test { .SetCardinality(CARDINALITY_REPEATED)) .AddProperty( PropertyConfigBuilder() + .SetName(kExactVerbatimProperty) + .SetDataTypeString(MATCH_EXACT, TOKENIZER_VERBATIM) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPrefixedVerbatimProperty) + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_VERBATIM) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty( + PropertyConfigBuilder() .SetName(kSubProperty) .SetDataTypeDocument( kNestedType, /*index_nested_properties=*/true) @@ -797,6 +813,95 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) { EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id)); } +TEST_F(IndexProcessorTest, ExactVerbatimProperty) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kExactVerbatimProperty), + "Hello, world!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(tokenized_document.num_tokens(), 1); + + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); + 
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("Hello, world!", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + std::vector<DocHitInfo> hits = GetHits(std::move(itr)); + std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{ + {kExactVerbatimSectionId, 1}}; + + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + kDocumentId0, expectedMap))); +} + +TEST_F(IndexProcessorTest, PrefixVerbatimProperty) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPrefixedVerbatimProperty), + "Hello, world!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(tokenized_document.num_tokens(), 1); + + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); + EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); + + // We expect to match the document we indexed as "Hello, w" is a prefix + // of "Hello, world!" 
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("Hello, w", kSectionIdMaskAll, + TermMatchType::PREFIX)); + std::vector<DocHitInfo> hits = GetHits(std::move(itr)); + std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{ + {kPrefixedVerbatimSectionId, 1}}; + + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + kDocumentId0, expectedMap))); +} + +TEST_F(IndexProcessorTest, VerbatimPropertyDoesntMatchSubToken) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPrefixedVerbatimProperty), + "Hello, world!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(tokenized_document.num_tokens(), 1); + + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); + EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("world", kSectionIdMaskAll, TermMatchType::PREFIX)); + std::vector<DocHitInfo> hits = GetHits(std::move(itr)); + + // We should not have hits for term "world" as the index processor should + // create a sole token "Hello, world! for the document. + EXPECT_THAT(hits, IsEmpty()); +} + } // namespace } // namespace lib diff --git a/icing/index/index.cc b/icing/index/index.cc index 1bdab21..02ba699 100644 --- a/icing/index/index.cc +++ b/icing/index/index.cc @@ -71,24 +71,6 @@ IcingDynamicTrie::Options GetMainLexiconOptions() { return IcingDynamicTrie::Options(); } -// Helper function to check if a term is in the given namespaces. -// TODO(tjbarron): Implement a method PropertyReadersAll.HasAnyProperty(). 
-bool IsTermInNamespaces( - const IcingDynamicTrie::PropertyReadersAll& property_reader, - uint32_t value_index, const std::vector<NamespaceId>& namespace_ids) { - if (namespace_ids.empty()) { - return true; - } - for (NamespaceId namespace_id : namespace_ids) { - if (property_reader.HasProperty(GetNamespacePropertyId(namespace_id), - value_index)) { - return true; - } - } - - return false; -} - enum class MergeAction { kTakeLiteTerm, kTakeMainTerm, kMergeTerms }; // Merge the TermMetadata from lite index and main index. If the term exists in @@ -137,7 +119,7 @@ std::vector<TermMetadata> MergeAndRankTermMetadatas( int total_est_hit_count = lite_term_itr->hit_count + main_term_itr->hit_count; PushToTermHeap(TermMetadata(std::move(lite_term_itr->content), - total_est_hit_count), + total_est_hit_count), num_to_return, merged_term_metadata_heap); ++lite_term_itr; ++main_term_itr; @@ -228,32 +210,26 @@ Index::GetIterator(const std::string& term, SectionIdMask section_id_mask, libtextclassifier3::StatusOr<std::vector<TermMetadata>> Index::FindLiteTermsByPrefix(const std::string& prefix, - const std::vector<NamespaceId>& namespace_ids) { + const NamespaceChecker* namespace_checker) { // Finds all the terms that start with the given prefix in the lexicon. IcingDynamicTrie::Iterator term_iterator(lite_index_->lexicon(), prefix.c_str()); - // A property reader to help check if a term has some property. - IcingDynamicTrie::PropertyReadersAll property_reader(lite_index_->lexicon()); - std::vector<TermMetadata> term_metadata_list; while (term_iterator.IsValid()) { uint32_t term_value_index = term_iterator.GetValueIndex(); - // Skips the terms that don't exist in the given namespaces. We won't skip - // any terms if namespace_ids is empty. 
- if (!IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) { - term_iterator.Advance(); - continue; - } - ICING_ASSIGN_OR_RETURN( uint32_t term_id, term_id_codec_->EncodeTvi(term_value_index, TviType::LITE), absl_ports::InternalError("Failed to access terms in lexicon.")); - - term_metadata_list.emplace_back(term_iterator.GetKey(), - lite_index_->CountHits(term_id)); + ICING_ASSIGN_OR_RETURN(int hit_count, + lite_index_->CountHits(term_id, namespace_checker)); + if (hit_count > 0) { + // There is at least one document in the given namespace has this term. + term_metadata_list.push_back( + TermMetadata(term_iterator.GetKey(), hit_count)); + } term_iterator.Advance(); } @@ -261,21 +237,20 @@ Index::FindLiteTermsByPrefix(const std::string& prefix, } libtextclassifier3::StatusOr<std::vector<TermMetadata>> -Index::FindTermsByPrefix(const std::string& prefix, - const std::vector<NamespaceId>& namespace_ids, - int num_to_return) { +Index::FindTermsByPrefix(const std::string& prefix, int num_to_return, + TermMatchType::Code term_match_type, + const NamespaceChecker* namespace_checker) { std::vector<TermMetadata> term_metadata_list; if (num_to_return <= 0) { return term_metadata_list; } - // Get results from the LiteIndex. ICING_ASSIGN_OR_RETURN(std::vector<TermMetadata> lite_term_metadata_list, - FindLiteTermsByPrefix(prefix, namespace_ids)); + FindLiteTermsByPrefix(prefix, namespace_checker)); // Append results from the MainIndex. 
ICING_ASSIGN_OR_RETURN(std::vector<TermMetadata> main_term_metadata_list, - main_index_->FindTermsByPrefix(prefix, namespace_ids)); - + main_index_->FindTermsByPrefix(prefix, term_match_type, + namespace_checker)); return MergeAndRankTermMetadatas(std::move(lite_term_metadata_list), std::move(main_term_metadata_list), num_to_return); @@ -284,11 +259,7 @@ Index::FindTermsByPrefix(const std::string& prefix, IndexStorageInfoProto Index::GetStorageInfo() const { IndexStorageInfoProto storage_info; int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str()); - if (directory_size != Filesystem::kBadFileSize) { - storage_info.set_index_size(directory_size); - } else { - storage_info.set_index_size(-1); - } + storage_info.set_index_size(Filesystem::SanitizeFileSize(directory_size)); storage_info = lite_index_->GetStorageInfo(std::move(storage_info)); return main_index_->GetStorageInfo(std::move(storage_info)); } diff --git a/icing/index/index.h b/icing/index/index.h index 693cf04..5c53349 100644 --- a/icing/index/index.h +++ b/icing/index/index.h @@ -32,10 +32,12 @@ #include "icing/index/term-id-codec.h" #include "icing/index/term-metadata.h" #include "icing/legacy/index/icing-filesystem.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/store/namespace-checker.h" #include "icing/store/namespace-id.h" #include "icing/util/crc32.h" @@ -142,9 +144,14 @@ class Index { // index. // verbosity > 0, more detailed debug information including raw postings // lists. 
- void GetDebugInfo(int verbosity, std::string* out) const { - lite_index_->GetDebugInfo(verbosity, out); - main_index_->GetDebugInfo(verbosity, out); + IndexDebugInfoProto GetDebugInfo(int verbosity) const { + IndexDebugInfoProto debug_info; + *debug_info.mutable_index_storage_info() = GetStorageInfo(); + *debug_info.mutable_lite_index_info() = + lite_index_->GetDebugInfo(verbosity); + *debug_info.mutable_main_index_info() = + main_index_->GetDebugInfo(verbosity); + return debug_info; } // Returns the byte size of the all the elements held in the index. This @@ -181,17 +188,17 @@ class Index { TermMatchType::Code term_match_type); // Finds terms with the given prefix in the given namespaces. If - // 'namespace_ids' is empty, returns results from all the namespaces. The - // input prefix must be normalized, otherwise inaccurate results may be - // returned. Results are not sorted specifically and are in their original - // order. Number of results are no more than 'num_to_return'. + // 'namespace_ids' is empty, returns results from all the namespaces. Results + // are sorted in decreasing order of hit count. Number of results are no more + // than 'num_to_return'. // // Returns: // A list of TermMetadata on success // INTERNAL_ERROR if failed to access term data. libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix( - const std::string& prefix, const std::vector<NamespaceId>& namespace_ids, - int num_to_return); + const std::string& prefix, int num_to_return, + TermMatchType::Code term_match_type, + const NamespaceChecker* namespace_checker); // A class that can be used to add hits to the index. 
// @@ -267,7 +274,7 @@ class Index { filesystem_(filesystem) {} libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindLiteTermsByPrefix( - const std::string& prefix, const std::vector<NamespaceId>& namespace_ids); + const std::string& prefix, const NamespaceChecker* namespace_checker); std::unique_ptr<LiteIndex> lite_index_; std::unique_ptr<MainIndex> main_index_; diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc index 00d5ad6..8355c01 100644 --- a/icing/index/index_test.cc +++ b/icing/index/index_test.cc @@ -31,10 +31,12 @@ #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mock-filesystem.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/testing/always-true-namespace-checker-impl.h" #include "icing/testing/common-matchers.h" #include "icing/testing/random-string.h" #include "icing/testing/tmp-directory.h" @@ -89,22 +91,9 @@ constexpr DocumentId kDocumentId5 = 5; constexpr DocumentId kDocumentId6 = 6; constexpr DocumentId kDocumentId7 = 7; constexpr DocumentId kDocumentId8 = 8; -constexpr DocumentId kDocumentId9 = 9; -constexpr DocumentId kDocumentId10 = 10; -constexpr DocumentId kDocumentId11 = 11; -constexpr DocumentId kDocumentId12 = 12; constexpr SectionId kSectionId2 = 2; constexpr SectionId kSectionId3 = 3; -// The value returned by IndexBlock::ApproximateFullPostingListHitsForBlock( -// GetBlockSize(), -// GetPostingListIndexBits(posting_list_utils::min_posting_list_size())); -constexpr int kMinSizePlApproxHits = 3; -// The value returned by IndexBlock::ApproximateFullPostingListHitsForBlock( -// GetBlockSize(), -// GetPostingListIndexBits(2 * posting_list_utils::min_posting_list_size())); -constexpr int kSecondSmallestPlApproxHits = 7; - std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> 
iterator) { std::vector<DocHitInfo> infos; while (iterator->Advance().ok()) { @@ -920,148 +909,82 @@ TEST_F(IndexTest, InvalidHitBufferSize) { TEST_F(IndexTest, FindTermByPrefixShouldReturnEmpty) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0}, - /*num_to_return=*/0), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*num_to_return=*/0, + TermMatchType::PREFIX, &impl), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0}, - /*num_to_return=*/-1), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", + /*num_to_return=*/-1, + TermMatchType::PREFIX, &impl), IsOkAndHolds(IsEmpty())); ICING_ASSERT_OK(index_->Merge()); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0}, - /*num_to_return=*/0), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", + /*num_to_return=*/0, + TermMatchType::PREFIX, &impl), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0}, - /*num_to_return=*/-1), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", + /*num_to_return=*/-1, + TermMatchType::PREFIX, &impl), IsOkAndHolds(IsEmpty())); } TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectResult) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // "b" should only match "bar" but not "foo". 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0}, - /*num_to_return=*/10), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1)))); ICING_ASSERT_OK(index_->Merge()); // "b" should only match "bar" but not "foo". - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("bar", kMinSizePlApproxHits)))); + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldRespectNumToReturn) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("fo"), IsOk()); EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // We have 3 results but only 2 should be returned. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/2), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/2, + TermMatchType::PREFIX, &impl), IsOkAndHolds(SizeIs(2))); ICING_ASSERT_OK(index_->Merge()); // We have 3 results but only 2 should be returned. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/2), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/2, + TermMatchType::PREFIX, &impl), IsOkAndHolds(SizeIs(2))); } -TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInOneNamespace) { - Index::Editor edit1 = - index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); - EXPECT_THAT(edit1.BufferTerm("foo"), IsOk()); - EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); - - Index::Editor edit2 = - index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/1); - EXPECT_THAT(edit2.BufferTerm("fool"), IsOk()); - EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); - - // namespace with id 0 has 2 results. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1), - EqualsTermMetadata("foo", 1)))); - // namespace with id 1 has 1 result. - EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fool", 1)))); - - ICING_ASSERT_OK(index_->Merge()); - - // namespace with id 0 has 2 results. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("fo", kMinSizePlApproxHits), - EqualsTermMetadata("foo", kMinSizePlApproxHits)))); - // namespace with id 1 has 1 result. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("fool", kMinSizePlApproxHits)))); -} - -TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) { - Index::Editor edit1 = - index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); - EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); - - Index::Editor edit2 = - index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/1); - EXPECT_THAT(edit2.BufferTerm("foo"), IsOk()); - EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); - - Index::Editor edit3 = - index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/2); - EXPECT_THAT(edit3.BufferTerm("fool"), IsOk()); - EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk()); - - // Should return "foo" and "fool" which are in namespaces with ids 1 and 2. 
- EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), - EqualsTermMetadata("fool", 1)))); - - ICING_ASSERT_OK(index_->Merge()); - - EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("foo", kMinSizePlApproxHits), - EqualsTermMetadata("fool", kMinSizePlApproxHits)))); -} - TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) { Index::Editor edit1 = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); @@ -1078,8 +1001,9 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) { EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk()); // Should return "fo", "foo" and "fool" across all namespaces. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{}, - /*num_to_return=*/10), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), IsOkAndHolds(UnorderedElementsAre( EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1), EqualsTermMetadata("fool", 1)))); @@ -1087,18 +1011,19 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) { ICING_ASSERT_OK(index_->Merge()); // Should return "fo", "foo" and "fool" across all namespaces. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{}, - /*num_to_return=*/10), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("fo", kMinSizePlApproxHits), - EqualsTermMetadata("foo", kMinSizePlApproxHits), - EqualsTermMetadata("fool", kMinSizePlApproxHits)))); + EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) { Index::Editor edit1 = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit1.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit1.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); @@ -1110,20 +1035,19 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) { EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); // 'foo' has 1 hit, 'fool' has 2 hits. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2), EqualsTermMetadata("foo", 1)))); ICING_ASSERT_OK(index_->Merge()); - // foo's one hit should fit on a min-sized pl, fool's two hits should also fit - // on a min-sized pl. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("foo", kMinSizePlApproxHits), - EqualsTermMetadata("fool", kMinSizePlApproxHits)))); + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2), + EqualsTermMetadata("foo", 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) { @@ -1132,6 +1056,7 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) { Index::Editor edit1 = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit1.BufferTerm("term-one"), IsOk()); EXPECT_THAT(edit1.BufferTerm("term-two"), IsOk()); EXPECT_THAT(edit1.BufferTerm("term-three"), IsOk()); @@ -1181,8 +1106,9 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) { EXPECT_THAT(edit6.IndexAllBufferedTerms(), IsOk()); // verify the order in lite index is correct. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0}, - /*num_to_return=*/10), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-six", 6), EqualsTermMetadata("term-five", 5), EqualsTermMetadata("term-four", 4), @@ -1192,93 +1118,97 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) { ICING_ASSERT_OK(index_->Merge()); - // Since most of term has same approx hit count, we don't verify order in the - // main index. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("term-six", kSecondSmallestPlApproxHits), - EqualsTermMetadata("term-five", kSecondSmallestPlApproxHits), - EqualsTermMetadata("term-four", kMinSizePlApproxHits), - EqualsTermMetadata("term-three", kMinSizePlApproxHits), - EqualsTermMetadata("term-two", kMinSizePlApproxHits), - EqualsTermMetadata("term-one", kMinSizePlApproxHits)))); - - // keep push terms to the lite index. For term 1-4, since they has same hit - // count kMinSizePlApproxHits, we will push 4 term-one, 3 term-two, 2 - // term-three and one term-four to make them in reverse order. And for term - // 5 & 6, we will push 2 term-five and one term-six. + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-six", 6), + EqualsTermMetadata("term-five", 5), + EqualsTermMetadata("term-four", 4), + EqualsTermMetadata("term-three", 3), + EqualsTermMetadata("term-two", 2), + EqualsTermMetadata("term-one", 1)))); + + // keep push terms to the lite index. We will add 2 document to term-five, + // term-three and term-one. The output order should be 5-6-3-4-1-2. 
Index::Editor edit7 = index_->Edit(kDocumentId7, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); EXPECT_THAT(edit7.BufferTerm("term-one"), IsOk()); - EXPECT_THAT(edit7.BufferTerm("term-two"), IsOk()); EXPECT_THAT(edit7.BufferTerm("term-three"), IsOk()); - EXPECT_THAT(edit7.BufferTerm("term-four"), IsOk()); + EXPECT_THAT(edit7.BufferTerm("term-five"), IsOk()); EXPECT_THAT(edit7.IndexAllBufferedTerms(), IsOk()); Index::Editor edit8 = index_->Edit(kDocumentId8, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); EXPECT_THAT(edit8.BufferTerm("term-one"), IsOk()); - EXPECT_THAT(edit8.BufferTerm("term-two"), IsOk()); EXPECT_THAT(edit8.BufferTerm("term-three"), IsOk()); + EXPECT_THAT(edit8.BufferTerm("term-five"), IsOk()); EXPECT_THAT(edit8.IndexAllBufferedTerms(), IsOk()); - Index::Editor edit9 = - index_->Edit(kDocumentId9, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit9.BufferTerm("term-one"), IsOk()); - EXPECT_THAT(edit9.BufferTerm("term-two"), IsOk()); - EXPECT_THAT(edit9.IndexAllBufferedTerms(), IsOk()); + // verify the combination of lite index and main index is in correct order. + EXPECT_THAT( + index_->FindTermsByPrefix(/*prefix=*/"t", /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(ElementsAre( + EqualsTermMetadata("term-five", 7), EqualsTermMetadata("term-six", 6), + EqualsTermMetadata("term-three", 5), + EqualsTermMetadata("term-four", 4), EqualsTermMetadata("term-one", 3), + EqualsTermMetadata("term-two", 2)))); - Index::Editor edit10 = - index_->Edit(kDocumentId10, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit10.BufferTerm("term-one"), IsOk()); - EXPECT_THAT(edit10.IndexAllBufferedTerms(), IsOk()); + // Get the first three terms. 
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", + /*num_to_return=*/3, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-five", 7), + EqualsTermMetadata("term-six", 6), + EqualsTermMetadata("term-three", 5)))); +} - Index::Editor edit11 = - index_->Edit(kDocumentId11, kSectionId2, TermMatchType::EXACT_ONLY, +TEST_F(IndexTest, FindTermByPrefix_InTermMatchTypePrefix_ShouldReturnInOrder) { + Index::Editor edit1 = + index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - EXPECT_THAT(edit11.BufferTerm("term-five"), IsOk()); - EXPECT_THAT(edit11.BufferTerm("term-six"), IsOk()); - EXPECT_THAT(edit11.IndexAllBufferedTerms(), IsOk()); + AlwaysTrueNamespaceCheckerImpl impl; + EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); + EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); - Index::Editor edit12 = - index_->Edit(kDocumentId12, kSectionId2, TermMatchType::EXACT_ONLY, + Index::Editor edit2 = + index_->Edit(kDocumentId2, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - EXPECT_THAT(edit12.BufferTerm("term-five"), IsOk()); - EXPECT_THAT(edit12.IndexAllBufferedTerms(), IsOk()); + EXPECT_THAT(edit2.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); - // verify the combination of lite index and main index is in correct order. 
- EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(ElementsAre( - EqualsTermMetadata("term-five", - kSecondSmallestPlApproxHits + 2), // 9 - EqualsTermMetadata("term-six", kSecondSmallestPlApproxHits + 1), // 8 - EqualsTermMetadata("term-one", kMinSizePlApproxHits + 4), // 7 - EqualsTermMetadata("term-two", kMinSizePlApproxHits + 3), // 6 - EqualsTermMetadata("term-three", kMinSizePlApproxHits + 2), // 5 - EqualsTermMetadata("term-four", kMinSizePlApproxHits + 1)))); // 4 + Index::Editor edit3 = + index_->Edit(kDocumentId3, kSectionId2, TermMatchType::PREFIX, + /*namespace_id=*/0); + EXPECT_THAT(edit3.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk()); - // Get the first three terms. - EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0}, - /*num_to_return=*/3), - IsOkAndHolds(ElementsAre( - EqualsTermMetadata("term-five", - kSecondSmallestPlApproxHits + 2), // 9 - EqualsTermMetadata("term-six", kSecondSmallestPlApproxHits + 1), // 8 - EqualsTermMetadata("term-one", kMinSizePlApproxHits + 4)))); // 7 + ICING_ASSERT_OK(index_->Merge()); + // verify the order in pls is correct + // "fo" { {doc0, exact_hit}, {doc1, prefix_hit}, {doc2, prefix_hit} } + // "foo" { {doc1, exact_hit}, {doc2, prefix_hit} } + // "fool" { {doc2, exact_hit} } + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(ElementsAre(EqualsTermMetadata("fo", 3), + EqualsTermMetadata("foo", 2), + EqualsTermMetadata("fool", 1)))); + // Find by exact only: every term should have the same hit count. 
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10, + TermMatchType::EXACT_ONLY, &impl), + IsOkAndHolds(UnorderedElementsAre( + EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 1)))); } -TEST_F(IndexTest, FindTermByPrefixShouldReturnApproximateHitCountForMain) { +TEST_F(IndexTest, FindTermByPrefixShouldReturnHitCountForMain) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); @@ -1313,25 +1243,26 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnApproximateHitCountForMain) { EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // 'foo' has 1 hit, 'fool' has 8 hits. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 8), EqualsTermMetadata("foo", 1)))); ICING_ASSERT_OK(index_->Merge()); - // foo's hits should fit on a single pl. fool's hits will need two pls. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("foo", kMinSizePlApproxHits), - EqualsTermMetadata("fool", kSecondSmallestPlApproxHits)))); + EXPECT_THAT( + index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 8)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnCombinedHitCount) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); @@ -1343,19 +1274,18 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCombinedHitCount) { EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); - // 'foo' has 1 hit in the main index, 'fool' has 1 hit in the main index and - // 1 hit in the lite index. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(ElementsAre( - EqualsTermMetadata("fool", kMinSizePlApproxHits + 1), - EqualsTermMetadata("foo", kMinSizePlApproxHits)))); + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2), + EqualsTermMetadata("foo", 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsFromBothIndices) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); @@ -1368,10 +1298,10 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsFromBothIndices) { // 'foo' has 1 hit in the main index, 'fool' has 1 hit in the lite index. EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(ElementsAre(EqualsTermMetadata("foo", kMinSizePlApproxHits), - EqualsTermMetadata("fool", 1)))); + index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 1)))); } TEST_F(IndexTest, GetElementsSize) { @@ -1465,12 +1395,14 @@ TEST_F(IndexTest, GetDebugInfo) { EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); + index_->set_last_added_document_id(kDocumentId1); ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + index_->set_last_added_document_id(kDocumentId2); ASSERT_THAT(edit.BufferTerm("footer"), IsOk()); 
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX, @@ -1478,40 +1410,45 @@ TEST_F(IndexTest, GetDebugInfo) { ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); - std::string out0; - index_->GetDebugInfo(/*verbosity=*/0, &out0); - EXPECT_THAT(out0, Not(IsEmpty())); + IndexDebugInfoProto out0 = index_->GetDebugInfo(/*verbosity=*/0); + EXPECT_FALSE(out0.main_index_info().has_flash_index_storage_info()); + EXPECT_THAT(out0.main_index_info().last_added_document_id(), + Eq(kDocumentId1)); + EXPECT_THAT(out0.lite_index_info().curr_size(), Eq(2)); + EXPECT_THAT(out0.lite_index_info().last_added_document_id(), + Eq(kDocumentId2)); - std::string out1; - index_->GetDebugInfo(/*verbosity=*/1, &out1); - EXPECT_THAT(out1, SizeIs(Gt(out0.size()))); + IndexDebugInfoProto out1 = index_->GetDebugInfo(/*verbosity=*/1); + EXPECT_THAT(out1.main_index_info().flash_index_storage_info(), + Not(IsEmpty())); // Add one more doc to the lite index. Debug info should change. edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + index_->set_last_added_document_id(kDocumentId3); ASSERT_THAT(edit.BufferTerm("far"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); - std::string out2; - index_->GetDebugInfo(/*verbosity=*/0, &out2); - EXPECT_THAT(out2, Ne(out0)); - - std::string out3; - index_->GetDebugInfo(/*verbosity=*/1, &out3); - EXPECT_THAT(out3, Ne(out1)); + IndexDebugInfoProto out2 = index_->GetDebugInfo(/*verbosity=*/0); + EXPECT_THAT(out2.lite_index_info().curr_size(), Eq(3)); + EXPECT_THAT(out2.lite_index_info().last_added_document_id(), + Eq(kDocumentId3)); // Merge into the main index. Debug info should change again. 
ICING_ASSERT_OK(index_->Merge()); - std::string out4; - index_->GetDebugInfo(/*verbosity=*/0, &out4); - EXPECT_THAT(out4, Ne(out0)); - EXPECT_THAT(out4, Ne(out2)); - - std::string out5; - index_->GetDebugInfo(/*verbosity=*/1, &out5); - EXPECT_THAT(out5, Ne(out1)); - EXPECT_THAT(out5, Ne(out3)); + IndexDebugInfoProto out3 = index_->GetDebugInfo(/*verbosity=*/0); + EXPECT_TRUE(out3.has_index_storage_info()); + EXPECT_THAT(out3.main_index_info().lexicon_info(), Not(IsEmpty())); + EXPECT_THAT(out3.main_index_info().last_added_document_id(), + Eq(kDocumentId3)); + EXPECT_THAT(out3.lite_index_info().curr_size(), Eq(0)); + EXPECT_THAT(out3.lite_index_info().hit_buffer_size(), Gt(0)); + EXPECT_THAT(out3.lite_index_info().last_added_document_id(), + Eq(kInvalidDocumentId)); + EXPECT_THAT(out3.lite_index_info().searchable_end(), Eq(0)); + EXPECT_THAT(out3.lite_index_info().index_crc(), Gt(0)); + EXPECT_THAT(out3.lite_index_info().lexicon_info(), Not(IsEmpty())); } TEST_F(IndexTest, BackfillingMultipleTermsSucceeds) { diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc index 43a846b..7c6d924 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc @@ -48,13 +48,13 @@ using ::testing::ElementsAreArray; using ::testing::Eq; using ::testing::IsEmpty; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; -constexpr TermMatchType_Code MATCH_EXACT = 
TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test { protected: diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc index 08df4fc..f215d63 100644 --- a/icing/index/lite/doc-hit-info-iterator-term-lite.cc +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc @@ -77,7 +77,8 @@ libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() { ICING_ASSIGN_OR_RETURN(uint32_t term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); lite_index_->AppendHits(term_id, section_restrict_mask_, - /*only_from_prefix_sections=*/false, &cached_hits_); + /*only_from_prefix_sections=*/false, + /*namespace_checker=*/nullptr, &cached_hits_); cached_hits_idx_ = 0; return libtextclassifier3::Status::OK; } @@ -100,7 +101,7 @@ DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() { term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE)); lite_index_->AppendHits(term_id, section_restrict_mask_, /*only_from_prefix_sections=*/!exact_match, - &cached_hits_); + /*namespace_checker=*/nullptr, &cached_hits_); ++terms_matched; } if (terms_matched > 1) { diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc index 9e4ac28..a5c6baf 100644 --- a/icing/index/lite/lite-index.cc +++ b/icing/index/lite/lite-index.cc @@ -336,9 +336,12 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId( int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, bool only_from_prefix_sections, + const NamespaceChecker* namespace_checker, std::vector<DocHitInfo>* hits_out) { int count = 0; DocumentId last_document_id = kInvalidDocumentId; + // Record whether the last document belongs to the given namespaces. 
+ bool last_document_in_namespace = false; for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) { TermIdHitPair term_id_hit_pair( hit_buffer_.array_cast<TermIdHitPair>()[idx]); @@ -355,22 +358,31 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, } DocumentId document_id = hit.document_id(); if (document_id != last_document_id) { + last_document_id = document_id; + last_document_in_namespace = + namespace_checker == nullptr || + namespace_checker->BelongsToTargetNamespaces(document_id); + if (!last_document_in_namespace) { + // The document is removed or expired or not belongs to target + // namespaces. + continue; + } ++count; if (hits_out != nullptr) { hits_out->push_back(DocHitInfo(document_id)); } - last_document_id = document_id; } - if (hits_out != nullptr) { + if (hits_out != nullptr && last_document_in_namespace) { hits_out->back().UpdateSection(hit.section_id(), hit.term_frequency()); } } return count; } -int LiteIndex::CountHits(uint32_t term_id) { +libtextclassifier3::StatusOr<int> LiteIndex::CountHits( + uint32_t term_id, const NamespaceChecker* namespace_checker) { return AppendHits(term_id, kSectionIdMaskAll, - /*only_from_prefix_sections=*/false, + /*only_from_prefix_sections=*/false, namespace_checker, /*hits_out=*/nullptr); } @@ -379,15 +391,16 @@ bool LiteIndex::is_full() const { lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction)); } -void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const { - absl_ports::StrAppend( - out, IcingStringUtil::StringPrintf("Lite Index\nHit buffer %u/%u\n", - header_->cur_size(), - options_.hit_buffer_size)); - - // Lexicon. 
- out->append("Lexicon stats:\n"); - lexicon_.GetDebugInfo(verbosity, out); +IndexDebugInfoProto::LiteIndexDebugInfoProto LiteIndex::GetDebugInfo( + int verbosity) { + IndexDebugInfoProto::LiteIndexDebugInfoProto res; + res.set_curr_size(header_->cur_size()); + res.set_hit_buffer_size(options_.hit_buffer_size); + res.set_last_added_document_id(header_->last_added_docid()); + res.set_searchable_end(header_->searchable_end()); + res.set_index_crc(ComputeChecksum().Get()); + lexicon_.GetDebugInfo(verbosity, res.mutable_lexicon_info()); + return res; } libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const { @@ -408,12 +421,8 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo( IndexStorageInfoProto storage_info) const { int64_t header_and_hit_buffer_file_size = filesystem_->GetFileSize(hit_buffer_fd_.get()); - if (header_and_hit_buffer_file_size != Filesystem::kBadFileSize) { - storage_info.set_lite_index_hit_buffer_size( - header_and_hit_buffer_file_size); - } else { - storage_info.set_lite_index_hit_buffer_size(-1); - } + storage_info.set_lite_index_hit_buffer_size( + IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size)); int64_t lexicon_disk_usage = lexicon_.GetElementsSize(); if (lexicon_disk_usage != Filesystem::kBadFileSize) { storage_info.set_lite_index_lexicon_size(lexicon_disk_usage); diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h index b134aba..378fc94 100644 --- a/icing/index/lite/lite-index.h +++ b/icing/index/lite/lite-index.h @@ -37,10 +37,12 @@ #include "icing/legacy/index/icing-lite-index-header.h" #include "icing/legacy/index/icing-lite-index-options.h" #include "icing/legacy/index/icing-mmapper.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/store/namespace-checker.h" #include "icing/store/namespace-id.h" #include "icing/util/bit-util.h" #include 
"icing/util/crc32.h" @@ -140,13 +142,19 @@ class LiteIndex { // skipping hits in non-prefix sections if only_from_prefix_sections is true, // to hits_out. If hits_out is nullptr, no hits will be added. // + // Only those hits which belongs to the given namespaces will be counted and + // appended. A nullptr namespace checker will disable this check. + // // Returns the number of hits that would be added to hits_out. int AppendHits(uint32_t term_id, SectionIdMask section_id_mask, bool only_from_prefix_sections, + const NamespaceChecker* namespace_checker, std::vector<DocHitInfo>* hits_out); // Returns the hit count of the term. - int CountHits(uint32_t term_id); + // Only those hits which belongs to the given namespaces will be counted. + libtextclassifier3::StatusOr<int> CountHits( + uint32_t term_id, const NamespaceChecker* namespace_checker); // Check if buffer has reached its capacity. bool is_full() const; @@ -234,7 +242,7 @@ class LiteIndex { // Returns debug information for the index in out. // verbosity <= 0, simplest debug information - size of lexicon, hit buffer // verbosity > 0, more detailed debug information from the lexicon. - void GetDebugInfo(int verbosity, std::string* out) const; + IndexDebugInfoProto::LiteIndexDebugInfoProto GetDebugInfo(int verbosity); // Returns the byte size of all the elements held in the index. This excludes // the size of any internal metadata of the index, e.g. the index's header. diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc new file mode 100644 index 0000000..825f830 --- /dev/null +++ b/icing/index/lite/lite-index_test.cc @@ -0,0 +1,110 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/lite/lite-index.h" + +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/index/term-id-codec.h" +#include "icing/legacy/index/icing-mock-filesystem.h" +#include "icing/schema/section.h" +#include "icing/store/namespace-checker.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::Eq; +using ::testing::IsEmpty; +using ::testing::SizeIs; + +class AlwaysFalseNamespaceCheckerImpl : public NamespaceChecker { + public: + bool BelongsToTargetNamespaces(DocumentId document_id) const override { + return false; + } +}; + +class LiteIndexTest : public testing::Test { + protected: + void SetUp() override { + index_dir_ = GetTestTempDir() + "/test_dir"; + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str())); + + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024); + ICING_ASSERT_OK_AND_ASSIGN(lite_index_, + LiteIndex::Create(options, &icing_filesystem_)); + + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + } + + void TearDown() override { + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str())); + } + + std::string index_dir_; + Filesystem filesystem_; + IcingFilesystem 
icing_filesystem_; + std::unique_ptr<LiteIndex> lite_index_; + std::unique_ptr<TermIdCodec> term_id_codec_; +}; + +constexpr NamespaceId kNamespace0 = 0; + +TEST_F(LiteIndexTest, LiteIndexAppendHits) { + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + Hit doc_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0)); + ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1)); + + std::vector<DocHitInfo> hits1; + lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + /*namespace_checker=*/nullptr, &hits1); + EXPECT_THAT(hits1, SizeIs(1)); + EXPECT_THAT(hits1.back().document_id(), Eq(0)); + // Check that the hits are coming from section 0 and section 1. + EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11)); + + std::vector<DocHitInfo> hits2; + AlwaysFalseNamespaceCheckerImpl always_false_namespace_checker; + lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + &always_false_namespace_checker, &hits2); + // Check that no hits are returned because they get skipped by the namespace + // checker. + EXPECT_THAT(hits2, IsEmpty()); +} + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h index 8d5b50b..6c6fbb8 100644 --- a/icing/index/main/flash-index-storage.h +++ b/icing/index/main/flash-index-storage.h @@ -159,6 +159,7 @@ class FlashIndexStorage { libtextclassifier3::Status Reset(); + // TODO(b/222349894) Convert the string output to a protocol buffer instead. 
void GetDebugInfo(int verbosity, std::string* out) const; private: diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc index b185138..2d6007b 100644 --- a/icing/index/main/main-index.cc +++ b/icing/index/main/main-index.cc @@ -133,18 +133,10 @@ libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const { IndexStorageInfoProto MainIndex::GetStorageInfo( IndexStorageInfoProto storage_info) const { - int64_t lexicon_elt_size = main_lexicon_->GetElementsSize(); - if (lexicon_elt_size != IcingFilesystem::kBadFileSize) { - storage_info.set_main_index_lexicon_size(lexicon_elt_size); - } else { - storage_info.set_main_index_lexicon_size(-1); - } - int64_t index_elt_size = flash_index_storage_->GetElementsSize(); - if (lexicon_elt_size != IcingFilesystem::kBadFileSize) { - storage_info.set_main_index_storage_size(index_elt_size); - } else { - storage_info.set_main_index_storage_size(-1); - } + storage_info.set_main_index_lexicon_size( + IcingFilesystem::SanitizeFileSize(main_lexicon_->GetElementsSize())); + storage_info.set_main_index_storage_size( + Filesystem::SanitizeFileSize(flash_index_storage_->GetElementsSize())); storage_info.set_main_index_block_size(flash_index_storage_->block_size()); storage_info.set_num_blocks(flash_index_storage_->num_blocks()); storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction()); @@ -186,7 +178,7 @@ MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) { if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) { // Found it, but it doesn't have prefix hits. Exit early. No need to // retrieve the posting list because there's nothing there for us. 
- return libtextclassifier3::Status::OK; + return absl_ports::NotFoundError("The term doesn't have any prefix hits."); } PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id)); @@ -217,35 +209,45 @@ bool IsTermInNamespaces( libtextclassifier3::StatusOr<std::vector<TermMetadata>> MainIndex::FindTermsByPrefix(const std::string& prefix, - const std::vector<NamespaceId>& namespace_ids) { + TermMatchType::Code term_match_type, + const NamespaceChecker* namespace_checker) { // Finds all the terms that start with the given prefix in the lexicon. IcingDynamicTrie::Iterator term_iterator(*main_lexicon_, prefix.c_str()); - // A property reader to help check if a term has some property. - IcingDynamicTrie::PropertyReadersAll property_reader(*main_lexicon_); - std::vector<TermMetadata> term_metadata_list; while (term_iterator.IsValid()) { - uint32_t term_value_index = term_iterator.GetValueIndex(); + int count = 0; + DocumentId last_document_id = kInvalidDocumentId; - // Skips the terms that don't exist in the given namespaces. We won't skip - // any terms if namespace_ids is empty. - if (!IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) { - term_iterator.Advance(); - continue; - } PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; memcpy(&posting_list_id, term_iterator.GetValue(), sizeof(posting_list_id)); - // Getting the actual hit count would require reading the entire posting - // list chain. We take an approximation to avoid all of those IO ops. - // Because we are not reading the posting lists, it is impossible to - // differentiate between single max-size posting lists and chains of - // max-size posting lists. We assume that the impact on scoring is not - // significant. 
- int approx_hit_count = IndexBlock::ApproximateFullPostingListHitsForBlock( - flash_index_storage_->block_size(), - posting_list_id.posting_list_index_bits()); - term_metadata_list.emplace_back(term_iterator.GetKey(), approx_hit_count); + ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor, + PostingListAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_id)); + ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits, + pl_accessor.GetNextHitsBatch()); + for (const Hit& hit : hits) { + DocumentId document_id = hit.document_id(); + if (document_id != last_document_id) { + last_document_id = document_id; + if (term_match_type == TermMatchType::EXACT_ONLY && + hit.is_prefix_hit()) { + continue; + } + if (!namespace_checker->BelongsToTargetNamespaces(document_id)) { + // The document is removed or expired or not belongs to target + // namespaces. + continue; + } + // TODO(b/152934343) Add search type in SuggestionSpec to ask user to + // input search type, prefix or exact. And make different score strategy + // base on that. + ++count; + } + } + if (count > 0) { + term_metadata_list.push_back(TermMetadata(term_iterator.GetKey(), count)); + } term_iterator.Advance(); } @@ -605,16 +607,22 @@ libtextclassifier3::Status MainIndex::AddPrefixBackfillHits( return libtextclassifier3::Status::OK; } -void MainIndex::GetDebugInfo(int verbosity, std::string* out) const { +IndexDebugInfoProto::MainIndexDebugInfoProto MainIndex::GetDebugInfo( + int verbosity) const { + IndexDebugInfoProto::MainIndexDebugInfoProto res; + // Lexicon. 
- out->append("Main Lexicon stats:\n"); - main_lexicon_->GetDebugInfo(verbosity, out); + main_lexicon_->GetDebugInfo(verbosity, res.mutable_lexicon_info()); + + res.set_last_added_document_id(last_added_document_id()); if (verbosity <= 0) { - return; + return res; } - flash_index_storage_->GetDebugInfo(verbosity, out); + flash_index_storage_->GetDebugInfo(verbosity, + res.mutable_flash_index_storage_info()); + return res; } } // namespace lib diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h index 919a5c5..abb0418 100644 --- a/icing/index/main/main-index.h +++ b/icing/index/main/main-index.h @@ -27,7 +27,9 @@ #include "icing/index/term-metadata.h" #include "icing/legacy/index/icing-dynamic-trie.h" #include "icing/legacy/index/icing-filesystem.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" +#include "icing/store/namespace-checker.h" #include "icing/store/namespace-id.h" #include "icing/util/status-macros.h" @@ -71,17 +73,17 @@ class MainIndex { // Finds terms with the given prefix in the given namespaces. If // 'namespace_ids' is empty, returns results from all the namespaces. The // input prefix must be normalized, otherwise inaccurate results may be - // returned. Results are not sorted specifically and are in lexigraphical - // order. Number of results are no more than 'num_to_return'. - // - // The hit count returned with each TermMetadata is an approximation based of - // posting list size. + // returned. If term_match_type is EXACT, only exact hit will be counted and + // it is PREFIX, both prefix and exact hits will be counted. Results are not + // sorted specifically and are in lexigraphical order. Number of results are + // no more than 'num_to_return'. // // Returns: // A list of TermMetadata on success // INTERNAL_ERROR if failed to access term data. 
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix( - const std::string& prefix, const std::vector<NamespaceId>& namespace_ids); + const std::string& prefix, TermMatchType::Code term_match_type, + const NamespaceChecker* namespace_checker); struct LexiconMergeOutputs { // Maps from main_lexicon tvi for new branching point to the main_lexicon @@ -184,7 +186,8 @@ class MainIndex { // verbosity <= 0, simplest debug information - just the lexicon // verbosity > 0, more detailed debug information including raw postings // lists. - void GetDebugInfo(int verbosity, std::string* out) const; + IndexDebugInfoProto::MainIndexDebugInfoProto GetDebugInfo( + int verbosity) const; private: libtextclassifier3::Status Init(const std::string& index_directory, diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc index 74139be..fa83d68 100644 --- a/icing/index/main/main-index_test.cc +++ b/icing/index/main/main-index_test.cc @@ -162,6 +162,34 @@ TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsValidAccessor) { EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"), IsOk()); } +TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsNotFound) { + // 1. Index one doc in the Lite Index: + // - Doc0 {"foot" is_in_prefix_section=false} + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); + + // 2. Create the main index. It should have no entries in its lexicon. 
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<MainIndex> main_index, + MainIndex::Create(main_index_file_name, &filesystem_, + &icing_filesystem_)); + + // 3. Merge the index. The main index should return NOT_FOUND when we search + // for the prefix "foo". + ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get())); + // GetAccessorForPrefixTerm should return NOT_FOUND for "foo". + EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + TEST_F(MainIndexTest, MainIndexGetAccessorForExactTermNotFound) { // Create the main index. It should have no entries in its lexicon. std::string main_index_file_name = index_dir_ + "/test_file.idx.index"; |