diff options
author | Cassie Wang <cassiewang@google.com> | 2021-02-26 08:04:01 -0800 |
---|---|---|
committer | Cassie Wang <cassiewang@google.com> | 2021-03-02 15:29:44 -0800 |
commit | 85fd8c8521e338d2bab69f5482e3cc2cf312fd4e (patch) | |
tree | 929e118124b203997be393e4a1c5f5ee6da2de40 /icing/index | |
parent | a34db390d80f862bfaaa49dea3605c5fec3bca3d (diff) | |
download | icing-85fd8c8521e338d2bab69f5482e3cc2cf312fd4e.tar.gz |
Sync from upstream.
Descriptions:
==========
Add last optimized time to GetOptimizeInfo.
==========
Update the implementation of snippeting to return property paths with value indices and remove the values_index field.
==========
Create builders for SchemaProto, SchemaTypeConfigProto and PropertyConfigProto.
==========
Rename some protos according to the following rules:
- Remove "Native" prefix
- Add "Proto" suffix for consistency with other protos
==========
Upgrade the minimum iOS version to 11.4.
==========
Fix PersistToDisk definitions to ensure that they properly call datasync. This change is meant to address the first part of the ptd doc: certain functions that claim to persist data don't actually flush explicitly.
==========
Change function call from has_field() to field()
==========
Add IcingStorageInfo.
==========
Add IndexStorageStats.
==========
Add SchemaStoreStorageStats.
==========
Add DocumentStoreStorageStats.
==========
Implement OptimizeStats.
==========
Remove the max number of results per query limit (1000) and replace it with a more flexible way to limit memory use by the result-state-manager.
==========
Add a test case to ensure we don't add UsageStore's checksum in DocumentStore's ComputeChecksum.
==========
Account for UsageStore in GetDiskUsage.
==========
Ensure that SchemaStore properly handles function calls when the schema isn't set.
==========
Remove jlpl_strict_deps feature from package declarations.
==========
Qualify std::string in 3p directories.
==========
Section restricts should influence the relevance score.
==========
Apply fixes upstream that were necessary to sync changes downstream. Also add a METADATA check to prevent any accidental additions of foo.proto.h includes.
==========
Remove the 'com.google.protobuf' to 'com.google.android.icing.protobuf' translation in the export_to_aosp script.
==========
Include usage store size in GetOptimizeInfo. This helps clients get a better idea of what savings they could get back if they called Optimize.
Change-Id: Ia2339c7987267a73c49dadf1ced4a0a8ef001d4c
Diffstat (limited to 'icing/index')
19 files changed, 262 insertions, 66 deletions
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc index d2f9d41..09dda41 100644 --- a/icing/index/index-processor.cc +++ b/icing/index/index-processor.cc @@ -55,7 +55,7 @@ IndexProcessor::Create(const Normalizer* normalizer, Index* index, libtextclassifier3::Status IndexProcessor::IndexDocument( const TokenizedDocument& tokenized_document, DocumentId document_id, - NativePutDocumentStats* put_document_stats) { + PutDocumentStatsProto* put_document_stats) { std::unique_ptr<Timer> index_timer = clock_.GetNewTimer(); if (index_->last_added_document_id() != kInvalidDocumentId && diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h index 9fc7c46..6b07c98 100644 --- a/icing/index/index-processor.h +++ b/icing/index/index-processor.h @@ -81,7 +81,7 @@ class IndexProcessor { // INTERNAL_ERROR if any other errors occur libtextclassifier3::Status IndexDocument( const TokenizedDocument& tokenized_document, DocumentId document_id, - NativePutDocumentStats* put_document_stats = nullptr); + PutDocumentStatsProto* put_document_stats = nullptr); private: IndexProcessor(const Normalizer* normalizer, Index* index, diff --git a/icing/index/index.cc b/icing/index/index.cc index bd41b51..db59ad2 100644 --- a/icing/index/index.cc +++ b/icing/index/index.cc @@ -164,7 +164,7 @@ libtextclassifier3::StatusOr<std::unique_ptr<Index>> Index::Create( icing_filesystem)); return std::unique_ptr<Index>(new Index(options, std::move(term_id_codec), std::move(lite_index), - std::move(main_index))); + std::move(main_index), filesystem)); } libtextclassifier3::Status Index::TruncateTo(DocumentId document_id) { @@ -277,6 +277,18 @@ Index::FindTermsByPrefix(const std::string& prefix, std::move(main_term_metadata_list), num_to_return); } +IndexStorageInfoProto Index::GetStorageInfo() const { + IndexStorageInfoProto storage_info; + int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str()); + if (directory_size != Filesystem::kBadFileSize) 
{ + storage_info.set_index_size(directory_size); + } else { + storage_info.set_index_size(-1); + } + storage_info = lite_index_->GetStorageInfo(std::move(storage_info)); + return main_index_->GetStorageInfo(std::move(storage_info)); +} + libtextclassifier3::Status Index::Editor::BufferTerm(const char* term) { // Step 1: See if this term is already in the lexicon uint32_t tvi; diff --git a/icing/index/index.h b/icing/index/index.h index a4ea719..b7021ca 100644 --- a/icing/index/index.h +++ b/icing/index/index.h @@ -32,6 +32,7 @@ #include "icing/index/term-id-codec.h" #include "icing/index/term-metadata.h" #include "icing/legacy/index/icing-filesystem.h" +#include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" @@ -151,6 +152,12 @@ class Index { return lite_index_size + main_index_size; } + // Calculates the StorageInfo for the Index. + // + // If an IO error occurs while trying to calculate the value for a field, then + // that field will be set to -1. + IndexStorageInfoProto GetStorageInfo() const; + // Create an iterator to iterate through all doc hit infos in the index that // match the term. section_id_mask can be set to ignore hits from sections not // listed in the mask. Eg. 
section_id_mask = 1U << 3; would only return hits @@ -242,11 +249,12 @@ class Index { private: Index(const Options& options, std::unique_ptr<TermIdCodec> term_id_codec, std::unique_ptr<LiteIndex> lite_index, - std::unique_ptr<MainIndex> main_index) + std::unique_ptr<MainIndex> main_index, const Filesystem* filesystem) : lite_index_(std::move(lite_index)), main_index_(std::move(main_index)), options_(options), - term_id_codec_(std::move(term_id_codec)) {} + term_id_codec_(std::move(term_id_codec)), + filesystem_(filesystem) {} libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindLiteTermsByPrefix( const std::string& prefix, const std::vector<NamespaceId>& namespace_ids, @@ -256,6 +264,7 @@ class Index { std::unique_ptr<MainIndex> main_index_; const Options options_; std::unique_ptr<TermIdCodec> term_id_codec_; + const Filesystem* filesystem_; }; } // namespace lib diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc index 3479ab1..de4edf8 100644 --- a/icing/index/index_test.cc +++ b/icing/index/index_test.cc @@ -31,6 +31,7 @@ #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mock-filesystem.h" +#include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" @@ -46,6 +47,7 @@ namespace { using ::testing::ElementsAre; using ::testing::Eq; +using ::testing::Ge; using ::testing::Gt; using ::testing::IsEmpty; using ::testing::IsTrue; @@ -1636,6 +1638,33 @@ TEST_F(IndexTest, TruncateToThrowsOutBothIndices) { EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId)); } +TEST_F(IndexTest, IndexStorageInfoProto) { + // Add two documents to the lite index and merge them into main. 
+ { + Index::Editor edit = index_->Edit( + kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::PREFIX, + /*namespace_id=*/0); + ASSERT_THAT(edit.BufferTerm("foul"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + + ICING_ASSERT_OK(index_->Merge()); + } + + IndexStorageInfoProto storage_info = index_->GetStorageInfo(); + EXPECT_THAT(storage_info.index_size(), Ge(0)); + EXPECT_THAT(storage_info.lite_index_lexicon_size(), Ge(0)); + EXPECT_THAT(storage_info.lite_index_hit_buffer_size(), Ge(0)); + EXPECT_THAT(storage_info.main_index_lexicon_size(), Ge(0)); + EXPECT_THAT(storage_info.main_index_storage_size(), Ge(0)); + EXPECT_THAT(storage_info.main_index_block_size(), Ge(0)); + // There should be 1 block for the header and 1 block for two posting lists. + EXPECT_THAT(storage_info.num_blocks(), Eq(2)); + EXPECT_THAT(storage_info.min_free_fraction(), Ge(0)); +} + } // namespace } // namespace lib diff --git a/icing/index/iterator/doc-hit-info-iterator-and.h b/icing/index/iterator/doc-hit-info-iterator-and.h index faca785..8ceff44 100644 --- a/icing/index/iterator/doc-hit-info-iterator-and.h +++ b/icing/index/iterator/doc-hit-info-iterator-and.h @@ -47,13 +47,16 @@ class DocHitInfoIteratorAnd : public DocHitInfoIterator { std::string ToString() const override; void PopulateMatchedTermsStats( - std::vector<TermMatchInfo> *matched_terms_stats) const override { + std::vector<TermMatchInfo> *matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. 
return; } - short_->PopulateMatchedTermsStats(matched_terms_stats); - long_->PopulateMatchedTermsStats(matched_terms_stats); + short_->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); + long_->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); } private: @@ -78,13 +81,15 @@ class DocHitInfoIteratorAndNary : public DocHitInfoIterator { std::string ToString() const override; void PopulateMatchedTermsStats( - std::vector<TermMatchInfo> *matched_terms_stats) const override { + std::vector<TermMatchInfo> *matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. return; } for (size_t i = 0; i < iterators_.size(); ++i) { - iterators_.at(i)->PopulateMatchedTermsStats(matched_terms_stats); + iterators_.at(i)->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); } } diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.h b/icing/index/iterator/doc-hit-info-iterator-filter.h index fb60e38..9cee74c 100644 --- a/icing/index/iterator/doc-hit-info-iterator-filter.h +++ b/icing/index/iterator/doc-hit-info-iterator-filter.h @@ -68,8 +68,10 @@ class DocHitInfoIteratorFilter : public DocHitInfoIterator { std::string ToString() const override; void PopulateMatchedTermsStats( - std::vector<TermMatchInfo>* matched_terms_stats) const override { - delegate_->PopulateMatchedTermsStats(matched_terms_stats); + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { + delegate_->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); } private: diff --git a/icing/index/iterator/doc-hit-info-iterator-or.h b/icing/index/iterator/doc-hit-info-iterator-or.h index 2f49430..2dae68d 100644 --- a/icing/index/iterator/doc-hit-info-iterator-or.h +++ b/icing/index/iterator/doc-hit-info-iterator-or.h @@ -43,15 
+43,18 @@ class DocHitInfoIteratorOr : public DocHitInfoIterator { std::string ToString() const override; void PopulateMatchedTermsStats( - std::vector<TermMatchInfo> *matched_terms_stats) const override { + std::vector<TermMatchInfo> *matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. return; } - current_->PopulateMatchedTermsStats(matched_terms_stats); + current_->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); // If equal, then current_ == left_. Combine with results from right_. if (left_document_id_ == right_document_id_) { - right_->PopulateMatchedTermsStats(matched_terms_stats); + right_->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); } } @@ -83,13 +86,15 @@ class DocHitInfoIteratorOrNary : public DocHitInfoIterator { std::string ToString() const override; void PopulateMatchedTermsStats( - std::vector<TermMatchInfo> *matched_terms_stats) const override { + std::vector<TermMatchInfo> *matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. 
return; } for (size_t i = 0; i < current_iterators_.size(); i++) { - current_iterators_.at(i)->PopulateMatchedTermsStats(matched_terms_stats); + current_iterators_.at(i)->PopulateMatchedTermsStats( + matched_terms_stats, filtering_section_mask); } } diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc index 8acb91a..e6ee8e3 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc @@ -84,8 +84,7 @@ libtextclassifier3::Status DocHitInfoIteratorSectionRestrict::Advance() { if (section_metadata->path == target_section_) { // The hit was in the target section name, return OK/found doc_hit_info_ = delegate_->doc_hit_info(); - hit_intersect_section_ids_mask_ = - delegate_->hit_intersect_section_ids_mask(); + hit_intersect_section_ids_mask_ = 1u << section_id; return libtextclassifier3::Status::OK; } } diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h index ba74384..52b243a 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h @@ -52,13 +52,21 @@ class DocHitInfoIteratorSectionRestrict : public DocHitInfoIterator { std::string ToString() const override; - // NOTE: currently, section restricts does decide which documents to - // return, but doesn't impact the relevance score of a document. - // TODO(b/173156803): decide whether we want to filter the matched_terms_stats - // for the restricted sections. + // Note that the DocHitInfoIteratorSectionRestrict is the only iterator that + // should set filtering_section_mask, hence the received + // filtering_section_mask is ignored and the filtering_section_mask passed to + // the delegate will be set to hit_intersect_section_ids_mask_. 
This will + // allow to filter the matching sections in the delegate. void PopulateMatchedTermsStats( - std::vector<TermMatchInfo>* matched_terms_stats) const override { - delegate_->PopulateMatchedTermsStats(matched_terms_stats); + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { + if (doc_hit_info_.document_id() == kInvalidDocumentId) { + // Current hit isn't valid, return. + return; + } + delegate_->PopulateMatchedTermsStats( + matched_terms_stats, + /*filtering_section_mask=*/hit_intersect_section_ids_mask_); } private: diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc index 91e0cbe..21b3f8f 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc @@ -43,6 +43,7 @@ namespace lib { namespace { using ::testing::ElementsAre; +using ::testing::ElementsAreArray; using ::testing::Eq; using ::testing::IsEmpty; @@ -101,6 +102,57 @@ class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test { FakeClock fake_clock_; }; +TEST_F(DocHitInfoIteratorSectionRestrictTest, + PopulateMatchedTermsStats_IncludesHitWithMatchingSection) { + // Populate the DocumentStore's FilterCache with this document's data + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, + document_store_->Put(document_)); + + // Arbitrary section ids for the documents in the DocHitInfoIterators. + // Created to test correct section_id_mask behavior. 
+ SectionIdMask original_section_id_mask = 0b00000101; // hits in sections 0, 2 + + DocHitInfo doc_hit_info1 = DocHitInfo(document_id); + doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1); + doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2); + + // Create a hit that was found in the indexed section + std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1}; + + auto original_iterator = + std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "hi"); + original_iterator->set_hit_intersect_section_ids_mask( + original_section_id_mask); + + // Filtering for the indexed section name (which has a section id of 0) should + // get a result. + DocHitInfoIteratorSectionRestrict section_restrict_iterator( + std::move(original_iterator), document_store_.get(), schema_store_.get(), + /*target_section=*/indexed_property_); + + std::vector<TermMatchInfo> matched_terms_stats; + section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); + + ICING_EXPECT_OK(section_restrict_iterator.Advance()); + EXPECT_THAT(section_restrict_iterator.doc_hit_info().document_id(), + Eq(document_id)); + SectionIdMask expected_section_id_mask = 0b00000001; // hits in sections 0 + EXPECT_EQ(section_restrict_iterator.hit_intersect_section_ids_mask(), + expected_section_id_mask); + + section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_EQ(matched_terms_stats.at(0).term, "hi"); + std::array<Hit::TermFrequency, kMaxSectionId> expected_term_frequencies{ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + EXPECT_THAT(matched_terms_stats.at(0).term_frequencies, + ElementsAreArray(expected_term_frequencies)); + EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, + expected_section_id_mask); + + EXPECT_FALSE(section_restrict_iterator.Advance().ok()); +} + TEST_F(DocHitInfoIteratorSectionRestrictTest, EmptyOriginalIterator) { std::unique_ptr<DocHitInfoIterator> 
original_iterator_empty = std::make_unique<DocHitInfoIteratorDummy>(); @@ -110,6 +162,9 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, EmptyOriginalIterator) { schema_store_.get(), /*target_section=*/""); EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty()); + std::vector<TermMatchInfo> matched_terms_stats; + filtered_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); } TEST_F(DocHitInfoIteratorSectionRestrictTest, IncludesHitWithMatchingSection) { @@ -148,6 +203,9 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, NoMatchingDocumentFilterData) { /*target_section=*/""); EXPECT_THAT(GetDocumentIds(&section_restrict_iterator), IsEmpty()); + std::vector<TermMatchInfo> matched_terms_stats; + section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); } TEST_F(DocHitInfoIteratorSectionRestrictTest, @@ -171,6 +229,9 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, "some_section_name"); EXPECT_THAT(GetDocumentIds(&section_restrict_iterator), IsEmpty()); + std::vector<TermMatchInfo> matched_terms_stats; + section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); } TEST_F(DocHitInfoIteratorSectionRestrictTest, @@ -192,6 +253,9 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, indexed_property_); EXPECT_THAT(GetDocumentIds(&section_restrict_iterator), IsEmpty()); + std::vector<TermMatchInfo> matched_terms_stats; + section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); } TEST_F(DocHitInfoIteratorSectionRestrictTest, @@ -216,6 +280,9 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, indexed_property_); EXPECT_THAT(GetDocumentIds(&section_restrict_iterator), IsEmpty()); + std::vector<TermMatchInfo> matched_terms_stats; + section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); } 
TEST_F(DocHitInfoIteratorSectionRestrictTest, GetNumBlocksInspected) { diff --git a/icing/index/iterator/doc-hit-info-iterator-test-util.h b/icing/index/iterator/doc-hit-info-iterator-test-util.h index 913696a..45acc8f 100644 --- a/icing/index/iterator/doc-hit-info-iterator-test-util.h +++ b/icing/index/iterator/doc-hit-info-iterator-test-util.h @@ -56,23 +56,25 @@ class DocHitInfoIteratorDummy : public DocHitInfoIterator { // Imitates behavior of DocHitInfoIteratorTermMain/DocHitInfoIteratorTermLite void PopulateMatchedTermsStats( - std::vector<TermMatchInfo>* matched_terms_stats) const override { + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. return; } - SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask(); + SectionIdMask section_mask = + doc_hit_info_.hit_section_ids_mask() & filtering_section_mask; + SectionIdMask section_mask_copy = section_mask; std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = { Hit::kNoTermFrequency}; - - while (section_mask) { - SectionId section_id = __builtin_ctz(section_mask); + while (section_mask_copy) { + SectionId section_id = __builtin_ctz(section_mask_copy); section_term_frequencies.at(section_id) = doc_hit_info_.hit_term_frequency(section_id); - section_mask &= ~(1u << section_id); + section_mask_copy &= ~(1u << section_id); } - TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(), - section_term_frequencies); + TermMatchInfo term_stats(term_, section_mask, + std::move(section_term_frequencies)); for (auto& cur_term_stats : *matched_terms_stats) { if (cur_term_stats.term == term_stats.term) { diff --git a/icing/index/iterator/doc-hit-info-iterator.h b/icing/index/iterator/doc-hit-info-iterator.h index c4d9901..afb298b 100644 --- a/icing/index/iterator/doc-hit-info-iterator.h +++ 
b/icing/index/iterator/doc-hit-info-iterator.h @@ -15,6 +15,7 @@ #ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_ #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_ +#include <array> #include <cstdint> #include <string> #include <string_view> @@ -93,11 +94,14 @@ class DocHitInfoIterator { // For the last hit docid, retrieves all the matched query terms and other // stats, see TermMatchInfo. + // filtering_section_mask filters the matching sections and should be set only + // by DocHitInfoIteratorSectionRestrict. // If Advance() wasn't called after construction, Advance() returned false or // the concrete HitIterator didn't override this method, the vectors aren't // populated. virtual void PopulateMatchedTermsStats( - std::vector<TermMatchInfo>* matched_terms_stats) const {} + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const {} protected: DocHitInfo doc_hit_info_; diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h index ac5e97f..8dbe043 100644 --- a/icing/index/lite/doc-hit-info-iterator-term-lite.h +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h @@ -50,21 +50,24 @@ class DocHitInfoIteratorTermLite : public DocHitInfoIterator { int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; } void PopulateMatchedTermsStats( - std::vector<TermMatchInfo>* matched_terms_stats) const override { + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. 
return; } - SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask(); + SectionIdMask section_mask = + doc_hit_info_.hit_section_ids_mask() & filtering_section_mask; + SectionIdMask section_mask_copy = section_mask; std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = { Hit::kNoTermFrequency}; - while (section_mask) { - SectionId section_id = __builtin_ctz(section_mask); + while (section_mask_copy) { + SectionId section_id = __builtin_ctz(section_mask_copy); section_term_frequencies.at(section_id) = doc_hit_info_.hit_term_frequency(section_id); - section_mask &= ~(1u << section_id); + section_mask_copy &= ~(1u << section_id); } - TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(), + TermMatchInfo term_stats(term_, section_mask, std::move(section_term_frequencies)); for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) { diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc index e0379b8..69138e1 100644 --- a/icing/index/lite/lite-index.cc +++ b/icing/index/lite/lite-index.cc @@ -394,26 +394,36 @@ void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const { } libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const { - int64_t header_and_hit_buffer_file_size = - filesystem_->GetFileSize(hit_buffer_fd_.get()); - - if (header_and_hit_buffer_file_size == Filesystem::kBadFileSize) { - return absl_ports::InternalError( - "Failed to get element size of the LiteIndex's header and hit buffer"); + IndexStorageInfoProto storage_info = GetStorageInfo(IndexStorageInfoProto()); + if (storage_info.lite_index_hit_buffer_size() == -1 || + storage_info.lite_index_lexicon_size() == -1) { + return absl_ports::AbortedError( + "Failed to get size of LiteIndex's members."); } - - int64_t lexicon_disk_usage = lexicon_.GetElementsSize(); - if (lexicon_disk_usage == IcingFilesystem::kBadFileSize) { - return absl_ports::InternalError( - "Failed to get element size of LiteIndex's 
lexicon"); - } - // On initialization, we grow the file to a padded size first. So this size // won't count towards the size taken up by elements size_t header_padded_size = IcingMMapper::page_aligned_size(header_size()); + return storage_info.lite_index_hit_buffer_size() - header_padded_size + + storage_info.lite_index_lexicon_size(); +} - return header_and_hit_buffer_file_size - header_padded_size + - lexicon_disk_usage; +IndexStorageInfoProto LiteIndex::GetStorageInfo( + IndexStorageInfoProto storage_info) const { + int64_t header_and_hit_buffer_file_size = + filesystem_->GetFileSize(hit_buffer_fd_.get()); + if (header_and_hit_buffer_file_size != Filesystem::kBadFileSize) { + storage_info.set_lite_index_hit_buffer_size( + header_and_hit_buffer_file_size); + } else { + storage_info.set_lite_index_hit_buffer_size(-1); + } + int64_t lexicon_disk_usage = lexicon_.GetElementsSize(); + if (lexicon_disk_usage != Filesystem::kBadFileSize) { + storage_info.set_lite_index_lexicon_size(lexicon_disk_usage); + } else { + storage_info.set_lite_index_lexicon_size(-1); + } + return storage_info; } uint32_t LiteIndex::Seek(uint32_t term_id) { diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h index 7b51aa4..90c6fbc 100644 --- a/icing/index/lite/lite-index.h +++ b/icing/index/lite/lite-index.h @@ -37,6 +37,7 @@ #include "icing/legacy/index/icing-lite-index-header.h" #include "icing/legacy/index/icing-lite-index-options.h" #include "icing/legacy/index/icing-mmapper.h" +#include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" @@ -240,6 +241,14 @@ class LiteIndex { // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<int64_t> GetElementsSize() const; + // Takes the provided storage_info, populates the fields related to the lite + // index and returns that storage_info. 
+ // + // If an IO error occurs while trying to calculate the value for a field, then + // that field will be set to -1. + IndexStorageInfoProto GetStorageInfo( + IndexStorageInfoProto storage_info) const; + private: static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions(); diff --git a/icing/index/main/doc-hit-info-iterator-term-main.h b/icing/index/main/doc-hit-info-iterator-term-main.h index d626d7a..f3cf701 100644 --- a/icing/index/main/doc-hit-info-iterator-term-main.h +++ b/icing/index/main/doc-hit-info-iterator-term-main.h @@ -50,21 +50,24 @@ class DocHitInfoIteratorTermMain : public DocHitInfoIterator { int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; } void PopulateMatchedTermsStats( - std::vector<TermMatchInfo>* matched_terms_stats) const override { + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. 
return; } - SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask(); + SectionIdMask section_mask = + doc_hit_info_.hit_section_ids_mask() & filtering_section_mask; + SectionIdMask section_mask_copy = section_mask; std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = { Hit::kNoTermFrequency}; - while (section_mask) { - SectionId section_id = __builtin_ctz(section_mask); + while (section_mask_copy) { + SectionId section_id = __builtin_ctz(section_mask_copy); section_term_frequencies.at(section_id) = doc_hit_info_.hit_term_frequency(section_id); - section_mask &= ~(1u << section_id); + section_mask_copy &= ~(1u << section_id); } - TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(), + TermMatchInfo term_stats(term_, section_mask, std::move(section_term_frequencies)); for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) { diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc index 636f631..8ae6b27 100644 --- a/icing/index/main/main-index.cc +++ b/icing/index/main/main-index.cc @@ -121,14 +121,34 @@ libtextclassifier3::Status MainIndex::Init( } libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const { + IndexStorageInfoProto storage_info = GetStorageInfo(IndexStorageInfoProto()); + if (storage_info.main_index_storage_size() == -1 || + storage_info.main_index_lexicon_size() == -1) { + return absl_ports::AbortedError( + "Failed to get size of MainIndex's members."); + } + return storage_info.main_index_storage_size() + + storage_info.main_index_lexicon_size(); +} + +IndexStorageInfoProto MainIndex::GetStorageInfo( + IndexStorageInfoProto storage_info) const { int64_t lexicon_elt_size = main_lexicon_->GetElementsSize(); + if (lexicon_elt_size != IcingFilesystem::kBadFileSize) { + storage_info.set_main_index_lexicon_size(lexicon_elt_size); + } else { + storage_info.set_main_index_lexicon_size(-1); + } int64_t index_elt_size = flash_index_storage_->GetElementsSize(); - if 
(lexicon_elt_size == IcingFilesystem::kBadFileSize || - index_elt_size == IcingFilesystem::kBadFileSize) { - return absl_ports::InternalError( - "Failed to get element size of LiteIndex's lexicon"); + if (lexicon_elt_size != IcingFilesystem::kBadFileSize) { + storage_info.set_main_index_storage_size(index_elt_size); + } else { + storage_info.set_main_index_storage_size(-1); } - return lexicon_elt_size + index_elt_size; + storage_info.set_main_index_block_size(flash_index_storage_->block_size()); + storage_info.set_num_blocks(flash_index_storage_->num_blocks()); + storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction()); + return storage_info; } libtextclassifier3::StatusOr<std::unique_ptr<PostingListAccessor>> diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h index 7403b8c..43635ca 100644 --- a/icing/index/main/main-index.h +++ b/icing/index/main/main-index.h @@ -27,6 +27,7 @@ #include "icing/index/term-metadata.h" #include "icing/legacy/index/icing-dynamic-trie.h" #include "icing/legacy/index/icing-filesystem.h" +#include "icing/proto/storage.pb.h" #include "icing/store/namespace-id.h" #include "icing/util/status-macros.h" @@ -172,6 +173,14 @@ class MainIndex { // - INTERNAL on IO error libtextclassifier3::StatusOr<int64_t> GetElementsSize() const; + // Takes the provided storage_info, populates the fields related to the main + // index and returns that storage_info. + // + // If an IO error occurs while trying to calculate the value for a field, then + // that field will be set to -1. + IndexStorageInfoProto GetStorageInfo( + IndexStorageInfoProto storage_info) const; + // Returns debug information for the main index in out. // verbosity <= 0, simplest debug information - just the lexicon // verbosity > 0, more detailed debug information including raw postings |