diff options
author | Tim Barron <tjbarron@google.com> | 2022-04-12 14:30:14 -0700 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2022-04-12 14:36:38 -0700 |
commit | d5c9ae94052a0f2f1b9ddec9dbbe502bc4f11d54 (patch) | |
tree | 90b929dc92d5874b5c15caca064401196ab4fc65 /icing/index/lite | |
parent | beff93fe1f5165aeeb871d9711963aa1846299ae (diff) | |
download | icing-d5c9ae94052a0f2f1b9ddec9dbbe502bc4f11d54.tar.gz |
Sync from upstream.
======================================================================
Refactor DocumentStore::Initialize to improve readability of document store recovery.
======================================================================
Remove non-NDK API usages of ICU4C in libicing.
======================================================================
Move IcuDataFileHelper to the testing directory since it is a test-only util.
======================================================================
Support dump function for DocumentStore
======================================================================
Switch to use PRead rather than MMap in the proto log.
======================================================================
Support dump function for main/lite index and lexicon
======================================================================
Fix LiteIndex::AppendHits
======================================================================
Enable and fix DocumentStoreTest.LoadScoreCacheAndInitializeSuccessfully
======================================================================
Fix MainIndex::GetStorageInfo.
======================================================================
Fix icing-search-engine_fuzz_test by making IcuLanguageSegmenterIterator::Advance non-recursive.
======================================================================
Allow returning additional information for deleted documents in DeleteByQuery
======================================================================
Use enum class in Token::Type for better type safety.
======================================================================
Bug: 158089703
Bug: 185845269
Bug: 209071710
Bug: 211785521
Bug: 218413237
Bug: 223549255
Change-Id: Id2786047ab279734bdd2aee883e82607b6a0e403
Diffstat (limited to 'icing/index/lite')
-rw-r--r-- | icing/index/lite/lite-index.cc | 38 | ||||
-rw-r--r-- | icing/index/lite/lite-index.h | 3 | ||||
-rw-r--r-- | icing/index/lite/lite-index_test.cc | 110 |
3 files changed, 131 insertions, 20 deletions
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc index e7a8cb3..a5c6baf 100644 --- a/icing/index/lite/lite-index.cc +++ b/icing/index/lite/lite-index.cc @@ -340,6 +340,8 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, std::vector<DocHitInfo>* hits_out) { int count = 0; DocumentId last_document_id = kInvalidDocumentId; + // Record whether the last document belongs to the given namespaces. + bool last_document_in_namespace = false; for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) { TermIdHitPair term_id_hit_pair( hit_buffer_.array_cast<TermIdHitPair>()[idx]); @@ -357,9 +359,10 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, DocumentId document_id = hit.document_id(); if (document_id != last_document_id) { last_document_id = document_id; - // Check does current document belongs to the given namespaces. - if (namespace_checker != nullptr && - !namespace_checker->BelongsToTargetNamespaces(document_id)) { + last_document_in_namespace = + namespace_checker == nullptr || + namespace_checker->BelongsToTargetNamespaces(document_id); + if (!last_document_in_namespace) { // The document is removed or expired or not belongs to target // namespaces. continue; @@ -369,7 +372,7 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, hits_out->push_back(DocHitInfo(document_id)); } } - if (hits_out != nullptr) { + if (hits_out != nullptr && last_document_in_namespace) { hits_out->back().UpdateSection(hit.section_id(), hit.term_frequency()); } } @@ -388,15 +391,16 @@ bool LiteIndex::is_full() const { lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction)); } -void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const { - absl_ports::StrAppend( - out, IcingStringUtil::StringPrintf("Lite Index\nHit buffer %u/%u\n", - header_->cur_size(), - options_.hit_buffer_size)); - - // Lexicon. 
- out->append("Lexicon stats:\n"); - lexicon_.GetDebugInfo(verbosity, out); +IndexDebugInfoProto::LiteIndexDebugInfoProto LiteIndex::GetDebugInfo( + int verbosity) { + IndexDebugInfoProto::LiteIndexDebugInfoProto res; + res.set_curr_size(header_->cur_size()); + res.set_hit_buffer_size(options_.hit_buffer_size); + res.set_last_added_document_id(header_->last_added_docid()); + res.set_searchable_end(header_->searchable_end()); + res.set_index_crc(ComputeChecksum().Get()); + lexicon_.GetDebugInfo(verbosity, res.mutable_lexicon_info()); + return res; } libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const { @@ -417,12 +421,8 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo( IndexStorageInfoProto storage_info) const { int64_t header_and_hit_buffer_file_size = filesystem_->GetFileSize(hit_buffer_fd_.get()); - if (header_and_hit_buffer_file_size != Filesystem::kBadFileSize) { - storage_info.set_lite_index_hit_buffer_size( - header_and_hit_buffer_file_size); - } else { - storage_info.set_lite_index_hit_buffer_size(-1); - } + storage_info.set_lite_index_hit_buffer_size( + IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size)); int64_t lexicon_disk_usage = lexicon_.GetElementsSize(); if (lexicon_disk_usage != Filesystem::kBadFileSize) { storage_info.set_lite_index_lexicon_size(lexicon_disk_usage); diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h index 890980c..378fc94 100644 --- a/icing/index/lite/lite-index.h +++ b/icing/index/lite/lite-index.h @@ -37,6 +37,7 @@ #include "icing/legacy/index/icing-lite-index-header.h" #include "icing/legacy/index/icing-lite-index-options.h" #include "icing/legacy/index/icing-mmapper.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" @@ -241,7 +242,7 @@ class LiteIndex { // Returns debug information for the index in out. 
// verbosity <= 0, simplest debug information - size of lexicon, hit buffer // verbosity > 0, more detailed debug information from the lexicon. - void GetDebugInfo(int verbosity, std::string* out) const; + IndexDebugInfoProto::LiteIndexDebugInfoProto GetDebugInfo(int verbosity); // Returns the byte size of all the elements held in the index. This excludes // the size of any internal metadata of the index, e.g. the index's header. diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc new file mode 100644 index 0000000..825f830 --- /dev/null +++ b/icing/index/lite/lite-index_test.cc @@ -0,0 +1,110 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/index/lite/lite-index.h" + +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/index/term-id-codec.h" +#include "icing/legacy/index/icing-mock-filesystem.h" +#include "icing/schema/section.h" +#include "icing/store/namespace-checker.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::Eq; +using ::testing::IsEmpty; +using ::testing::SizeIs; + +class AlwaysFalseNamespaceCheckerImpl : public NamespaceChecker { + public: + bool BelongsToTargetNamespaces(DocumentId document_id) const override { + return false; + } +}; + +class LiteIndexTest : public testing::Test { + protected: + void SetUp() override { + index_dir_ = GetTestTempDir() + "/test_dir"; + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str())); + + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024); + ICING_ASSERT_OK_AND_ASSIGN(lite_index_, + LiteIndex::Create(options, &icing_filesystem_)); + + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + } + + void TearDown() override { + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str())); + } + + std::string index_dir_; + Filesystem filesystem_; + IcingFilesystem icing_filesystem_; + std::unique_ptr<LiteIndex> lite_index_; + std::unique_ptr<TermIdCodec> term_id_codec_; +}; + +constexpr NamespaceId kNamespace0 = 0; + +TEST_F(LiteIndexTest, LiteIndexAppendHits) { + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + Hit 
doc_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0)); + ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1)); + + std::vector<DocHitInfo> hits1; + lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + /*namespace_checker=*/nullptr, &hits1); + EXPECT_THAT(hits1, SizeIs(1)); + EXPECT_THAT(hits1.back().document_id(), Eq(0)); + // Check that the hits are coming from section 0 and section 1. + EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11)); + + std::vector<DocHitInfo> hits2; + AlwaysFalseNamespaceCheckerImpl always_false_namespace_checker; + lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + &always_false_namespace_checker, &hits2); + // Check that no hits are returned because they get skipped by the namespace + // checker. + EXPECT_THAT(hits2, IsEmpty()); +} + +} // namespace +} // namespace lib +} // namespace icing |