about summary refs log tree commit diff
path: root/icing/index/lite
diff options
context:
space:
mode:
author Tim Barron <tjbarron@google.com> 2022-04-12 14:30:14 -0700
committer Tim Barron <tjbarron@google.com> 2022-04-12 14:36:38 -0700
commit d5c9ae94052a0f2f1b9ddec9dbbe502bc4f11d54 (patch)
tree 90b929dc92d5874b5c15caca064401196ab4fc65 /icing/index/lite
parent beff93fe1f5165aeeb871d9711963aa1846299ae (diff)
download icing-d5c9ae94052a0f2f1b9ddec9dbbe502bc4f11d54.tar.gz
Sync from upstream.
======================================================================
Refactor DocumentStore::Initialize to improve readability of document store recovery.
======================================================================
Remove non-NDK API usages of ICU4C in libicing.
======================================================================
Move IcuDataFileHelper to the testing directory since it is a test-only util.
======================================================================
Support dump function for DocumentStore
======================================================================
Switch to use PRead rather than MMap in the proto log.
======================================================================
Support dump function for main/lite index and lexicon
======================================================================
Fix LiteIndex::AppendHits
======================================================================
Enable and fix DocumentStoreTest.LoadScoreCacheAndInitializeSuccessfully
======================================================================
Fix MainIndex::GetStorageInfo.
======================================================================
Fix icing-search-engine_fuzz_test by making IcuLanguageSegmenterIterator::Advance non-recursive.
======================================================================
Allow to return additional information for deleted documents in DeleteByQuery
======================================================================
Using enum class in Token::Type for better type safety.
======================================================================

Bug: 158089703
Bug: 185845269
Bug: 209071710
Bug: 211785521
Bug: 218413237
Bug: 223549255
Change-Id: Id2786047ab279734bdd2aee883e82607b6a0e403
Diffstat (limited to 'icing/index/lite')
-rw-r--r-- icing/index/lite/lite-index.cc 38
-rw-r--r-- icing/index/lite/lite-index.h 3
-rw-r--r-- icing/index/lite/lite-index_test.cc 110
3 files changed, 131 insertions, 20 deletions
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
index e7a8cb3..a5c6baf 100644
--- a/icing/index/lite/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -340,6 +340,8 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
std::vector<DocHitInfo>* hits_out) {
int count = 0;
DocumentId last_document_id = kInvalidDocumentId;
+ // Record whether the last document belongs to the given namespaces.
+ bool last_document_in_namespace = false;
for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) {
TermIdHitPair term_id_hit_pair(
hit_buffer_.array_cast<TermIdHitPair>()[idx]);
@@ -357,9 +359,10 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
DocumentId document_id = hit.document_id();
if (document_id != last_document_id) {
last_document_id = document_id;
- // Check does current document belongs to the given namespaces.
- if (namespace_checker != nullptr &&
- !namespace_checker->BelongsToTargetNamespaces(document_id)) {
+ last_document_in_namespace =
+ namespace_checker == nullptr ||
+ namespace_checker->BelongsToTargetNamespaces(document_id);
+ if (!last_document_in_namespace) {
// The document is removed, expired, or does not belong to the target
// namespaces.
continue;
@@ -369,7 +372,7 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
hits_out->push_back(DocHitInfo(document_id));
}
}
- if (hits_out != nullptr) {
+ if (hits_out != nullptr && last_document_in_namespace) {
hits_out->back().UpdateSection(hit.section_id(), hit.term_frequency());
}
}
@@ -388,15 +391,16 @@ bool LiteIndex::is_full() const {
lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction));
}
-void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const {
- absl_ports::StrAppend(
- out, IcingStringUtil::StringPrintf("Lite Index\nHit buffer %u/%u\n",
- header_->cur_size(),
- options_.hit_buffer_size));
-
- // Lexicon.
- out->append("Lexicon stats:\n");
- lexicon_.GetDebugInfo(verbosity, out);
+IndexDebugInfoProto::LiteIndexDebugInfoProto LiteIndex::GetDebugInfo(
+ int verbosity) {
+ IndexDebugInfoProto::LiteIndexDebugInfoProto res;
+ res.set_curr_size(header_->cur_size());
+ res.set_hit_buffer_size(options_.hit_buffer_size);
+ res.set_last_added_document_id(header_->last_added_docid());
+ res.set_searchable_end(header_->searchable_end());
+ res.set_index_crc(ComputeChecksum().Get());
+ lexicon_.GetDebugInfo(verbosity, res.mutable_lexicon_info());
+ return res;
}
libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const {
@@ -417,12 +421,8 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo(
IndexStorageInfoProto storage_info) const {
int64_t header_and_hit_buffer_file_size =
filesystem_->GetFileSize(hit_buffer_fd_.get());
- if (header_and_hit_buffer_file_size != Filesystem::kBadFileSize) {
- storage_info.set_lite_index_hit_buffer_size(
- header_and_hit_buffer_file_size);
- } else {
- storage_info.set_lite_index_hit_buffer_size(-1);
- }
+ storage_info.set_lite_index_hit_buffer_size(
+ IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size));
int64_t lexicon_disk_usage = lexicon_.GetElementsSize();
if (lexicon_disk_usage != Filesystem::kBadFileSize) {
storage_info.set_lite_index_lexicon_size(lexicon_disk_usage);
diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h
index 890980c..378fc94 100644
--- a/icing/index/lite/lite-index.h
+++ b/icing/index/lite/lite-index.h
@@ -37,6 +37,7 @@
#include "icing/legacy/index/icing-lite-index-header.h"
#include "icing/legacy/index/icing-lite-index-options.h"
#include "icing/legacy/index/icing-mmapper.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
@@ -241,7 +242,7 @@ class LiteIndex {
// Returns debug information for the index in out.
// verbosity <= 0, simplest debug information - size of lexicon, hit buffer
// verbosity > 0, more detailed debug information from the lexicon.
- void GetDebugInfo(int verbosity, std::string* out) const;
+ IndexDebugInfoProto::LiteIndexDebugInfoProto GetDebugInfo(int verbosity);
// Returns the byte size of all the elements held in the index. This excludes
// the size of any internal metadata of the index, e.g. the index's header.
diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc
new file mode 100644
index 0000000..825f830
--- /dev/null
+++ b/icing/index/lite/lite-index_test.cc
@@ -0,0 +1,110 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/lite/lite-index.h"
+
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/schema/section.h"
+#include "icing/store/namespace-checker.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
+
+class AlwaysFalseNamespaceCheckerImpl : public NamespaceChecker {
+ public:
+ bool BelongsToTargetNamespaces(DocumentId document_id) const override {
+ return false;
+ }
+};
+
+class LiteIndexTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ index_dir_ = GetTestTempDir() + "/test_dir";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));
+
+ std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024);
+ ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
+ LiteIndex::Create(options, &icing_filesystem_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+ }
+
+ void TearDown() override {
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
+ }
+
+ std::string index_dir_;
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ std::unique_ptr<LiteIndex> lite_index_;
+ std::unique_ptr<TermIdCodec> term_id_codec_;
+};
+
+constexpr NamespaceId kNamespace0 = 0;
+
+TEST_F(LiteIndexTest, LiteIndexAppendHits) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ Hit doc_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0));
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1));
+
+ std::vector<DocHitInfo> hits1;
+ lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ /*namespace_checker=*/nullptr, &hits1);
+ EXPECT_THAT(hits1, SizeIs(1));
+ EXPECT_THAT(hits1.back().document_id(), Eq(0));
+ // Check that the hits are coming from section 0 and section 1.
+ EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11));
+
+ std::vector<DocHitInfo> hits2;
+ AlwaysFalseNamespaceCheckerImpl always_false_namespace_checker;
+ lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ &always_false_namespace_checker, &hits2);
+ // Check that no hits are returned because they get skipped by the namespace
+ // checker.
+ EXPECT_THAT(hits2, IsEmpty());
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing