diff options
author | Jiayu Hu <hujiayu@google.com> | 2022-06-15 18:01:29 -0700 |
---|---|---|
committer | Jiayu Hu <hujiayu@google.com> | 2022-06-15 18:13:09 -0700 |
commit | 7c93c404e1fb4ed5e35326245ebc820ed774c6b2 (patch) | |
tree | fe159041f4c9358b9fd09b0e83ff551ff26b67a0 /icing/legacy | |
parent | e3dfbf68537a9865815a55623a92aed26bc80eb4 (diff) | |
download | icing-7c93c404e1fb4ed5e35326245ebc820ed774c6b2.tar.gz |
Sync from upstream.
Descriptions:
======================================================================
Export Icing logging tag to JNI
======================================================================
Update export_to_aosp.sh to change icing log tag to "AppSearchIcing"
======================================================================
Improve the logic of NamespaceChecker.
======================================================================
Step 4.1: Use ScoredDocumentHitsRanker in ResultStateV2
======================================================================
Step 4.0: Create ScoredDocumentHitsRanker interface and PriorityQueueScoredDocumentHitsRanker
======================================================================
Refactor KeyMapper
======================================================================
Change Icing DEFAULT_LOGGING_LEVEL to INFO
======================================================================
Step 3.4: Create ResultRetrieverV2GroupResultLimiterTest (copied from ResultStateTest)
======================================================================
Add IcingDynamicTrie::IsBranchingTerm to check if a term is branching.
======================================================================
Fix IcingDynamicTrie::Delete bug
======================================================================
Step 3.3: Create ResultRetrieverV2ProjectionTest (copied from ResultRetrieverTest)
======================================================================
Step 3.2: Create ResultRetrieverV2SnippetTest (copied from ResultRetrieverTest)
======================================================================
Step 3.1: Create ResultRetrieverV2Test (copied from ResultRetrieverTest)
======================================================================
Enable legacy multidex
======================================================================
Fix NPE caused by improper handling of return value of GetFileSize.
======================================================================
Step 3.0: Create ResultRetrieverV2 (copied from ResultRetriever)
======================================================================
Step 2: Create PageResult (copied from PageResultState)
======================================================================
Change AppSearch hawkeye testing app multidex to legacy.
======================================================================
Fix icing-search-engine_benchmark bug
======================================================================
(Small fix for step 1) Fix unit test stack memory error
======================================================================
Step 1: Create ResultStateV2 and ResultStateV2Test (copied from ResultState, ResultStateTest)
======================================================================
Bug: 146903474
Bug: 152934343
Bug: 193919210
Bug: 231368517
Bug: 232273174
Bug: 233470404
Bug: 233657885
Change-Id: Iae5461f8650c2bd58f683128fdd243d90403bc06
Diffstat (limited to 'icing/legacy')
-rw-r--r-- | icing/legacy/index/icing-dynamic-trie.cc | 93 | ||||
-rw-r--r-- | icing/legacy/index/icing-dynamic-trie.h | 17 | ||||
-rw-r--r-- | icing/legacy/index/icing-dynamic-trie_test.cc | 219 |
3 files changed, 308 insertions, 21 deletions
diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc index 77876c4..4428599 100644 --- a/icing/legacy/index/icing-dynamic-trie.cc +++ b/icing/legacy/index/icing-dynamic-trie.cc @@ -101,15 +101,9 @@ namespace { constexpr uint32_t kInvalidNodeIndex = (1U << 24) - 1; constexpr uint32_t kInvalidNextIndex = ~0U; -// Returns the number of valid nexts in the array. -int GetValidNextsSize(IcingDynamicTrie::Next *next_array_start, - int next_array_length) { - int valid_nexts_length = 0; - for (; valid_nexts_length < next_array_length && - next_array_start[valid_nexts_length].node_index() != kInvalidNodeIndex; - ++valid_nexts_length) { - } - return valid_nexts_length; +void ResetMutableNext(IcingDynamicTrie::Next &mutable_next) { + mutable_next.set_val(0xff); + mutable_next.set_node_index(kInvalidNodeIndex); } } // namespace @@ -769,8 +763,7 @@ IcingDynamicTrie::IcingDynamicTrieStorage::AllocNextArray(int size) { // Fill with char 0xff so we are sorted properly. for (int i = 0; i < aligned_size; i++) { - ret[i].set_val(0xff); - ret[i].set_node_index(kInvalidNodeIndex); + ResetMutableNext(ret[i]); } return ret; } @@ -1550,9 +1543,7 @@ bool IcingDynamicTrie::ResetNext(uint32_t next_index) { if (mutable_next == nullptr) { return false; } - - mutable_next->set_val(0); - mutable_next->set_node_index(kInvalidNodeIndex); + ResetMutableNext(*mutable_next); return true; } @@ -1570,7 +1561,7 @@ bool IcingDynamicTrie::SortNextArray(const Node *node) { return false; } - std::sort(next_array_start, next_array_start + next_array_buffer_size - 1); + std::sort(next_array_start, next_array_start + next_array_buffer_size); return true; } @@ -2116,22 +2107,33 @@ const IcingDynamicTrie::Next *IcingDynamicTrie::GetNextByChar( return found; } +int IcingDynamicTrie::GetValidNextsSize( + IcingDynamicTrie::Next *next_array_start, int next_array_length) const { + // Only searching for key char 0xff is not sufficient, as 0xff can be a valid + // character. We must also specify kInvalidNodeIndex as the target node index + // when searching the next array. + return LowerBound(next_array_start, next_array_start + next_array_length, + /*key_char=*/0xff, /*node_index=*/kInvalidNodeIndex) - + next_array_start; +} + const IcingDynamicTrie::Next *IcingDynamicTrie::LowerBound( - const Next *start, const Next *end, uint8_t key_char) const { + const Next *start, const Next *end, uint8_t key_char, + uint32_t node_index) const { // Above this value will use binary search instead of linear // search. 16 was chosen from running some benchmarks with // different values. static const uint32_t kBinarySearchCutoff = 16; + Next key_next(key_char, node_index); if (end - start >= kBinarySearchCutoff) { // Binary search. - Next key_next(key_char, 0); return lower_bound(start, end, key_next); } else { // Linear search. const Next *found; for (found = start; found < end; found++) { - if (found->val() >= key_char) { + if (!(*found < key_next)) { // Should have gotten match. break; } @@ -2275,6 +2277,40 @@ std::vector<int> IcingDynamicTrie::FindBranchingPrefixLengths(const char *key, return prefix_lengths; } +bool IcingDynamicTrie::IsBranchingTerm(const char *key) const { + if (!is_initialized()) { + ICING_LOG(FATAL) << "DynamicTrie not initialized"; + } + + if (storage_->empty()) { + return false; + } + + uint32_t best_node_index; + int key_offset; + FindBestNode(key, &best_node_index, &key_offset, /*prefix=*/true); + const Node *cur_node = storage_->GetNode(best_node_index); + + if (cur_node->is_leaf()) { + return false; + } + + // key is not present in the trie. + if (key[key_offset] != '\0') { + return false; + } + + // Found key as an intermediate node, but key is not a valid term stored in + // the trie. + if (GetNextByChar(cur_node, '\0') == nullptr) { + return false; + } + + // The intermediate node for key must have more than two children for key to + // be a branching term, one of which represents the leaf node for key itself. + return cur_node->log2_num_children() > 1; +} + void IcingDynamicTrie::GetDebugInfo(int verbosity, std::string *out) const { Stats stats; CollectStats(&stats); @@ -2500,7 +2536,26 @@ bool IcingDynamicTrie::Delete(const std::string_view key) { for (uint32_t next_index : nexts_to_reset) { ResetNext(next_index); } - SortNextArray(last_multichild_node); + + if (last_multichild_node != nullptr) { + SortNextArray(last_multichild_node); + uint32_t next_array_buffer_size = + 1u << last_multichild_node->log2_num_children(); + Next *next_array_start = this->storage_->GetMutableNextArray( + last_multichild_node->next_index(), next_array_buffer_size); + uint32_t num_children = + GetValidNextsSize(next_array_start, next_array_buffer_size); + // Shrink the next array if we can. + if (num_children == next_array_buffer_size / 2) { + Node *mutable_node = storage_->GetMutableNode( + storage_->GetNodeIndex(last_multichild_node)); + mutable_node->set_log2_num_children(mutable_node->log2_num_children() - + 1); + // Add the unused second half of the next array to the free list. + storage_->FreeNextArray(next_array_start + next_array_buffer_size / 2, + mutable_node->log2_num_children()); + } + } return true; } diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h index 013b926..ec8b31a 100644 --- a/icing/legacy/index/icing-dynamic-trie.h +++ b/icing/legacy/index/icing-dynamic-trie.h @@ -400,6 +400,16 @@ class IcingDynamicTrie : public IIcingStorage { // itself. If utf8 is true, does not cut key mid-utf8. std::vector<int> FindBranchingPrefixLengths(const char *key, bool utf8) const; + // Check if key is a branching term. + // + // key is a branching term, if and only if there exists terms s1 and s2 in the + // trie such that key is the maximum common prefix of s1 and s2, but s1 and s2 + // are not prefixes of each other. + // + // The function assumes that key is already present in the trie. Otherwise, + // false will be returned. + bool IsBranchingTerm(const char *key) const; + void GetDebugInfo(int verbosity, std::string *out) const override; double min_free_fraction() const; @@ -612,8 +622,11 @@ class IcingDynamicTrie : public IIcingStorage { // Helpers for Find and Insert. const Next *GetNextByChar(const Node *node, uint8_t key_char) const; - const Next *LowerBound(const Next *start, const Next *end, - uint8_t key_char) const; + const Next *LowerBound(const Next *start, const Next *end, uint8_t key_char, + uint32_t node_index = 0) const; + // Returns the number of valid nexts in the array. + int GetValidNextsSize(IcingDynamicTrie::Next *next_array_start, + int next_array_length) const; void FindBestNode(const char *key, uint32_t *best_node_index, int *key_offset, bool prefix, bool utf8 = false) const; diff --git a/icing/legacy/index/icing-dynamic-trie_test.cc b/icing/legacy/index/icing-dynamic-trie_test.cc index c8d1403..b69ee64 100644 --- a/icing/legacy/index/icing-dynamic-trie_test.cc +++ b/icing/legacy/index/icing-dynamic-trie_test.cc @@ -20,6 +20,7 @@ #include <memory> #include <string> #include <unordered_map> +#include <unordered_set> #include <vector> #include "icing/text_classifier/lib3/utils/hash/farmhash.h" @@ -27,13 +28,16 @@ #include "gtest/gtest.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/legacy/index/icing-filesystem.h" +#include "icing/testing/random-string.h" #include "icing/testing/tmp-directory.h" +#include "icing/util/logging.h" namespace icing { namespace lib { namespace { +using testing::ContainerEq; using testing::ElementsAre; constexpr std::string_view kKeys[] = { @@ -962,6 +966,102 @@ TEST_F(IcingDynamicTrieTest, DeletingNonExistingKeyShouldReturnTrue) { EXPECT_TRUE(trie.Find("bed", &value)); } +TEST_F(IcingDynamicTrieTest, DeletionResortsFullNextArray) { + IcingFilesystem filesystem; + IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(), + &filesystem); + ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options())); + ASSERT_TRUE(trie.Init()); + + uint32_t value = 1; + // 'f' -> [ 'a', 'j', 'o', 'u' ] + ASSERT_TRUE(trie.Insert("foul", &value)); + ASSERT_TRUE(trie.Insert("far", &value)); + ASSERT_TRUE(trie.Insert("fudge", &value)); + ASSERT_TRUE(trie.Insert("fjord", &value)); + + // Delete the third child + EXPECT_TRUE(trie.Delete("foul")); + + std::vector<std::string> remaining; + for (IcingDynamicTrie::Iterator term_iter(trie, /*prefix=*/""); + term_iter.IsValid(); term_iter.Advance()) { + remaining.push_back(term_iter.GetKey()); + } + EXPECT_THAT(remaining, ElementsAre("far", "fjord", "fudge")); +} + +TEST_F(IcingDynamicTrieTest, DeletionResortsPartiallyFilledNextArray) { + IcingFilesystem filesystem; + IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(), + &filesystem); + ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options())); + ASSERT_TRUE(trie.Init()); + + uint32_t value = 1; + // 'f' -> [ 'a', 'o', 'u', 0xFF ] + ASSERT_TRUE(trie.Insert("foul", &value)); + ASSERT_TRUE(trie.Insert("far", &value)); + ASSERT_TRUE(trie.Insert("fudge", &value)); + + // Delete the second child + EXPECT_TRUE(trie.Delete("foul")); + + std::vector<std::string> remaining; + for (IcingDynamicTrie::Iterator term_iter(trie, /*prefix=*/""); + term_iter.IsValid(); term_iter.Advance()) { + remaining.push_back(term_iter.GetKey()); + } + EXPECT_THAT(remaining, ElementsAre("far", "fudge")); +} + +TEST_F(IcingDynamicTrieTest, DeletionLoadTest) { + IcingFilesystem filesystem; + IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(), + &filesystem); + ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options())); + ASSERT_TRUE(trie.Init()); + + std::default_random_engine random; + ICING_LOG(ERROR) << "Seed: " << std::default_random_engine::default_seed; + std::vector<std::string> terms; + uint32_t value; + // Randomly generate 2048 terms. + for (int i = 0; i < 2048; ++i) { + terms.push_back(RandomString("abcdefg", 5, &random)); + ASSERT_TRUE(trie.Insert(terms.back().c_str(), &value)); + } + + // Randomly delete 1024 terms. + std::unordered_set<std::string> exp_remaining(terms.begin(), terms.end()); + std::shuffle(terms.begin(), terms.end(), random); + for (int i = 0; i < 1024; ++i) { + exp_remaining.erase(terms[i]); + ASSERT_TRUE(trie.Delete(terms[i].c_str())); + } + + // Check that the iterator still works, and the remaining terms are correct. + std::unordered_set<std::string> remaining; + for (IcingDynamicTrie::Iterator term_iter(trie, /*prefix=*/""); + term_iter.IsValid(); term_iter.Advance()) { + remaining.insert(term_iter.GetKey()); + } + EXPECT_THAT(remaining, ContainerEq(exp_remaining)); + + // Check that we can still insert terms after delete. + for (int i = 0; i < 2048; ++i) { + std::string term = RandomString("abcdefg", 5, &random); + ASSERT_TRUE(trie.Insert(term.c_str(), &value)); + exp_remaining.insert(term); + } + remaining.clear(); + for (IcingDynamicTrie::Iterator term_iter(trie, /*prefix=*/""); + term_iter.IsValid(); term_iter.Advance()) { + remaining.insert(term_iter.GetKey()); + } + EXPECT_THAT(remaining, ContainerEq(exp_remaining)); +} + } // namespace // The tests below are accessing private methods and fields of IcingDynamicTrie @@ -1133,5 +1233,124 @@ TEST_F(IcingDynamicTrieTest, BitmapsClosedWhenInitFails) { ASSERT_EQ(0, trie.property_bitmaps_.size()); } +TEST_F(IcingDynamicTrieTest, IsBranchingTerm) { + IcingFilesystem filesystem; + IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(), + &filesystem); + ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options())); + ASSERT_TRUE(trie.Init()); + + uint32_t value = 1; + + ASSERT_TRUE(trie.Insert("", &value)); + EXPECT_FALSE(trie.IsBranchingTerm("")); + + ASSERT_TRUE(trie.Insert("ab", &value)); + EXPECT_FALSE(trie.IsBranchingTerm("")); + EXPECT_FALSE(trie.IsBranchingTerm("ab")); + + ASSERT_TRUE(trie.Insert("ac", &value)); + // "" is a prefix of "ab" and "ac", but it is not a branching term. + EXPECT_FALSE(trie.IsBranchingTerm("")); + EXPECT_FALSE(trie.IsBranchingTerm("ab")); + EXPECT_FALSE(trie.IsBranchingTerm("ac")); + + ASSERT_TRUE(trie.Insert("ba", &value)); + // "" now branches to "ba" + EXPECT_TRUE(trie.IsBranchingTerm("")); + EXPECT_FALSE(trie.IsBranchingTerm("ab")); + EXPECT_FALSE(trie.IsBranchingTerm("ac")); + EXPECT_FALSE(trie.IsBranchingTerm("ba")); + + ASSERT_TRUE(trie.Insert("a", &value)); + EXPECT_TRUE(trie.IsBranchingTerm("")); + // "a" branches to "ab" and "ac" + EXPECT_TRUE(trie.IsBranchingTerm("a")); + EXPECT_FALSE(trie.IsBranchingTerm("ab")); + EXPECT_FALSE(trie.IsBranchingTerm("ac")); + EXPECT_FALSE(trie.IsBranchingTerm("ba")); + + ASSERT_TRUE(trie.Insert("abc", &value)); + ASSERT_TRUE(trie.Insert("acd", &value)); + EXPECT_TRUE(trie.IsBranchingTerm("")); + EXPECT_TRUE(trie.IsBranchingTerm("a")); + // "ab" is a prefix of "abc", but it is not a branching term. + EXPECT_FALSE(trie.IsBranchingTerm("ab")); + // "ac" is a prefix of "acd", but it is not a branching term. + EXPECT_FALSE(trie.IsBranchingTerm("ac")); + EXPECT_FALSE(trie.IsBranchingTerm("ba")); + EXPECT_FALSE(trie.IsBranchingTerm("abc")); + EXPECT_FALSE(trie.IsBranchingTerm("acd")); + + ASSERT_TRUE(trie.Insert("abcd", &value)); + EXPECT_TRUE(trie.IsBranchingTerm("")); + EXPECT_TRUE(trie.IsBranchingTerm("a")); + // "ab" is a prefix of "abc" and "abcd", but it is not a branching term. + EXPECT_FALSE(trie.IsBranchingTerm("ab")); + EXPECT_FALSE(trie.IsBranchingTerm("ac")); + EXPECT_FALSE(trie.IsBranchingTerm("ba")); + // "abc" is a prefix of "abcd", but it is not a branching term. + EXPECT_FALSE(trie.IsBranchingTerm("abc")); + EXPECT_FALSE(trie.IsBranchingTerm("acd")); + EXPECT_FALSE(trie.IsBranchingTerm("abcd")); + + ASSERT_TRUE(trie.Insert("abd", &value)); + EXPECT_TRUE(trie.IsBranchingTerm("")); + EXPECT_TRUE(trie.IsBranchingTerm("a")); + // "ab" branches to "abc" and "abd" + EXPECT_TRUE(trie.IsBranchingTerm("ab")); + EXPECT_FALSE(trie.IsBranchingTerm("ac")); + EXPECT_FALSE(trie.IsBranchingTerm("ba")); + EXPECT_FALSE(trie.IsBranchingTerm("abc")); + EXPECT_FALSE(trie.IsBranchingTerm("acd")); + EXPECT_FALSE(trie.IsBranchingTerm("abcd")); + EXPECT_FALSE(trie.IsBranchingTerm("abd")); +} + +TEST_F(IcingDynamicTrieTest, IsBranchingTermShouldWorkForNonExistingTerms) { + IcingFilesystem filesystem; + IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(), + &filesystem); + ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options())); + ASSERT_TRUE(trie.Init()); + + uint32_t value = 1; + + EXPECT_FALSE(trie.IsBranchingTerm("")); + EXPECT_FALSE(trie.IsBranchingTerm("a")); + EXPECT_FALSE(trie.IsBranchingTerm("ab")); + + ASSERT_TRUE(trie.Insert("aa", &value)); + EXPECT_FALSE(trie.IsBranchingTerm("")); + EXPECT_FALSE(trie.IsBranchingTerm("a")); + + ASSERT_TRUE(trie.Insert("", &value)); + EXPECT_FALSE(trie.IsBranchingTerm("a")); + + ASSERT_TRUE(trie.Insert("ab", &value)); + EXPECT_FALSE(trie.IsBranchingTerm("a")); + + ASSERT_TRUE(trie.Insert("ac", &value)); + EXPECT_FALSE(trie.IsBranchingTerm("a")); + + ASSERT_TRUE(trie.Insert("ad", &value)); + EXPECT_FALSE(trie.IsBranchingTerm("a")); + + ASSERT_TRUE(trie.Insert("abcd", &value)); + EXPECT_FALSE(trie.IsBranchingTerm("abc")); + + ASSERT_TRUE(trie.Insert("abce", &value)); + EXPECT_FALSE(trie.IsBranchingTerm("abc")); + + ASSERT_TRUE(trie.Insert("abcf", &value)); + EXPECT_FALSE(trie.IsBranchingTerm("abc")); + + ASSERT_TRUE(trie.Insert("abc_suffix", &value)); + EXPECT_FALSE(trie.IsBranchingTerm("abc")); + EXPECT_FALSE(trie.IsBranchingTerm("abc_s")); + EXPECT_FALSE(trie.IsBranchingTerm("abc_su")); + EXPECT_FALSE(trie.IsBranchingTerm("abc_suffi")); +} + } // namespace lib } // namespace icing |