diff options
author | Grace Zhao <gracezrx@google.com> | 2023-08-30 23:54:01 -0700 |
---|---|---|
committer | Grace Zhao <gracezrx@google.com> | 2023-08-30 23:54:01 -0700 |
commit | 6293cd0a843b1e5b09086d9010ec863556d0c1ba (patch) | |
tree | 8834036305dca128017ee06f9b2d3d157da54c62 /icing/index/lite | |
parent | 8c71e61d02944611249c892236e67c6acace8a2d (diff) | |
download | icing-6293cd0a843b1e5b09086d9010ec863556d0c1ba.tar.gz |
Update Icing from upstream.
Descriptions:
========================================================================
[fixit] Remove incorrect reserve usage for STL data structure
========================================================================
Move LiteIndex::SortHits() from the query path to the indexing path
========================================================================
Bug: 286418010
Bug: 297549761
Bug: 248295995
Change-Id: I46168178e0227451b0f7dbf394bb6cc5db94e1e8
Diffstat (limited to 'icing/index/lite')
-rw-r--r-- | icing/index/lite/lite-index-options.cc | 10 | ||||
-rw-r--r-- | icing/index/lite/lite-index-options.h | 6 | ||||
-rw-r--r-- | icing/index/lite/lite-index.cc | 269 | ||||
-rw-r--r-- | icing/index/lite/lite-index.h | 83 | ||||
-rw-r--r-- | icing/index/lite/lite-index_test.cc | 641 | ||||
-rw-r--r-- | icing/index/lite/lite-index_thread-safety_test.cc | 7 | ||||
-rw-r--r-- | icing/index/lite/term-id-hit-pair.h | 2 |
7 files changed, 859 insertions, 159 deletions
diff --git a/icing/index/lite/lite-index-options.cc b/icing/index/lite/lite-index-options.cc index 29075f8..8780d45 100644 --- a/icing/index/lite/lite-index-options.cc +++ b/icing/index/lite/lite-index-options.cc @@ -14,6 +14,8 @@ #include "icing/index/lite/lite-index-options.h" +#include <cstdint> + #include "icing/index/lite/term-id-hit-pair.h" namespace icing { @@ -64,9 +66,13 @@ IcingDynamicTrie::Options CalculateTrieOptions(uint32_t hit_buffer_size) { } // namespace LiteIndexOptions::LiteIndexOptions(const std::string& filename_base, - uint32_t hit_buffer_want_merge_bytes) + uint32_t hit_buffer_want_merge_bytes, + bool hit_buffer_sort_at_indexing, + uint32_t hit_buffer_sort_threshold_bytes) : filename_base(filename_base), - hit_buffer_want_merge_bytes(hit_buffer_want_merge_bytes) { + hit_buffer_want_merge_bytes(hit_buffer_want_merge_bytes), + hit_buffer_sort_at_indexing(hit_buffer_sort_at_indexing), + hit_buffer_sort_threshold_bytes(hit_buffer_sort_threshold_bytes) { hit_buffer_size = CalculateHitBufferSize(hit_buffer_want_merge_bytes); lexicon_options = CalculateTrieOptions(hit_buffer_size); display_mappings_options = CalculateTrieOptions(hit_buffer_size); diff --git a/icing/index/lite/lite-index-options.h b/icing/index/lite/lite-index-options.h index ae58802..9f8452c 100644 --- a/icing/index/lite/lite-index-options.h +++ b/icing/index/lite/lite-index-options.h @@ -27,7 +27,9 @@ struct LiteIndexOptions { // hit_buffer_want_merge_bytes and the logic in CalculateHitBufferSize and // CalculateTrieOptions. LiteIndexOptions(const std::string& filename_base, - uint32_t hit_buffer_want_merge_bytes); + uint32_t hit_buffer_want_merge_bytes, + bool hit_buffer_sort_at_indexing, + uint32_t hit_buffer_sort_threshold_bytes); IcingDynamicTrie::Options lexicon_options; IcingDynamicTrie::Options display_mappings_options; @@ -35,6 +37,8 @@ struct LiteIndexOptions { std::string filename_base; uint32_t hit_buffer_want_merge_bytes = 0; uint32_t hit_buffer_size = 0; + bool hit_buffer_sort_at_indexing = false; + uint32_t hit_buffer_sort_threshold_bytes = 0; }; } // namespace lib diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc index bf54dec..ec7141a 100644 --- a/icing/index/lite/lite-index.cc +++ b/icing/index/lite/lite-index.cc @@ -36,6 +36,8 @@ #include "icing/index/hit/doc-hit-info.h" #include "icing/index/hit/hit.h" #include "icing/index/lite/lite-index-header.h" +#include "icing/index/lite/term-id-hit-pair.h" +#include "icing/index/term-id-codec.h" #include "icing/index/term-property-id.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/legacy/core/icing-timer.h" @@ -44,10 +46,13 @@ #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mmapper.h" #include "icing/proto/debug.pb.h" +#include "icing/proto/scoring.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/store/namespace-id.h" +#include "icing/store/suggestion-result-checker.h" #include "icing/util/crc32.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" @@ -160,7 +165,7 @@ libtextclassifier3::Status LiteIndex::Initialize() { } // Set up header. - header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size()); + header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size()); header_ = std::make_unique<LiteIndex_HeaderImpl>( reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>( header_mmap_.address())); @@ -175,7 +180,7 @@ libtextclassifier3::Status LiteIndex::Initialize() { UpdateChecksum(); } else { - header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size()); + header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size()); header_ = std::make_unique<LiteIndex_HeaderImpl>( reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>( header_mmap_.address())); @@ -352,6 +357,73 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId( return tvi; } +void LiteIndex::ScoreAndAppendFetchedHit( + const Hit& hit, SectionIdMask section_id_mask, + bool only_from_prefix_sections, + SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, + const SuggestionResultChecker* suggestion_result_checker, + DocumentId& last_document_id, bool& is_last_document_desired, + int& total_score_out, std::vector<DocHitInfo>* hits_out, + std::vector<Hit::TermFrequencyArray>* term_frequency_out) const { + // Check sections. + if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) { + return; + } + // Check prefix section only. + if (only_from_prefix_sections && !hit.is_in_prefix_section()) { + return; + } + // Check whether this Hit is desired. + // TODO(b/230553264) Move common logic into helper function once we support + // score term by prefix_hit in lite_index. + DocumentId document_id = hit.document_id(); + bool is_new_document = document_id != last_document_id; + if (is_new_document) { + last_document_id = document_id; + is_last_document_desired = + suggestion_result_checker == nullptr || + suggestion_result_checker->BelongsToTargetResults(document_id, + hit.section_id()); + } + if (!is_last_document_desired) { + // The document is removed or expired or not desired. + return; + } + + // Score the hit by the strategy + switch (score_by) { + case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE: + total_score_out = 1; + break; + case SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT: + if (is_new_document) { + ++total_score_out; + } + break; + case SuggestionScoringSpecProto::SuggestionRankingStrategy::TERM_FREQUENCY: + if (hit.has_term_frequency()) { + total_score_out += hit.term_frequency(); + } else { + ++total_score_out; + } + break; + } + + // Append the Hit or update hit section to the output vector. + if (is_new_document && hits_out != nullptr) { + hits_out->push_back(DocHitInfo(document_id)); + if (term_frequency_out != nullptr) { + term_frequency_out->push_back(Hit::TermFrequencyArray()); + } + } + if (hits_out != nullptr) { + hits_out->back().UpdateSection(hit.section_id()); + if (term_frequency_out != nullptr) { + term_frequency_out->back()[hit.section_id()] = hit.term_frequency(); + } + } +} + int LiteIndex::FetchHits( uint32_t term_id, SectionIdMask section_id_mask, bool only_from_prefix_sections, @@ -359,19 +431,38 @@ int LiteIndex::FetchHits( const SuggestionResultChecker* suggestion_result_checker, std::vector<DocHitInfo>* hits_out, std::vector<Hit::TermFrequencyArray>* term_frequency_out) { - int score = 0; - DocumentId last_document_id = kInvalidDocumentId; - // Record whether the last document belongs to the given namespaces. - bool is_last_document_desired = false; - - if (NeedSort()) { - // Transition from shared_lock in NeedSort to unique_lock here is safe - // because it doesn't hurt to sort again if sorting was done already by - // another thread after NeedSort is evaluated. NeedSort is called before - // sorting to improve concurrency as threads can avoid acquiring the unique - // lock if no sorting is needed. + bool need_sort_at_querying = false; + { + absl_ports::shared_lock l(&mutex_); + + // We sort here when: + // 1. We don't enable sorting at indexing time (i.e. we sort at querying + // time), and there is an unsorted tail portion. OR + // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold, + // regardless of whether or not hit_buffer_sort_at_indexing is enabled. + // This is more of a sanity check. We should not really be encountering + // this case. + need_sort_at_querying = NeedSortAtQuerying(); + } + if (need_sort_at_querying) { absl_ports::unique_lock l(&mutex_); - SortHits(); + IcingTimer timer; + + // Transition from shared_lock to unique_lock is safe here because it + // doesn't hurt to sort again if sorting was done already by another thread + // after need_sort_at_querying is evaluated. + // We check need_sort_at_querying to improve query concurrency as threads + // can avoid acquiring the unique lock if no sorting is needed. + SortHitsImpl(); + + if (options_.hit_buffer_sort_at_indexing) { + // This is the second case for sort. Log as this should be a very rare + // occasion. + ICING_LOG(WARNING) << "Sorting HitBuffer at querying time when " + "hit_buffer_sort_at_indexing is enabled. Sort and " + "merge HitBuffer in " + << timer.Elapsed() * 1000 << " ms."; + } } // This downgrade from an unique_lock to a shared_lock is safe because we're @@ -379,75 +470,72 @@ int LiteIndex::FetchHits( // only in Seek(). // Any operations that might execute in between the transition of downgrading // the lock here are guaranteed not to alter the searchable section (or the - // LiteIndex due to a global lock in IcingSearchEngine). + // LiteIndex) due to a global lock in IcingSearchEngine. absl_ports::shared_lock l(&mutex_); - for (uint32_t idx = Seek(term_id); idx < header_->searchable_end(); idx++) { - TermIdHitPair term_id_hit_pair = - hit_buffer_.array_cast<TermIdHitPair>()[idx]; - if (term_id_hit_pair.term_id() != term_id) break; - - const Hit& hit = term_id_hit_pair.hit(); - // Check sections. - if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) { - continue; - } - // Check prefix section only. - if (only_from_prefix_sections && !hit.is_in_prefix_section()) { - continue; - } - // TODO(b/230553264) Move common logic into helper function once we support - // score term by prefix_hit in lite_index. - // Check whether this Hit is desired. - DocumentId document_id = hit.document_id(); - bool is_new_document = document_id != last_document_id; - if (is_new_document) { - last_document_id = document_id; - is_last_document_desired = - suggestion_result_checker == nullptr || - suggestion_result_checker->BelongsToTargetResults(document_id, - hit.section_id()); - } - if (!is_last_document_desired) { - // The document is removed or expired or not desired. - continue; - } - // Score the hit by the strategy - switch (score_by) { - case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE: - score = 1; - break; - case SuggestionScoringSpecProto::SuggestionRankingStrategy:: - DOCUMENT_COUNT: - if (is_new_document) { - ++score; - } - break; - case SuggestionScoringSpecProto::SuggestionRankingStrategy:: - TERM_FREQUENCY: - if (hit.has_term_frequency()) { - score += hit.term_frequency(); - } else { - ++score; - } - break; - } + // Search in the HitBuffer array for Hits with the corresponding term_id. + // Hits are added in increasing order of doc ids, so hits that get appended + // later have larger docIds. This means that: + // 1. Hits in the unsorted tail will have larger docIds than hits in the + // sorted portion. + // 2. Hits at the end of the unsorted tail will have larger docIds than hits + // in the front of the tail. + // We want to retrieve hits in descending order of docIds. Therefore we should + // search by doing: + // 1. Linear search first in reverse iteration order over the unsorted tail + // portion. + // 2. Followed by binary search on the sorted portion. + const TermIdHitPair* array = hit_buffer_.array_cast<TermIdHitPair>(); - // Append the Hit or update hit section to the output vector. - if (is_new_document && hits_out != nullptr) { - hits_out->push_back(DocHitInfo(document_id)); - if (term_frequency_out != nullptr) { - term_frequency_out->push_back(Hit::TermFrequencyArray()); + DocumentId last_document_id = kInvalidDocumentId; + // Record whether the last document belongs to the given namespaces. + bool is_last_document_desired = false; + int total_score = 0; + + // Linear search over unsorted tail in reverse iteration order. + // This should only be performed when hit_buffer_sort_at_indexing is enabled. + // When disabled, the entire HitBuffer should be sorted already and only + // binary search is needed. + if (options_.hit_buffer_sort_at_indexing) { + uint32_t unsorted_length = header_->cur_size() - header_->searchable_end(); + for (uint32_t i = 1; i <= unsorted_length; ++i) { + TermIdHitPair term_id_hit_pair = array[header_->cur_size() - i]; + if (term_id_hit_pair.term_id() == term_id) { + // We've found a matched hit. + const Hit& matched_hit = term_id_hit_pair.hit(); + // Score the hit and add to total_score. Also add the hits and its term + // frequency info to hits_out and term_frequency_out if the two vectors + // are non-null. + ScoreAndAppendFetchedHit(matched_hit, section_id_mask, + only_from_prefix_sections, score_by, + suggestion_result_checker, last_document_id, + is_last_document_desired, total_score, + hits_out, term_frequency_out); } } - if (hits_out != nullptr) { - hits_out->back().UpdateSection(hit.section_id()); - if (term_frequency_out != nullptr) { - term_frequency_out->back()[hit.section_id()] = hit.term_frequency(); - } + } + + // Do binary search over the sorted section and repeat the above steps. + TermIdHitPair target_term_id_hit_pair( + term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency)); + for (const TermIdHitPair* ptr = std::lower_bound( + array, array + header_->searchable_end(), target_term_id_hit_pair); + ptr < array + header_->searchable_end(); ++ptr) { + if (ptr->term_id() != term_id) { + // We've processed all matches. Stop iterating further. + break; } + + const Hit& matched_hit = ptr->hit(); + // Score the hit and add to total_score. Also add the hits and its term + // frequency info to hits_out and term_frequency_out if the two vectors are + // non-null. + ScoreAndAppendFetchedHit( + matched_hit, section_id_mask, only_from_prefix_sections, score_by, + suggestion_result_checker, last_document_id, is_last_document_desired, + total_score, hits_out, term_frequency_out); } - return score; + return total_score; } libtextclassifier3::StatusOr<int> LiteIndex::ScoreHits( @@ -455,9 +543,9 @@ libtextclassifier3::StatusOr<int> LiteIndex::ScoreHits( SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, const SuggestionResultChecker* suggestion_result_checker) { return FetchHits(term_id, kSectionIdMaskAll, - /*only_from_prefix_sections=*/false, score_by, - suggestion_result_checker, - /*hits_out=*/nullptr); + /*only_from_prefix_sections=*/false, score_by, + suggestion_result_checker, + /*hits_out=*/nullptr); } bool LiteIndex::is_full() const { @@ -515,7 +603,7 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo( return storage_info; } -void LiteIndex::SortHits() { +void LiteIndex::SortHitsImpl() { // Make searchable by sorting by hit buffer. uint32_t sort_len = header_->cur_size() - header_->searchable_end(); if (sort_len <= 0) { @@ -546,25 +634,6 @@ void LiteIndex::SortHits() { UpdateChecksum(); } -uint32_t LiteIndex::Seek(uint32_t term_id) const { - // Binary search for our term_id. Make sure we get the first - // element. Using kBeginSortValue ensures this for the hit value. - TermIdHitPair term_id_hit_pair( - term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency)); - - const TermIdHitPair::Value* array = - hit_buffer_.array_cast<TermIdHitPair::Value>(); - if (header_->searchable_end() != header_->cur_size()) { - ICING_LOG(WARNING) << "Lite index: hit buffer searchable end != current " - << "size during Seek(): " - << header_->searchable_end() << " vs " - << header_->cur_size(); - } - const TermIdHitPair::Value* ptr = std::lower_bound( - array, array + header_->searchable_end(), term_id_hit_pair.value()); - return ptr - array; -} - libtextclassifier3::Status LiteIndex::Optimize( const std::vector<DocumentId>& document_id_old_to_new, const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id) { @@ -575,7 +644,7 @@ libtextclassifier3::Status LiteIndex::Optimize( } // Sort the hits so that hits with the same term id will be grouped together, // which helps later to determine which terms will be unused after compaction. - SortHits(); + SortHitsImpl(); uint32_t new_size = 0; uint32_t curr_term_id = 0; uint32_t curr_tvi = 0; diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h index 916a14b..288602a 100644 --- a/icing/index/lite/lite-index.h +++ b/icing/index/lite/lite-index.h @@ -20,6 +20,7 @@ #define ICING_INDEX_LITE_INDEX_H_ #include <cstdint> +#include <iterator> #include <limits> #include <memory> #include <string> @@ -48,7 +49,6 @@ #include "icing/store/document-id.h" #include "icing/store/namespace-id.h" #include "icing/store/suggestion-result-checker.h" -#include "icing/util/bit-util.h" #include "icing/util/crc32.h" namespace icing { @@ -63,6 +63,9 @@ class LiteIndex { // An entry in the hit buffer. using Options = LiteIndexOptions; + // Offset for the LiteIndex_Header in the hit buffer mmap. + static constexpr uint32_t kHeaderFileOffset = 0; + // Updates checksum of subcomponents. ~LiteIndex(); @@ -152,8 +155,8 @@ class LiteIndex { // Add all hits with term_id from the sections specified in section_id_mask, // skipping hits in non-prefix sections if only_from_prefix_sections is true, // to hits_out. If hits_out is nullptr, no hits will be added. The - // corresponding hit term frequencies will also be added if term_frequency_out - // is nullptr. + // corresponding hit term frequencies will also not be added if + // term_frequency_out is nullptr. // // Only those hits which belongs to the given namespaces will be counted and // fetched. A nullptr namespace checker will disable this check. @@ -181,15 +184,29 @@ class LiteIndex { uint32_t size() const ICING_LOCKS_EXCLUDED(mutex_) { absl_ports::shared_lock l(&mutex_); - return sizeLocked(); + return size_impl(); } bool WantsMerge() const ICING_LOCKS_EXCLUDED(mutex_) { absl_ports::shared_lock l(&mutex_); - return is_full() || sizeLocked() >= (options_.hit_buffer_want_merge_bytes / - sizeof(TermIdHitPair::Value)); + return is_full() || size_impl() >= (options_.hit_buffer_want_merge_bytes / + sizeof(TermIdHitPair::Value)); + } + + // Whether or not the HitBuffer's unsorted tail size exceeds the sort + // threshold. + bool HasUnsortedHitsExceedingSortThreshold() const + ICING_LOCKS_EXCLUDED(mutex_) { + absl_ports::shared_lock l(&mutex_); + return HasUnsortedHitsExceedingSortThresholdImpl(); } + // Sort hits stored in the index. + void SortHits() ICING_LOCKS_EXCLUDED(mutex_) { + absl_ports::unique_lock l(&mutex_); + SortHitsImpl(); + }; + class const_iterator { friend class LiteIndex; @@ -326,17 +343,13 @@ class LiteIndex { // Check if the hit buffer has reached its capacity. bool is_full() const ICING_SHARED_LOCKS_REQUIRED(mutex_); - uint32_t sizeLocked() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { - return header_->cur_size(); - } - // Non-locking implementation for empty(). bool empty_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { return size_impl() == 0; } // Non-locking implementation for size(). - bool size_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { + uint32_t size_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { return header_->cur_size(); } @@ -352,18 +365,48 @@ class LiteIndex { NamespaceId namespace_id) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Whether or not the HitBuffer requires sorting. - bool NeedSort() ICING_LOCKS_EXCLUDED(mutex_) { - absl_ports::shared_lock l(&mutex_); - return header_->cur_size() - header_->searchable_end() > 0; + // We need to sort during querying time when: + // 1. Sorting at indexing time is not enabled and there is an unsorted tail + // section in the HitBuffer. + // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold, regardless + // of whether or not hit_buffer_sort_at_indexing is enabled. This is to + // prevent performing sequential search on a large unsorted tail section, + // which would result in bad query performance. + // This is more of a sanity check. We should not really be encountering + // this case. + bool NeedSortAtQuerying() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { + return HasUnsortedHitsExceedingSortThresholdImpl() || + (!options_.hit_buffer_sort_at_indexing && + header_->cur_size() - header_->searchable_end() > 0); } - // Sort hits stored in the index. - void SortHits() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Non-locking implementation for HasUnsortedHitsExceedingSortThresholdImpl(). + bool HasUnsortedHitsExceedingSortThresholdImpl() const + ICING_SHARED_LOCKS_REQUIRED(mutex_) { + return header_->cur_size() - header_->searchable_end() >= + (options_.hit_buffer_sort_threshold_bytes / + sizeof(TermIdHitPair::Value)); + } - // Returns the position of the first element with term_id, or the searchable - // end of the hit buffer if term_id is not present. - uint32_t Seek(uint32_t term_id) const ICING_SHARED_LOCKS_REQUIRED(mutex_); + // Non-locking implementation for SortHits(). + void SortHitsImpl() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Calculates and adds the score for a fetched hit to total_score_out, while + // updating last_document_id (which keeps track of the last added docId so + // far), and is_last_document_desired (which keeps track of whether that last + // added docId belongs to the query's desired namespace.) + // + // Also appends the hit to hits_out and term_frequency_out if the vectors are + // not null. + void ScoreAndAppendFetchedHit( + const Hit& hit, SectionIdMask section_id_mask, + bool only_from_prefix_sections, + SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, + const SuggestionResultChecker* suggestion_result_checker, + DocumentId& last_document_id, bool& is_last_document_desired, + int& total_score_out, std::vector<DocHitInfo>* hits_out, + std::vector<Hit::TermFrequencyArray>* term_frequency_out) const + ICING_SHARED_LOCKS_REQUIRED(mutex_); // File descriptor that points to where the header and hit buffer are written // to. diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc index 5f141ed..9811fa2 100644 --- a/icing/index/lite/lite-index_test.cc +++ b/icing/index/lite/lite-index_test.cc @@ -14,14 +14,27 @@ #include "icing/index/lite/lite-index.h" +#include <cstdint> +#include <memory> +#include <string> +#include <unordered_map> #include <vector> #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/file/filesystem.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/hit/hit.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/lite/doc-hit-info-iterator-term-lite.h" +#include "icing/index/lite/lite-index-header.h" #include "icing/index/term-id-codec.h" +#include "icing/legacy/index/icing-dynamic-trie.h" +#include "icing/legacy/index/icing-filesystem.h" +#include "icing/proto/scoring.pb.h" +#include "icing/proto/term.pb.h" #include "icing/schema/section.h" -#include "icing/store/suggestion-result-checker.h" +#include "icing/store/namespace-id.h" #include "icing/testing/always-false-suggestion-result-checker-impl.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" @@ -34,6 +47,8 @@ namespace { using ::testing::ElementsAre; using ::testing::Eq; using ::testing::IsEmpty; +using ::testing::IsFalse; +using ::testing::IsTrue; using ::testing::SizeIs; class LiteIndexTest : public testing::Test { @@ -41,62 +56,518 @@ class LiteIndexTest : public testing::Test { void SetUp() override { index_dir_ = GetTestTempDir() + "/test_dir"; ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str())); - - std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; - LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); - ICING_ASSERT_OK_AND_ASSIGN(lite_index_, - LiteIndex::Create(options, &icing_filesystem_)); - - ICING_ASSERT_OK_AND_ASSIGN( - term_id_codec_, - TermIdCodec::Create( - IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), - IcingDynamicTrie::max_value_index(options.lexicon_options))); } void TearDown() override { term_id_codec_.reset(); - lite_index_.reset(); ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str())); } std::string index_dir_; Filesystem filesystem_; IcingFilesystem icing_filesystem_; - std::unique_ptr<LiteIndex> lite_index_; std::unique_ptr<TermIdCodec> term_id_codec_; }; constexpr NamespaceId kNamespace0 = 0; -TEST_F(LiteIndexTest, LiteIndexAppendHits) { +TEST_F(LiteIndexTest, + LiteIndexFetchHits_sortAtQuerying_unsortedHitsBelowSortThreshold) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/false, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, + LiteIndex::Create(options, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( - uint32_t tvi, - lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + // Add some hits + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_tvi, + lite_index->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, - term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - Hit doc_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); + Hit foo_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); - Hit doc_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + Hit foo_hit1(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0)); - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit1)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t bar_tvi, + lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id, + term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE)); + Hit bar_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit bar_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit1)); + + // Check that unsorted hits does not exceed the sort threshold. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse()); + + // Check that hits are unsorted. Persist the data and pread from + // LiteIndexHeader. + ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + LiteIndex_HeaderImpl::HeaderData header_data; + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(4)); + + // Query the LiteIndex std::vector<DocHitInfo> hits1; - lite_index_->FetchHits( + lite_index->FetchHits( foo_term_id, kSectionIdMaskAll, /*only_from_prefix_sections=*/false, SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, /*namespace_checker=*/nullptr, &hits1); EXPECT_THAT(hits1, SizeIs(1)); - EXPECT_THAT(hits1.back().document_id(), Eq(0)); + EXPECT_THAT(hits1.back().document_id(), Eq(1)); // Check that the hits are coming from section 0 and section 1. EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11)); std::vector<DocHitInfo> hits2; AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker; - lite_index_->FetchHits( + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + &always_false_suggestion_result_checker, &hits2); + // Check that no hits are returned because they get skipped by the namespace + // checker. + EXPECT_THAT(hits2, IsEmpty()); + + // Check that hits are sorted after querying LiteIndex. Persist the data and + // pread from LiteIndexHeader. + ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(0)); +} + +TEST_F(LiteIndexTest, + LiteIndexFetchHits_sortAtIndexing_unsortedHitsBelowSortThreshold) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + // However note that in these tests we're unable to sort hits after + // indexing, as sorting performed by the string-section-indexing-handler + // after indexing all hits in an entire document, rather than after each + // AddHits() operation. + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + // Add some hits + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_tvi, + lite_index->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); + Hit foo_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit foo_hit1(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit1)); + + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t bar_tvi, + lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id, + term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE)); + Hit bar_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit bar_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit1)); + + // Check that unsorted hits does not exceed the sort threshold. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse()); + + // Check that hits are unsorted. Persist the data and pread from + // LiteIndexHeader. + ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + LiteIndex_HeaderImpl::HeaderData header_data; + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(4)); + + // Query the LiteIndex + std::vector<DocHitInfo> hits1; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits1); + EXPECT_THAT(hits1, SizeIs(1)); + EXPECT_THAT(hits1.back().document_id(), Eq(1)); + // Check that the hits are coming from section 0 and section 1. + EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11)); + + std::vector<DocHitInfo> hits2; + AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + &always_false_suggestion_result_checker, &hits2); + // Check that no hits are returned because they get skipped by the namespace + // checker. + EXPECT_THAT(hits2, IsEmpty()); + + // Check that hits are still unsorted after querying LiteIndex because the + // HitBuffer unsorted size is still below the sort threshold, and we've + // enabled sort_at_indexing. + // Persist the data and performing a pread on LiteIndexHeader. + ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(4)); +} + +TEST_F( + LiteIndexTest, + LiteIndexFetchHits_sortAtQuerying_unsortedHitsExceedingSortAtIndexThreshold) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + // However note that in these tests we're unable to sort hits after + // indexing, as sorting performed by the string-section-indexing-handler + // after indexing all hits in an entire document, rather than after each + // AddHits() operation. + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/false, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + // Create 4 hits for docs 0-2, and 2 hits for doc 3 -- 14 in total + // Doc 0 + Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit1(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit2(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 1 + Hit doc1_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit1(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit2(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit3(/*section_id=*/2, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 2 + Hit doc2_hit0(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit1(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit2(/*section_id=*/1, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit3(/*section_id=*/2, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 3 + Hit doc3_hit0(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc3_hit1(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + + // Create terms + // Foo + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_tvi, + lite_index->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); + // Bar + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t bar_tvi, + lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id, + term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE)); + // Baz + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t baz_tvi, + lite_index->InsertTerm("baz", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t baz_term_id, + term_id_codec_->EncodeTvi(baz_tvi, TviType::LITE)); + // Qux + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t qux_tvi, + lite_index->InsertTerm("qux", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t qux_term_id, + term_id_codec_->EncodeTvi(qux_tvi, TviType::LITE)); + + // Add 14 hits and make sure that termIds are added in unsorted order. + // Documents should be inserted in order as new incoming hits should have + // larger document ids. + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc0_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc0_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc0_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc2_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc2_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc3_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc3_hit1)); + // Verify that the HitBuffer has not been sorted. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsTrue()); + + // We now have the following in the hit buffer: + // <term>: {(docId, sectionId)...} + // foo: {(0, 0); (1, 0); (1, 1); (2, 0); (2, 2); (3, 0)} + // bar: {(0, 0); (1, 0); (1, 2)} + // baz: {(0, 1); (2, 0); (3, 0)} + // quz: {(0, 2); (2, 1)} + + // Search over the HitBuffer. + std::vector<DocHitInfo> hits1; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits1); + EXPECT_THAT(hits1, SizeIs(4)); + // Check that hits are retrieved in descending order of docIds. + EXPECT_THAT(hits1[0].document_id(), Eq(3)); + EXPECT_THAT(hits1[0].hit_section_ids_mask(), Eq(0b1)); + EXPECT_THAT(hits1[1].document_id(), Eq(2)); + EXPECT_THAT(hits1[1].hit_section_ids_mask(), Eq(0b101)); + EXPECT_THAT(hits1[2].document_id(), Eq(1)); + EXPECT_THAT(hits1[2].hit_section_ids_mask(), Eq(0b11)); + EXPECT_THAT(hits1[3].document_id(), Eq(0)); + EXPECT_THAT(hits1[3].hit_section_ids_mask(), Eq(0b1)); + + std::vector<DocHitInfo> hits2; + AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + &always_false_suggestion_result_checker, &hits2); + // Check that no hits are returned because they get skipped by the namespace + // checker. + EXPECT_THAT(hits2, IsEmpty()); + + std::vector<DocHitInfo> hits3; + lite_index->FetchHits( + bar_term_id, 0b1, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits3); + EXPECT_THAT(hits3, SizeIs(2)); + // Check fetching hits with SectionIdMask. + EXPECT_THAT(hits3[0].document_id(), Eq(1)); + EXPECT_THAT(hits3[1].hit_section_ids_mask(), Eq(0b1)); + EXPECT_THAT(hits3[1].document_id(), Eq(0)); + EXPECT_THAT(hits3[1].hit_section_ids_mask(), Eq(0b1)); + + // Check that the HitBuffer is sorted after the query call. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse()); +} + +TEST_F( + LiteIndexTest, + LiteIndexFetchHits_sortAtIndexing_unsortedHitsExceedingSortAtIndexThreshold) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + // Create 4 hits for docs 0-2, and 2 hits for doc 3 -- 14 in total + // Doc 0 + Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit1(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit2(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 1 + Hit doc1_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit1(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit2(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit3(/*section_id=*/2, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 2 + Hit doc2_hit0(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit1(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit2(/*section_id=*/1, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit3(/*section_id=*/2, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 3 + Hit doc3_hit0(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc3_hit1(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc3_hit2(/*section_id=*/1, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc3_hit3(/*section_id=*/2, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 4 + Hit doc4_hit0(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc4_hit1(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc4_hit2(/*section_id=*/1, /*document_id=*/4, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc4_hit3(/*section_id=*/2, /*document_id=*/4, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + + // Create terms + // Foo + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_tvi, + lite_index->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); + // Bar + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t bar_tvi, + lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id, + term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE)); + // Baz + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t baz_tvi, + lite_index->InsertTerm("baz", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t baz_term_id, + term_id_codec_->EncodeTvi(baz_tvi, TviType::LITE)); + // Qux + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t qux_tvi, + lite_index->InsertTerm("qux", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t qux_term_id, + term_id_codec_->EncodeTvi(qux_tvi, TviType::LITE)); + + // Add hits and make sure that termIds are added in unsorted order. + // Documents should be inserted in order as new incoming hits should have + // larger document ids. + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc0_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc0_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc0_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit3)); + // Adding 8 hits exceeds the sort threshold. However when sort_at_indexing is + // enabled, sorting is done in the string-section-indexing-handler rather than + // AddHit() itself, we need to invoke SortHits() manually. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsTrue()); + lite_index->SortHits(); + // Check that the HitBuffer is sorted. + ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + LiteIndex_HeaderImpl::HeaderData header_data; + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(0)); + + // Add 12 more hits so that sort threshold is exceeded again. + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc2_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc2_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc3_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc3_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc3_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc3_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc4_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc4_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc4_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc4_hit3)); + + // Adding these hits exceeds the sort threshold. However when sort_at_indexing + // is enabled, sorting is done in the string-section-indexing-handler rather + // than AddHit() itself. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsTrue()); + + // We now have the following in the hit buffer: + // <term>: {(docId, sectionId)...} + // foo: {(0, 0); (1, 0); (1, 1); (2, 0); (2, 2); (3, 0); (3, 1); (4, 1)} + // bar: {(0, 0); (1, 0); (1, 2); (3, 2); (4, 2)} + // baz: {(0, 1); (2, 0); (3, 0); (4, 0)} + // quz: {(0, 2); (2, 1); (4, 0)} + + // Search over the HitBuffer. + std::vector<DocHitInfo> hits1; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits1); + EXPECT_THAT(hits1, SizeIs(5)); + // Check that hits are retrieved in descending order of docIds. + EXPECT_THAT(hits1[0].document_id(), Eq(4)); + EXPECT_THAT(hits1[0].hit_section_ids_mask(), Eq(0b10)); + EXPECT_THAT(hits1[1].document_id(), Eq(3)); + EXPECT_THAT(hits1[1].hit_section_ids_mask(), Eq(0b11)); + EXPECT_THAT(hits1[2].document_id(), Eq(2)); + EXPECT_THAT(hits1[2].hit_section_ids_mask(), Eq(0b101)); + EXPECT_THAT(hits1[3].document_id(), Eq(1)); + EXPECT_THAT(hits1[3].hit_section_ids_mask(), Eq(0b11)); + EXPECT_THAT(hits1[4].document_id(), Eq(0)); + EXPECT_THAT(hits1[4].hit_section_ids_mask(), Eq(0b1)); + + std::vector<DocHitInfo> hits2; + AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker; + lite_index->FetchHits( foo_term_id, kSectionIdMaskAll, /*only_from_prefix_sections=*/false, SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, @@ -104,13 +575,119 @@ TEST_F(LiteIndexTest, LiteIndexAppendHits) { // Check that no hits are returned because they get skipped by the namespace // checker. EXPECT_THAT(hits2, IsEmpty()); + + std::vector<DocHitInfo> hits3; + lite_index->FetchHits( + bar_term_id, 0b1, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits3); + EXPECT_THAT(hits3, SizeIs(2)); + // Check fetching hits with SectionIdMask. + EXPECT_THAT(hits3[0].document_id(), Eq(1)); + EXPECT_THAT(hits3[1].hit_section_ids_mask(), Eq(0b1)); + EXPECT_THAT(hits3[1].document_id(), Eq(0)); + EXPECT_THAT(hits3[1].hit_section_ids_mask(), Eq(0b1)); + + // Check that the HitBuffer is sorted after the query call. FetchHits should + // sort before performing binary search if the HitBuffer unsorted size exceeds + // the sort threshold. Regardless of the sort_at_indexing config. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse()); + ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(0)); } TEST_F(LiteIndexTest, LiteIndexIterator) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + const std::string term = "foo"; + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index->InsertTerm(term, TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/3, + /*is_in_prefix_section=*/false); + Hit doc0_hit1(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/5, + /*is_in_prefix_section=*/false); + SectionIdMask doc0_section_id_mask = 0b11; + std::unordered_map<SectionId, Hit::TermFrequency> + expected_section_ids_tf_map0 = {{0, 3}, {1, 5}}; + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit1)); + + Hit doc1_hit1(/*section_id=*/1, /*document_id=*/1, /*term_frequency=*/7, + /*is_in_prefix_section=*/false); + Hit doc1_hit2(/*section_id=*/2, /*document_id=*/1, /*term_frequency=*/11, + /*is_in_prefix_section=*/false); + SectionIdMask doc1_section_id_mask = 0b110; + std::unordered_map<SectionId, Hit::TermFrequency> + expected_section_ids_tf_map1 = {{1, 7}, {2, 11}}; + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2)); + + std::unique_ptr<DocHitInfoIteratorTermLiteExact> iter = + std::make_unique<DocHitInfoIteratorTermLiteExact>( + term_id_codec_.get(), lite_index.get(), term, /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + /*need_hit_term_frequency=*/true); + + ASSERT_THAT(iter->Advance(), IsOk()); + EXPECT_THAT(iter->doc_hit_info().document_id(), Eq(1)); + EXPECT_THAT(iter->doc_hit_info().hit_section_ids_mask(), + Eq(doc1_section_id_mask)); + + std::vector<TermMatchInfo> matched_terms_stats; + iter->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + term, expected_section_ids_tf_map1))); + + ASSERT_THAT(iter->Advance(), IsOk()); + EXPECT_THAT(iter->doc_hit_info().document_id(), Eq(0)); + EXPECT_THAT(iter->doc_hit_info().hit_section_ids_mask(), + Eq(doc0_section_id_mask)); + matched_terms_stats.clear(); + iter->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + term, expected_section_ids_tf_map0))); +} + +TEST_F(LiteIndexTest, LiteIndexIterator_sortAtIndexingDisabled) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/false, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + const std::string term = "foo"; ICING_ASSERT_OK_AND_ASSIGN( uint32_t tvi, - lite_index_->InsertTerm(term, TermMatchType::PREFIX, kNamespace0)); + lite_index->InsertTerm(term, TermMatchType::PREFIX, kNamespace0)); ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/3, @@ -120,8 +697,8 @@ TEST_F(LiteIndexTest, LiteIndexIterator) { SectionIdMask doc0_section_id_mask = 0b11; std::unordered_map<SectionId, Hit::TermFrequency> expected_section_ids_tf_map0 = {{0, 3}, {1, 5}}; - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc0_hit0)); - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc0_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit1)); Hit doc1_hit1(/*section_id=*/1, /*document_id=*/1, /*term_frequency=*/7, /*is_in_prefix_section=*/false); @@ -130,12 +707,12 @@ TEST_F(LiteIndexTest, LiteIndexIterator) { SectionIdMask doc1_section_id_mask = 0b110; std::unordered_map<SectionId, Hit::TermFrequency> expected_section_ids_tf_map1 = {{1, 7}, {2, 11}}; - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit1)); - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2)); std::unique_ptr<DocHitInfoIteratorTermLiteExact> iter = std::make_unique<DocHitInfoIteratorTermLiteExact>( - term_id_codec_.get(), lite_index_.get(), term, /*term_start_index=*/0, + term_id_codec_.get(), lite_index.get(), term, /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, /*need_hit_term_frequency=*/true); diff --git a/icing/index/lite/lite-index_thread-safety_test.cc b/icing/index/lite/lite-index_thread-safety_test.cc index 7711f92..53aa6cd 100644 --- a/icing/index/lite/lite-index_thread-safety_test.cc +++ b/icing/index/lite/lite-index_thread-safety_test.cc @@ -19,12 +19,9 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "icing/index/lite/doc-hit-info-iterator-term-lite.h" #include "icing/index/lite/lite-index.h" #include "icing/index/term-id-codec.h" #include "icing/schema/section.h" -#include "icing/store/suggestion-result-checker.h" -#include "icing/testing/always-false-suggestion-result-checker-impl.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" @@ -52,7 +49,9 @@ class LiteIndexThreadSafetyTest : public testing::Test { std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx-thread-safety.index"; LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/64); ICING_ASSERT_OK_AND_ASSIGN(lite_index_, LiteIndex::Create(options, &icing_filesystem_)); diff --git a/icing/index/lite/term-id-hit-pair.h b/icing/index/lite/term-id-hit-pair.h index 61ec502..82bd010 100644 --- a/icing/index/lite/term-id-hit-pair.h +++ b/icing/index/lite/term-id-hit-pair.h @@ -73,6 +73,8 @@ class TermIdHitPair { return value_ == rhs.value_; } + bool operator<(const TermIdHitPair& rhs) const { return value_ < rhs.value_; } + private: Value value_; }; |