diff options
Diffstat (limited to 'icing/index/lite/doc-hit-info-iterator-term-lite.cc')
-rw-r--r-- | icing/index/lite/doc-hit-info-iterator-term-lite.cc | 94 |
1 files changed, 80 insertions, 14 deletions
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc index f215d63..597f5b5 100644 --- a/icing/index/lite/doc-hit-info-iterator-term-lite.cc +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc @@ -14,7 +14,9 @@ #include "icing/index/lite/doc-hit-info-iterator-term-lite.h" +#include <array> #include <cstdint> +#include <numeric> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/absl_ports/canonical_errors.h" @@ -30,9 +32,9 @@ namespace lib { namespace { std::string SectionIdMaskToString(SectionIdMask section_id_mask) { - std::string mask(kMaxSectionId + 1, '0'); + std::string mask(kTotalNumSections, '0'); for (SectionId i = kMaxSectionId; i >= 0; --i) { - if (section_id_mask & (1U << i)) { + if (section_id_mask & (UINT64_C(1) << i)) { mask[kMaxSectionId - i] = '1'; } } @@ -76,9 +78,11 @@ libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() { ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->GetTermId(term_)); ICING_ASSIGN_OR_RETURN(uint32_t term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - lite_index_->AppendHits(term_id, section_restrict_mask_, - /*only_from_prefix_sections=*/false, - /*namespace_checker=*/nullptr, &cached_hits_); + lite_index_->AppendHits( + term_id, section_restrict_mask_, + /*only_from_prefix_sections=*/false, + /*namespace_checker=*/nullptr, &cached_hits_, + need_hit_term_frequency_ ? &cached_hit_term_frequency_ : nullptr); cached_hits_idx_ = 0; return libtextclassifier3::Status::OK; } @@ -99,9 +103,11 @@ DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() { ICING_ASSIGN_OR_RETURN( uint32_t term_id, term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE)); - lite_index_->AppendHits(term_id, section_restrict_mask_, - /*only_from_prefix_sections=*/!exact_match, - /*namespace_checker=*/nullptr, &cached_hits_); + lite_index_->AppendHits( + term_id, section_restrict_mask_, + /*only_from_prefix_sections=*/!exact_match, + /*namespace_checker=*/nullptr, &cached_hits_, + need_hit_term_frequency_ ? &cached_hit_term_frequency_ : nullptr); ++terms_matched; } if (terms_matched > 1) { @@ -111,23 +117,83 @@ DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() { return libtextclassifier3::Status::OK; } -void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() { +void DocHitInfoIteratorTermLitePrefix::SortDocumentIds() { // Re-sort cached document_ids and merge sections. - sort(cached_hits_.begin(), cached_hits_.end()); + if (!need_hit_term_frequency_) { + // If we don't need to also sort cached_hit_term_frequency_ along with + // cached_hits_, then just simply sort cached_hits_. + sort(cached_hits_.begin(), cached_hits_.end()); + } else { + // Sort cached_hit_term_frequency_ along with cached_hits_. + std::vector<int> indices(cached_hits_.size()); + std::iota(indices.begin(), indices.end(), 0); + std::sort(indices.begin(), indices.end(), [this](int i, int j) { + return cached_hits_[i] < cached_hits_[j]; + }); + // Now indices is a map from sorted index to current index. In other words, + // the sorted cached_hits_[i] should be the current cached_hits_[indices[i]] + // for every valid i. + std::vector<bool> done(indices.size()); + // Apply permutation + for (int i = 0; i < indices.size(); ++i) { + if (done[i]) { + continue; + } + done[i] = true; + int curr = i; + int next = indices[i]; + // Since every finite permutation is formed by disjoint cycles, we can + // start with the current element, at index i, and swap the element at + // this position with whatever element that *should* be here. Then, + // continue to swap the original element, at its updated positions, with + // the element that should be occupying that position until the original + // element has reached *its* correct position. This completes applying the + // single cycle in the permutation. + while (next != i) { + std::swap(cached_hits_[curr], cached_hits_[next]); + std::swap(cached_hit_term_frequency_[curr], + cached_hit_term_frequency_[next]); + done[next] = true; + curr = next; + next = indices[next]; + } + } + } +} +void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() { + SortDocumentIds(); int idx = 0; for (int i = 1; i < cached_hits_.size(); ++i) { - const DocHitInfo& hit_info = cached_hits_.at(i); - DocHitInfo& collapsed_hit_info = cached_hits_.at(idx); + const DocHitInfo& hit_info = cached_hits_[i]; + DocHitInfo& collapsed_hit_info = cached_hits_[idx]; if (collapsed_hit_info.document_id() == hit_info.document_id()) { - collapsed_hit_info.MergeSectionsFrom(hit_info); + SectionIdMask curr_mask = hit_info.hit_section_ids_mask(); + collapsed_hit_info.MergeSectionsFrom(curr_mask); + if (need_hit_term_frequency_) { + Hit::TermFrequencyArray& collapsed_term_frequency = + cached_hit_term_frequency_[idx]; + while (curr_mask) { + SectionId section_id = __builtin_ctzll(curr_mask); + collapsed_term_frequency[section_id] = + cached_hit_term_frequency_[i][section_id]; + curr_mask &= ~(UINT64_C(1) << section_id); + } + } } else { // New document_id. - cached_hits_.at(++idx) = hit_info; + ++idx; + cached_hits_[idx] = hit_info; + if (need_hit_term_frequency_) { + cached_hit_term_frequency_[idx] = cached_hit_term_frequency_[i]; + } } } // idx points to last doc hit info. cached_hits_.resize(idx + 1); + if (need_hit_term_frequency_) { + cached_hit_term_frequency_.resize(idx + 1); + } } std::string DocHitInfoIteratorTermLitePrefix::ToString() const { |