aboutsummaryrefslogtreecommitdiff
path: root/icing/index/lite/doc-hit-info-iterator-term-lite.cc
diff options
context:
space:
mode:
Diffstat (limited to 'icing/index/lite/doc-hit-info-iterator-term-lite.cc')
-rw-r--r--icing/index/lite/doc-hit-info-iterator-term-lite.cc94
1 files changed, 80 insertions, 14 deletions
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
index f215d63..597f5b5 100644
--- a/icing/index/lite/doc-hit-info-iterator-term-lite.cc
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
@@ -14,7 +14,9 @@
#include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
+#include <array>
#include <cstdint>
+#include <numeric>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/absl_ports/canonical_errors.h"
@@ -30,9 +32,9 @@ namespace lib {
namespace {
std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
- std::string mask(kMaxSectionId + 1, '0');
+ std::string mask(kTotalNumSections, '0');
for (SectionId i = kMaxSectionId; i >= 0; --i) {
- if (section_id_mask & (1U << i)) {
+ if (section_id_mask & (UINT64_C(1) << i)) {
mask[kMaxSectionId - i] = '1';
}
}
@@ -76,9 +78,11 @@ libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->GetTermId(term_));
ICING_ASSIGN_OR_RETURN(uint32_t term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
- lite_index_->AppendHits(term_id, section_restrict_mask_,
- /*only_from_prefix_sections=*/false,
- /*namespace_checker=*/nullptr, &cached_hits_);
+ lite_index_->AppendHits(
+ term_id, section_restrict_mask_,
+ /*only_from_prefix_sections=*/false,
+ /*namespace_checker=*/nullptr, &cached_hits_,
+ need_hit_term_frequency_ ? &cached_hit_term_frequency_ : nullptr);
cached_hits_idx_ = 0;
return libtextclassifier3::Status::OK;
}
@@ -99,9 +103,11 @@ DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
ICING_ASSIGN_OR_RETURN(
uint32_t term_id,
term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE));
- lite_index_->AppendHits(term_id, section_restrict_mask_,
- /*only_from_prefix_sections=*/!exact_match,
- /*namespace_checker=*/nullptr, &cached_hits_);
+ lite_index_->AppendHits(
+ term_id, section_restrict_mask_,
+ /*only_from_prefix_sections=*/!exact_match,
+ /*namespace_checker=*/nullptr, &cached_hits_,
+ need_hit_term_frequency_ ? &cached_hit_term_frequency_ : nullptr);
++terms_matched;
}
if (terms_matched > 1) {
@@ -111,23 +117,83 @@ DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
return libtextclassifier3::Status::OK;
}
-void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() {
+void DocHitInfoIteratorTermLitePrefix::SortDocumentIds() {
// Re-sort cached document_ids and merge sections.
- sort(cached_hits_.begin(), cached_hits_.end());
+ if (!need_hit_term_frequency_) {
+ // If we don't need to also sort cached_hit_term_frequency_ along with
+ // cached_hits_, then just simply sort cached_hits_.
+ sort(cached_hits_.begin(), cached_hits_.end());
+ } else {
+ // Sort cached_hit_term_frequency_ along with cached_hits_.
+ std::vector<int> indices(cached_hits_.size());
+ std::iota(indices.begin(), indices.end(), 0);
+ std::sort(indices.begin(), indices.end(), [this](int i, int j) {
+ return cached_hits_[i] < cached_hits_[j];
+ });
+ // Now indices is a map from sorted index to current index. In other words,
+ // the sorted cached_hits_[i] should be the current cached_hits_[indices[i]]
+ // for every valid i.
+ std::vector<bool> done(indices.size());
+ // Apply permutation
+ for (int i = 0; i < indices.size(); ++i) {
+ if (done[i]) {
+ continue;
+ }
+ done[i] = true;
+ int curr = i;
+ int next = indices[i];
+ // Since every finite permutation is formed by disjoint cycles, we can
+ // start with the current element, at index i, and swap the element at
+ // this position with whatever element that *should* be here. Then,
+ // continue to swap the original element, at its updated positions, with
+ // the element that should be occupying that position until the original
+ // element has reached *its* correct position. This completes applying the
+ // single cycle in the permutation.
+ while (next != i) {
+ std::swap(cached_hits_[curr], cached_hits_[next]);
+ std::swap(cached_hit_term_frequency_[curr],
+ cached_hit_term_frequency_[next]);
+ done[next] = true;
+ curr = next;
+ next = indices[next];
+ }
+ }
+ }
+}
+void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() {
+ SortDocumentIds();
int idx = 0;
for (int i = 1; i < cached_hits_.size(); ++i) {
- const DocHitInfo& hit_info = cached_hits_.at(i);
- DocHitInfo& collapsed_hit_info = cached_hits_.at(idx);
+ const DocHitInfo& hit_info = cached_hits_[i];
+ DocHitInfo& collapsed_hit_info = cached_hits_[idx];
if (collapsed_hit_info.document_id() == hit_info.document_id()) {
- collapsed_hit_info.MergeSectionsFrom(hit_info);
+ SectionIdMask curr_mask = hit_info.hit_section_ids_mask();
+ collapsed_hit_info.MergeSectionsFrom(curr_mask);
+ if (need_hit_term_frequency_) {
+ Hit::TermFrequencyArray& collapsed_term_frequency =
+ cached_hit_term_frequency_[idx];
+ while (curr_mask) {
+ SectionId section_id = __builtin_ctzll(curr_mask);
+ collapsed_term_frequency[section_id] =
+ cached_hit_term_frequency_[i][section_id];
+ curr_mask &= ~(UINT64_C(1) << section_id);
+ }
+ }
} else {
// New document_id.
- cached_hits_.at(++idx) = hit_info;
+ ++idx;
+ cached_hits_[idx] = hit_info;
+ if (need_hit_term_frequency_) {
+ cached_hit_term_frequency_[idx] = cached_hit_term_frequency_[i];
+ }
}
}
// idx points to last doc hit info.
cached_hits_.resize(idx + 1);
+ if (need_hit_term_frequency_) {
+ cached_hit_term_frequency_.resize(idx + 1);
+ }
}
std::string DocHitInfoIteratorTermLitePrefix::ToString() const {