// Copyright (C) 2019 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_ #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_ #include #include #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/lite/lite-index.h" #include "icing/index/term-id-codec.h" #include "icing/schema/section.h" namespace icing { namespace lib { class DocHitInfoIteratorTermLite : public DocHitInfoLeafIterator { public: explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec, LiteIndex* lite_index, const std::string& term, int term_start_index, int unnormalized_term_length, SectionIdMask section_restrict_mask, bool need_hit_term_frequency) : term_(term), term_start_index_(term_start_index), unnormalized_term_length_(unnormalized_term_length), lite_index_(lite_index), cached_hits_idx_(-1), term_id_codec_(term_id_codec), num_advance_calls_(0), section_restrict_mask_(section_restrict_mask), need_hit_term_frequency_(need_hit_term_frequency) {} libtextclassifier3::Status Advance() override; libtextclassifier3::StatusOr TrimRightMostNode() && override; CallStats GetCallStats() const override { return CallStats( /*num_leaf_advance_calls_lite_index_in=*/num_advance_calls_, /*num_leaf_advance_calls_main_index_in=*/0, /*num_leaf_advance_calls_integer_index_in=*/0, /*num_leaf_advance_calls_no_index_in=*/0, /*num_blocks_inspected_in=*/0); } void PopulateMatchedTermsStats( std::vector* matched_terms_stats, SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (cached_hits_idx_ == -1 || cached_hits_idx_ >= cached_hits_.size()) { // Current hit isn't valid, return. return; } SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask() & filtering_section_mask; SectionIdMask section_mask_copy = section_mask; std::array section_term_frequencies = {Hit::kNoTermFrequency}; while (section_mask_copy) { SectionId section_id = __builtin_ctzll(section_mask_copy); if (need_hit_term_frequency_) { section_term_frequencies.at(section_id) = cached_hit_term_frequency_.at(cached_hits_idx_)[section_id]; } section_mask_copy &= ~(UINT64_C(1) << section_id); } TermMatchInfo term_stats(term_, section_mask, std::move(section_term_frequencies)); for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) { if (cur_term_stats.term == term_stats.term) { // Same docId and same term, we don't need to add the term and the term // frequency should always be the same return; } } matched_terms_stats->push_back(std::move(term_stats)); } protected: // Add DocHitInfos corresponding to term_ to cached_hits_. // // Returns: // - OK, on success // - NOT_FOUND if no term matching term_ was found in the lexicon. // - INVALID_ARGUMENT if unable to properly encode the termid virtual libtextclassifier3::Status RetrieveMoreHits() = 0; const std::string term_; // The start index of the given term in the search query int term_start_index_; // The length of the given unnormalized term in the search query int unnormalized_term_length_; LiteIndex* const lite_index_; // Stores hits retrieved from the index. This may only be a subset of the hits // that are present in the index. Current value pointed to by the Iterator is // tracked by cached_hits_idx_. std::vector cached_hits_; std::vector cached_hit_term_frequency_; int cached_hits_idx_; const TermIdCodec* term_id_codec_; int num_advance_calls_; // Mask indicating which sections hits should be considered for. // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired. const SectionIdMask section_restrict_mask_; const bool need_hit_term_frequency_; }; class DocHitInfoIteratorTermLiteExact : public DocHitInfoIteratorTermLite { public: explicit DocHitInfoIteratorTermLiteExact(const TermIdCodec* term_id_codec, LiteIndex* lite_index, const std::string& term, int term_start_index, int unnormalized_term_length, SectionIdMask section_id_mask, bool need_hit_term_frequency) : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term, term_start_index, unnormalized_term_length, section_id_mask, need_hit_term_frequency) {} std::string ToString() const override; protected: libtextclassifier3::Status RetrieveMoreHits() override; }; class DocHitInfoIteratorTermLitePrefix : public DocHitInfoIteratorTermLite { public: explicit DocHitInfoIteratorTermLitePrefix(const TermIdCodec* term_id_codec, LiteIndex* lite_index, const std::string& term, int term_start_index, int unnormalized_term_length, SectionIdMask section_id_mask, bool need_hit_term_frequency) : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term, term_start_index, unnormalized_term_length, section_id_mask, need_hit_term_frequency) {} std::string ToString() const override; protected: libtextclassifier3::Status RetrieveMoreHits() override; private: // After retrieving DocHitInfos from the index, a DocHitInfo for docid 1 and // "foo" and a DocHitInfo for docid 1 and "fool". These DocHitInfos should be // merged. void SortDocumentIds(); void SortAndDedupeDocumentIds(); }; } // namespace lib } // namespace icing #endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_