diff options
author | Terry Wang <tytytyww@google.com> | 2020-09-24 13:39:23 -0700 |
---|---|---|
committer | Terry Wang <tytytyww@google.com> | 2020-09-24 13:39:23 -0700 |
commit | e15b6b66f871a71b73278c34d5c54f648f880c29 (patch) | |
tree | 61e187172a8802fae8e39f04ce69d3ae5c939f2e /icing/index/lite | |
parent | 9f1b9cf4dc93fa7bfee0a3637c93dc5b557aab30 (diff) | |
download | icing-e15b6b66f871a71b73278c34d5c54f648f880c29.tar.gz |
Pull upstream changes.
Change-Id: I44831fdadcdb67f2e19570a35cb4c76faf8397f9
Diffstat (limited to 'icing/index/lite')
-rw-r--r-- | icing/index/lite/doc-hit-info-iterator-term-lite.cc | 126 | ||||
-rw-r--r-- | icing/index/lite/doc-hit-info-iterator-term-lite.h | 109 |
2 files changed, 235 insertions, 0 deletions
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc new file mode 100644 index 0000000..a975f86 --- /dev/null +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc @@ -0,0 +1,126 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/lite/doc-hit-info-iterator-term-lite.h" + +#include <cstdint> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/schema/section.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +std::string SectionIdMaskToString(SectionIdMask section_id_mask) { + std::string mask(kMaxSectionId + 1, '0'); + for (SectionId i = kMaxSectionId; i >= 0; --i) { + if (section_id_mask & (1U << i)) { + mask[kMaxSectionId - i] = '1'; + } + } + return mask; +} + +} // namespace + +libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() { + if (cached_hits_idx_ == -1) { + ICING_RETURN_IF_ERROR(RetrieveMoreHits()); + } else { + ++cached_hits_idx_; + } + if (cached_hits_idx_ == -1 || cached_hits_idx_ >= cached_hits_.size()) { + // Nothing more for the iterator to return. Set these members to invalid + // values. + doc_hit_info_ = DocHitInfo(); + hit_intersect_section_ids_mask_ = kSectionIdMaskNone; + return absl_ports::ResourceExhaustedError( + "No more DocHitInfos in iterator"); + } + doc_hit_info_ = cached_hits_.at(cached_hits_idx_); + hit_intersect_section_ids_mask_ = doc_hit_info_.hit_section_ids_mask(); + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() { + // Exact match only. All hits in lite lexicon are exact. + ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->FindTerm(term_)); + ICING_ASSIGN_OR_RETURN(uint32_t term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + lite_index_->AppendHits(term_id, section_restrict_mask_, + /*only_from_prefix_sections=*/false, &cached_hits_); + cached_hits_idx_ = 0; + return libtextclassifier3::Status::OK; +} + +std::string DocHitInfoIteratorTermLiteExact::ToString() const { + return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":", + term_); +} + +libtextclassifier3::Status +DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() { + // Take union of lite terms. + int term_len = term_.length(); + int terms_matched = 0; + for (LiteIndex::PrefixIterator it = lite_index_->FindTermPrefixes(term_); + it.IsValid(); it.Advance()) { + bool exact_match = strlen(it.GetKey()) == term_len; + ICING_ASSIGN_OR_RETURN( + uint32_t term_id, + term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE)); + lite_index_->AppendHits(term_id, section_restrict_mask_, + /*only_from_prefix_sections=*/!exact_match, + &cached_hits_); + ++terms_matched; + } + if (terms_matched > 1) { + SortAndDedupeDocumentIds(); + } + cached_hits_idx_ = 0; + return libtextclassifier3::Status::OK; +} + +void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() { + // Re-sort cached document_ids and merge sections. + sort(cached_hits_.begin(), cached_hits_.end()); + + int idx = 0; + for (int i = 1; i < cached_hits_.size(); ++i) { + const DocHitInfo& hit_info = cached_hits_.at(i); + DocHitInfo& collapsed_hit_info = cached_hits_.at(idx); + if (collapsed_hit_info.document_id() == hit_info.document_id()) { + collapsed_hit_info.MergeSectionsFrom(hit_info); + } else { + // New document_id. + cached_hits_.at(++idx) = hit_info; + } + } + // idx points to last doc hit info. + cached_hits_.resize(idx + 1); +} + +std::string DocHitInfoIteratorTermLitePrefix::ToString() const { + return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":", + term_, "*"); +} + +} // namespace lib +} // namespace icing diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h new file mode 100644 index 0000000..bd2de6d --- /dev/null +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h @@ -0,0 +1,109 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_ +#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_ + +#include <cstdint> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/lite/lite-index.h" +#include "icing/index/term-id-codec.h" +#include "icing/schema/section.h" + +namespace icing { +namespace lib { + +class DocHitInfoIteratorTermLite : public DocHitInfoIterator { + public: + explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec, + LiteIndex* lite_index, + const std::string& term, + SectionIdMask section_restrict_mask) + : term_(term), + lite_index_(lite_index), + cached_hits_idx_(-1), + term_id_codec_(term_id_codec), + num_advance_calls_(0), + section_restrict_mask_(section_restrict_mask) {} + + libtextclassifier3::Status Advance() override; + + int32_t GetNumBlocksInspected() const override { + // TODO(b/137862424): Implement this once the main index is added. + return 0; + } + int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; } + + protected: + // Add DocHitInfos corresponding to term_ to cached_hits_. + virtual libtextclassifier3::Status RetrieveMoreHits() = 0; + + const std::string term_; + LiteIndex* const lite_index_; + // Stores hits retrieved from the index. This may only be a subset of the hits + // that are present in the index. Current value pointed to by the Iterator is + // tracked by cached_hits_idx_. + std::vector<DocHitInfo> cached_hits_; + int cached_hits_idx_; + const TermIdCodec* term_id_codec_; + int num_advance_calls_; + // Mask indicating which sections hits should be considered for. + // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired. + const SectionIdMask section_restrict_mask_; +}; + +class DocHitInfoIteratorTermLiteExact : public DocHitInfoIteratorTermLite { + public: + explicit DocHitInfoIteratorTermLiteExact(const TermIdCodec* term_id_codec, + LiteIndex* lite_index, + const std::string& term, + SectionIdMask section_id_mask) + : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term, + section_id_mask) {} + + std::string ToString() const override; + + protected: + libtextclassifier3::Status RetrieveMoreHits() override; +}; + +class DocHitInfoIteratorTermLitePrefix : public DocHitInfoIteratorTermLite { + public: + explicit DocHitInfoIteratorTermLitePrefix(const TermIdCodec* term_id_codec, + LiteIndex* lite_index, + const std::string& term, + SectionIdMask section_id_mask) + : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term, + section_id_mask) {} + + std::string ToString() const override; + + protected: + libtextclassifier3::Status RetrieveMoreHits() override; + + private: + // After retrieving DocHitInfos from the index, a DocHitInfo for docid 1 and + // "foo" and a DocHitInfo for docid 1 and "fool". These DocHitInfos should be + // merged. + void SortAndDedupeDocumentIds(); +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_ |