aboutsummaryrefslogtreecommitdiff
path: root/icing/index/lite
diff options
context:
space:
mode:
authorTerry Wang <tytytyww@google.com>2020-09-24 13:39:23 -0700
committerTerry Wang <tytytyww@google.com>2020-09-24 13:39:23 -0700
commite15b6b66f871a71b73278c34d5c54f648f880c29 (patch)
tree61e187172a8802fae8e39f04ce69d3ae5c939f2e /icing/index/lite
parent9f1b9cf4dc93fa7bfee0a3637c93dc5b557aab30 (diff)
downloadicing-e15b6b66f871a71b73278c34d5c54f648f880c29.tar.gz
Pull upstream changes.
Change-Id: I44831fdadcdb67f2e19570a35cb4c76faf8397f9
Diffstat (limited to 'icing/index/lite')
-rw-r--r--icing/index/lite/doc-hit-info-iterator-term-lite.cc126
-rw-r--r--icing/index/lite/doc-hit-info-iterator-term-lite.h109
2 files changed, 235 insertions, 0 deletions
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
new file mode 100644
index 0000000..a975f86
--- /dev/null
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
@@ -0,0 +1,126 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
+
+#include <cstdint>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/schema/section.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
+ std::string mask(kMaxSectionId + 1, '0');
+ for (SectionId i = kMaxSectionId; i >= 0; --i) {
+ if (section_id_mask & (1U << i)) {
+ mask[kMaxSectionId - i] = '1';
+ }
+ }
+ return mask;
+}
+
+} // namespace
+
+libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() {
+ if (cached_hits_idx_ == -1) {
+ ICING_RETURN_IF_ERROR(RetrieveMoreHits());
+ } else {
+ ++cached_hits_idx_;
+ }
+ if (cached_hits_idx_ == -1 || cached_hits_idx_ >= cached_hits_.size()) {
+ // Nothing more for the iterator to return. Set these members to invalid
+ // values.
+ doc_hit_info_ = DocHitInfo();
+ hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+ return absl_ports::ResourceExhaustedError(
+ "No more DocHitInfos in iterator");
+ }
+ doc_hit_info_ = cached_hits_.at(cached_hits_idx_);
+ hit_intersect_section_ids_mask_ = doc_hit_info_.hit_section_ids_mask();
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
+ // Exact match only. All hits in lite lexicon are exact.
+ ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->FindTerm(term_));
+ ICING_ASSIGN_OR_RETURN(uint32_t term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ lite_index_->AppendHits(term_id, section_restrict_mask_,
+ /*only_from_prefix_sections=*/false, &cached_hits_);
+ cached_hits_idx_ = 0;
+ return libtextclassifier3::Status::OK;
+}
+
+std::string DocHitInfoIteratorTermLiteExact::ToString() const {
+ return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
+ term_);
+}
+
+libtextclassifier3::Status
+DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
+ // Take union of lite terms.
+ int term_len = term_.length();
+ int terms_matched = 0;
+ for (LiteIndex::PrefixIterator it = lite_index_->FindTermPrefixes(term_);
+ it.IsValid(); it.Advance()) {
+ bool exact_match = strlen(it.GetKey()) == term_len;
+ ICING_ASSIGN_OR_RETURN(
+ uint32_t term_id,
+ term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE));
+ lite_index_->AppendHits(term_id, section_restrict_mask_,
+ /*only_from_prefix_sections=*/!exact_match,
+ &cached_hits_);
+ ++terms_matched;
+ }
+ if (terms_matched > 1) {
+ SortAndDedupeDocumentIds();
+ }
+ cached_hits_idx_ = 0;
+ return libtextclassifier3::Status::OK;
+}
+
+void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() {
+ // Re-sort cached document_ids and merge sections.
+ sort(cached_hits_.begin(), cached_hits_.end());
+
+ int idx = 0;
+ for (int i = 1; i < cached_hits_.size(); ++i) {
+ const DocHitInfo& hit_info = cached_hits_.at(i);
+ DocHitInfo& collapsed_hit_info = cached_hits_.at(idx);
+ if (collapsed_hit_info.document_id() == hit_info.document_id()) {
+ collapsed_hit_info.MergeSectionsFrom(hit_info);
+ } else {
+ // New document_id.
+ cached_hits_.at(++idx) = hit_info;
+ }
+ }
+ // idx points to last doc hit info.
+ cached_hits_.resize(idx + 1);
+}
+
+std::string DocHitInfoIteratorTermLitePrefix::ToString() const {
+ return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
+ term_, "*");
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h
new file mode 100644
index 0000000..bd2de6d
--- /dev/null
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h
@@ -0,0 +1,109 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
+
+#include <cstdint>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/lite/lite-index.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/schema/section.h"
+
+namespace icing {
+namespace lib {
+
+class DocHitInfoIteratorTermLite : public DocHitInfoIterator {
+ public:
+ explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ SectionIdMask section_restrict_mask)
+ : term_(term),
+ lite_index_(lite_index),
+ cached_hits_idx_(-1),
+ term_id_codec_(term_id_codec),
+ num_advance_calls_(0),
+ section_restrict_mask_(section_restrict_mask) {}
+
+ libtextclassifier3::Status Advance() override;
+
+ int32_t GetNumBlocksInspected() const override {
+ // TODO(b/137862424): Implement this once the main index is added.
+ return 0;
+ }
+ int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }
+
+ protected:
+ // Add DocHitInfos corresponding to term_ to cached_hits_.
+ virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
+
+ const std::string term_;
+ LiteIndex* const lite_index_;
+ // Stores hits retrieved from the index. This may only be a subset of the hits
+ // that are present in the index. Current value pointed to by the Iterator is
+ // tracked by cached_hits_idx_.
+ std::vector<DocHitInfo> cached_hits_;
+ int cached_hits_idx_;
+ const TermIdCodec* term_id_codec_;
+ int num_advance_calls_;
+ // Mask indicating which sections hits should be considered for.
+ // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired.
+ const SectionIdMask section_restrict_mask_;
+};
+
+class DocHitInfoIteratorTermLiteExact : public DocHitInfoIteratorTermLite {
+ public:
+ explicit DocHitInfoIteratorTermLiteExact(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ SectionIdMask section_id_mask)
+ : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
+ section_id_mask) {}
+
+ std::string ToString() const override;
+
+ protected:
+ libtextclassifier3::Status RetrieveMoreHits() override;
+};
+
+class DocHitInfoIteratorTermLitePrefix : public DocHitInfoIteratorTermLite {
+ public:
+ explicit DocHitInfoIteratorTermLitePrefix(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ SectionIdMask section_id_mask)
+ : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
+ section_id_mask) {}
+
+ std::string ToString() const override;
+
+ protected:
+ libtextclassifier3::Status RetrieveMoreHits() override;
+
+ private:
+ // After retrieving DocHitInfos from the index, a DocHitInfo for docid 1 and
+ // "foo" and a DocHitInfo for docid 1 and "fool". These DocHitInfos should be
+ // merged.
+ void SortAndDedupeDocumentIds();
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_