Pull upstream changes.

Change-Id: I44831fdadcdb67f2e19570a35cb4c76faf8397f9
author: Terry Wang <tytytyww@google.com> 2020-09-24 13:39:23 -0700
committer: Terry Wang <tytytyww@google.com> 2020-09-24 13:39:23 -0700
commit: e15b6b66f871a71b73278c34d5c54f648f880c29 (patch)
tree: 61e187172a8802fae8e39f04ce69d3ae5c939f2e /icing/index/lite
parent: 9f1b9cf4dc93fa7bfee0a3637c93dc5b557aab30 (diff)
download: icing-e15b6b66f871a71b73278c34d5c54f648f880c29.tar.gz
2 files changed, 235 insertions, 0 deletions
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
new file mode 100644
index 0000000..a975f86
--- /dev/null
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
@@ -0,0 +1,126 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
+
+#include <cstdint>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/schema/section.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
+  std::string mask(kMaxSectionId + 1, '0');
+  for (SectionId i = kMaxSectionId; i >= 0; --i) {
+    if (section_id_mask & (1U << i)) {
+      mask[kMaxSectionId - i] = '1';
+    }
+  }
+  return mask;
+}
+
+}  // namespace
+
+libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() {
+  if (cached_hits_idx_ == -1) {
+    ICING_RETURN_IF_ERROR(RetrieveMoreHits());
+  } else {
+    ++cached_hits_idx_;
+  }
+  if (cached_hits_idx_ == -1 || cached_hits_idx_ >= cached_hits_.size()) {
+    // Nothing more for the iterator to return. Set these members to invalid
+    // values.
+    doc_hit_info_ = DocHitInfo();
+    hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+    return absl_ports::ResourceExhaustedError(
+        "No more DocHitInfos in iterator");
+  }
+  doc_hit_info_ = cached_hits_.at(cached_hits_idx_);
+  hit_intersect_section_ids_mask_ = doc_hit_info_.hit_section_ids_mask();
+  return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
+  // Exact match only. All hits in lite lexicon are exact.
+  ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->FindTerm(term_));
+  ICING_ASSIGN_OR_RETURN(uint32_t term_id,
+                         term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+  lite_index_->AppendHits(term_id, section_restrict_mask_,
+                          /*only_from_prefix_sections=*/false, &cached_hits_);
+  cached_hits_idx_ = 0;
+  return libtextclassifier3::Status::OK;
+}
+
+std::string DocHitInfoIteratorTermLiteExact::ToString() const {
+  return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
+                            term_);
+}
+
+libtextclassifier3::Status
+DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
+  // Take union of lite terms.
+  int term_len = term_.length();
+  int terms_matched = 0;
+  for (LiteIndex::PrefixIterator it = lite_index_->FindTermPrefixes(term_);
+       it.IsValid(); it.Advance()) {
+    bool exact_match = strlen(it.GetKey()) == term_len;
+    ICING_ASSIGN_OR_RETURN(
+        uint32_t term_id,
+        term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE));
+    lite_index_->AppendHits(term_id, section_restrict_mask_,
+                            /*only_from_prefix_sections=*/!exact_match,
+                            &cached_hits_);
+    ++terms_matched;
+  }
+  if (terms_matched > 1) {
+    SortAndDedupeDocumentIds();
+  }
+  cached_hits_idx_ = 0;
+  return libtextclassifier3::Status::OK;
+}
+
+void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() {
+  // Re-sort cached document_ids and merge sections.
+  sort(cached_hits_.begin(), cached_hits_.end());
+
+  int idx = 0;
+  for (int i = 1; i < cached_hits_.size(); ++i) {
+    const DocHitInfo& hit_info = cached_hits_.at(i);
+    DocHitInfo& collapsed_hit_info = cached_hits_.at(idx);
+    if (collapsed_hit_info.document_id() == hit_info.document_id()) {
+      collapsed_hit_info.MergeSectionsFrom(hit_info);
+    } else {
+      // New document_id.
+      cached_hits_.at(++idx) = hit_info;
+    }
+  }
+  // idx points to last doc hit info.
+  cached_hits_.resize(idx + 1);
+}
+
+std::string DocHitInfoIteratorTermLitePrefix::ToString() const {
+  return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
+                            term_, "*");
+}
+
+}  // namespace lib
+}  // namespace icing
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h
new file mode 100644
index 0000000..bd2de6d
--- /dev/null
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h
@@ -0,0 +1,109 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
+
+#include <cstdint>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/lite/lite-index.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/schema/section.h"
+
+namespace icing {
+namespace lib {
+
+class DocHitInfoIteratorTermLite : public DocHitInfoIterator {
+ public:
+  explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec,
+                                      LiteIndex* lite_index,
+                                      const std::string& term,
+                                      SectionIdMask section_restrict_mask)
+      : term_(term),
+        lite_index_(lite_index),
+        cached_hits_idx_(-1),
+        term_id_codec_(term_id_codec),
+        num_advance_calls_(0),
+        section_restrict_mask_(section_restrict_mask) {}
+
+  libtextclassifier3::Status Advance() override;
+
+  int32_t GetNumBlocksInspected() const override {
+    // TODO(b/137862424): Implement this once the main index is added.
+    return 0;
+  }
+  int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }
+
+ protected:
+  // Add DocHitInfos corresponding to term_ to cached_hits_.
+  virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
+
+  const std::string term_;
+  LiteIndex* const lite_index_;
+  // Stores hits retrieved from the index. This may only be a subset of the hits
+  // that are present in the index. Current value pointed to by the Iterator is
+  // tracked by cached_hits_idx_.
+  std::vector<DocHitInfo> cached_hits_;
+  int cached_hits_idx_;
+  const TermIdCodec* term_id_codec_;
+  int num_advance_calls_;
+  // Mask indicating which sections hits should be considered for.
+  // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired.
+  const SectionIdMask section_restrict_mask_;
+};
+
+class DocHitInfoIteratorTermLiteExact : public DocHitInfoIteratorTermLite {
+ public:
+  explicit DocHitInfoIteratorTermLiteExact(const TermIdCodec* term_id_codec,
+                                           LiteIndex* lite_index,
+                                           const std::string& term,
+                                           SectionIdMask section_id_mask)
+      : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
+                                   section_id_mask) {}
+
+  std::string ToString() const override;
+
+ protected:
+  libtextclassifier3::Status RetrieveMoreHits() override;
+};
+
+class DocHitInfoIteratorTermLitePrefix : public DocHitInfoIteratorTermLite {
+ public:
+  explicit DocHitInfoIteratorTermLitePrefix(const TermIdCodec* term_id_codec,
+                                            LiteIndex* lite_index,
+                                            const std::string& term,
+                                            SectionIdMask section_id_mask)
+      : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
+                                   section_id_mask) {}
+
+  std::string ToString() const override;
+
+ protected:
+  libtextclassifier3::Status RetrieveMoreHits() override;
+
+ private:
+  // After retrieving DocHitInfos from the index, a DocHitInfo for docid 1 and
+  // "foo" and a DocHitInfo for docid 1 and "fool". These DocHitInfos should be
+  // merged.
+  void SortAndDedupeDocumentIds();
+};
+
+}  // namespace lib
+}  // namespace icing
+
+#endif  // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
author	Terry Wang <tytytyww@google.com>	2020-09-24 13:39:23 -0700
committer	Terry Wang <tytytyww@google.com>	2020-09-24 13:39:23 -0700
commit	e15b6b66f871a71b73278c34d5c54f648f880c29 (patch)
tree	61e187172a8802fae8e39f04ce69d3ae5c939f2e /icing/index/lite
parent	9f1b9cf4dc93fa7bfee0a3637c93dc5b557aab30 (diff)
download	icing-e15b6b66f871a71b73278c34d5c54f648f880c29.tar.gz