Pull upstream changes.

Change-Id: I794757716961569b5c02171cfc82785efb2cf106
author: Terry Wang <tytytyww@google.com> 2020-10-01 18:53:44 -0700
committer: Terry Wang <tytytyww@google.com> 2020-10-01 18:53:44 -0700
commit: 5abfe5bcac00f4f188d3d8041fa97bf77206b577 (patch)
tree: 69376254e2e5f886cb0d26cdb547001f8e45f372 /icing/index/lite
parent: e15b6b66f871a71b73278c34d5c54f648f880c29 (diff)
download: icing-5abfe5bcac00f4f188d3d8041fa97bf77206b577.tar.gz
4 files changed, 182 insertions, 76 deletions
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
index a975f86..1f1c296 100644
--- a/icing/index/lite/doc-hit-info-iterator-term-lite.cc
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
@@ -61,7 +61,7 @@ libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() {
 
 libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
   // Exact match only. All hits in lite lexicon are exact.
-  ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->FindTerm(term_));
+  ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->GetTermId(term_));
   ICING_ASSIGN_OR_RETURN(uint32_t term_id,
                          term_id_codec_->EncodeTvi(tvi, TviType::LITE));
   lite_index_->AppendHits(term_id, section_restrict_mask_,
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
index a72402e..89240ee 100644
--- a/icing/index/lite/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -65,8 +65,8 @@ size_t header_size() { return sizeof(IcingLiteIndex_HeaderImpl::HeaderData); }
 
 }  // namespace
 
-const LiteIndex::Element::Value LiteIndex::Element::kInvalidValue =
-    LiteIndex::Element(0, Hit()).value();
+const TermIdHitPair::Value TermIdHitPair::kInvalidValue =
+    TermIdHitPair(0, Hit()).value();
 
 libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create(
     const LiteIndex::Options& options, const IcingFilesystem* filesystem) {
@@ -163,7 +163,7 @@ libtextclassifier3::Status LiteIndex::Initialize() {
     header_->Reset();
 
     if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
-                          sizeof(Element::Value), header_->cur_size(),
+                          sizeof(TermIdHitPair::Value), header_->cur_size(),
                           options_.hit_buffer_size, &hit_buffer_crc_, true)) {
       status = absl_ports::InternalError("Failed to initialize new hit buffer");
       goto error;
@@ -177,7 +177,7 @@ libtextclassifier3::Status LiteIndex::Initialize() {
             header_mmap_.address()));
 
     if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
-                          sizeof(Element::Value), header_->cur_size(),
+                          sizeof(TermIdHitPair::Value), header_->cur_size(),
                           options_.hit_buffer_size, &hit_buffer_crc_, true)) {
       status = absl_ports::InternalError(
           "Failed to re-initialize existing hit buffer");
@@ -312,20 +312,21 @@ libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) {
 
   header_->set_last_added_docid(hit.document_id());
 
-  Element elt(term_id, hit);
+  TermIdHitPair term_id_hit_pair(term_id, hit);
   uint32_t cur_size = header_->cur_size();
-  Element::Value* valp = hit_buffer_.GetMutableMem<Element::Value>(cur_size, 1);
+  TermIdHitPair::Value* valp =
+      hit_buffer_.GetMutableMem<TermIdHitPair::Value>(cur_size, 1);
   if (valp == nullptr) {
     return absl_ports::ResourceExhaustedError(
         "Allocating more space in hit buffer failed!");
   }
-  *valp = elt.value();
+  *valp = term_id_hit_pair.value();
   header_->set_cur_size(cur_size + 1);
 
   return libtextclassifier3::Status::OK;
 }
 
-libtextclassifier3::StatusOr<uint32_t> LiteIndex::FindTerm(
+libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId(
     const std::string& term) const {
   char dummy;
   uint32_t tvi;
@@ -336,16 +337,17 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::FindTerm(
   return tvi;
 }
 
-uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
-                               bool only_from_prefix_sections,
-                               std::vector<DocHitInfo>* hits_out) {
-  uint32_t count = 0;
+int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
+                          bool only_from_prefix_sections,
+                          std::vector<DocHitInfo>* hits_out) {
+  int count = 0;
   DocumentId last_document_id = kInvalidDocumentId;
   for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) {
-    Element elt(hit_buffer_.array_cast<Element>()[idx]);
-    if (elt.term_id() != term_id) break;
+    TermIdHitPair term_id_hit_pair(
+        hit_buffer_.array_cast<TermIdHitPair>()[idx]);
+    if (term_id_hit_pair.term_id() != term_id) break;
 
-    const Hit& hit = elt.hit();
+    const Hit& hit = term_id_hit_pair.hit();
     // Check sections.
     if (((1u << hit.section_id()) & section_id_mask) == 0) {
       continue;
@@ -356,7 +358,7 @@ uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
     }
     DocumentId document_id = hit.document_id();
     if (document_id != last_document_id) {
-      count++;
+      ++count;
       if (hits_out != nullptr) {
         hits_out->push_back(DocHitInfo(document_id));
       }
@@ -369,7 +371,7 @@ uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
   return count;
 }
 
-uint32_t LiteIndex::CountHits(uint32_t term_id) {
+int LiteIndex::CountHits(uint32_t term_id) {
   return AppendHits(term_id, kSectionIdMaskAll,
                     /*only_from_prefix_sections=*/false,
                     /*hits_out=*/nullptr);
@@ -421,8 +423,8 @@ uint32_t LiteIndex::Seek(uint32_t term_id) {
     IcingTimer timer;
 
     auto* array_start =
-        hit_buffer_.GetMutableMem<Element::Value>(0, header_->cur_size());
-    Element::Value* sort_start = array_start + header_->searchable_end();
+        hit_buffer_.GetMutableMem<TermIdHitPair::Value>(0, header_->cur_size());
+    TermIdHitPair::Value* sort_start = array_start + header_->searchable_end();
     std::sort(sort_start, array_start + header_->cur_size());
 
     // Now merge with previous region. Since the previous region is already
@@ -445,11 +447,13 @@ uint32_t LiteIndex::Seek(uint32_t term_id) {
 
   // Binary search for our term_id.  Make sure we get the first
   // element.  Using kBeginSortValue ensures this for the hit value.
-  Element elt(term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore));
+  TermIdHitPair term_id_hit_pair(
+      term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore));
 
-  const Element::Value* array = hit_buffer_.array_cast<Element::Value>();
-  const Element::Value* ptr =
-      std::lower_bound(array, array + header_->cur_size(), elt.value());
+  const TermIdHitPair::Value* array =
+      hit_buffer_.array_cast<TermIdHitPair::Value>();
+  const TermIdHitPair::Value* ptr = std::lower_bound(
+      array, array + header_->cur_size(), term_id_hit_pair.value());
   return ptr - array;
 }
 
diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h
index b60a947..27ccf33 100644
--- a/icing/index/lite/lite-index.h
+++ b/icing/index/lite/lite-index.h
@@ -30,6 +30,7 @@
 #include "icing/file/filesystem.h"
 #include "icing/index/hit/doc-hit-info.h"
 #include "icing/index/hit/hit.h"
+#include "icing/index/lite/term-id-hit-pair.h"
 #include "icing/legacy/index/icing-array-storage.h"
 #include "icing/legacy/index/icing-dynamic-trie.h"
 #include "icing/legacy/index/icing-filesystem.h"
@@ -49,49 +50,6 @@ namespace lib {
 class LiteIndex {
  public:
   // An entry in the hit buffer.
-  class Element {
-   public:
-    // Layout bits: 24 termid + 32 hit value + 8 hit score.
-    using Value = uint64_t;
-
-    static constexpr int kTermIdBits = 24;
-    static constexpr int kHitValueBits = sizeof(Hit::Value) * 8;
-    static constexpr int kHitScoreBits = sizeof(Hit::Score) * 8;
-
-    static const Value kInvalidValue;
-
-    explicit Element(Value v = kInvalidValue) : value_(v) {}
-
-    Element(uint32_t term_id, const Hit& hit) {
-      static_assert(
-          kTermIdBits + kHitValueBits + kHitScoreBits <= sizeof(Value) * 8,
-          "LiteIndexElementTooBig");
-
-      value_ = 0;
-      // Term id goes into the most significant bits because it takes
-      // precedent in sorts.
-      bit_util::BitfieldSet(term_id, kHitValueBits + kHitScoreBits, kTermIdBits,
-                            &value_);
-      bit_util::BitfieldSet(hit.value(), kHitScoreBits, kHitValueBits, &value_);
-      bit_util::BitfieldSet(hit.score(), 0, kHitScoreBits, &value_);
-    }
-
-    uint32_t term_id() const {
-      return bit_util::BitfieldGet(value_, kHitValueBits + kHitScoreBits,
-                                   kTermIdBits);
-    }
-
-    Hit hit() const {
-      return Hit(bit_util::BitfieldGet(value_, kHitScoreBits, kHitValueBits),
-                 bit_util::BitfieldGet(value_, 0, kHitScoreBits));
-    }
-
-    Value value() const { return value_; }
-
-   private:
-    Value value_;
-  };
-
   using Options = IcingLiteIndexOptions;
 
   // Updates checksum of subcomponents.
@@ -126,7 +84,7 @@ class LiteIndex {
   Crc32 ComputeChecksum();
 
   // Returns term_id if term found, NOT_FOUND otherwise.
-  libtextclassifier3::StatusOr<uint32_t> FindTerm(
+  libtextclassifier3::StatusOr<uint32_t> GetTermId(
       const std::string& term) const;
 
   // Returns an iterator for all terms for which 'prefix' is a prefix.
@@ -170,25 +128,89 @@ class LiteIndex {
                                                   NamespaceId namespace_id);
 
   // Append hit to buffer. term_id must be encoded using the same term_id_codec
-  // supplied to the index constructor. Returns non-OK if hit cannot be added
-  // (either due to hit buffer or file system capacity reached).
+  // supplied to the index constructor.
+  // RETURNS:
+  //  - OK if hit was successfully added
+  //  - RESOURCE_EXHAUSTED if hit could not be added (either due to hit buffer
+  //    or file system capacity reached).
   libtextclassifier3::Status AddHit(uint32_t term_id, const Hit& hit);
 
   // Add all hits with term_id from the sections specified in section_id_mask,
   // skipping hits in non-prefix sections if only_from_prefix_sections is true,
-  // to hits_out.
-  uint32_t AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
-                      bool only_from_prefix_sections,
-                      std::vector<DocHitInfo>* hits_out);
+  // to hits_out. If hits_out is nullptr, no hits will be added.
+  //
+  // Returns the number of hits that would be added to hits_out.
+  int AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
+                 bool only_from_prefix_sections,
+                 std::vector<DocHitInfo>* hits_out);
 
   // Returns the hit count of the term.
-  uint32_t CountHits(uint32_t term_id);
+  int CountHits(uint32_t term_id);
 
   // Check if buffer has reached its capacity.
   bool is_full() const;
 
+  bool empty() const { return size() == 0; }
+
+  uint32_t size() const { return header_->cur_size(); }
+
+  class const_iterator {
+    friend class LiteIndex;
+
+   public:
+    using iterator_category = std::forward_iterator_tag;
+    using value_type = TermIdHitPair;
+    using reference = const value_type&;
+    using pointer = const value_type*;
+
+    const_iterator() : const_iterator(nullptr, -1, -1) {}
+
+    reference operator*() const { return start_[position_]; }
+
+    pointer operator->() const { return start_ + position_; }
+
+    const_iterator& operator++() {
+      if (++position_ >= end_position_) {
+        start_ = nullptr;
+        position_ = -1;
+        end_position_ = -1;
+      }
+      return *this;
+    }
+
+    const_iterator operator++(int) {
+      auto tmp = *this;
+      ++*this;
+      return tmp;
+    }
+
+    bool operator!=(const const_iterator& rhs) { return !(*this == rhs); }
+
+    bool operator==(const const_iterator& rhs) {
+      return start_ == rhs.start_ && position_ == rhs.position_;
+    }
+
+   private:
+    explicit const_iterator(const TermIdHitPair* start, int position,
+                            int end_position)
+        : start_(start), position_(position), end_position_(end_position) {}
+
+    const TermIdHitPair* start_;
+    int position_;
+    int end_position_;
+  };
+
+  const_iterator begin() const {
+    // If the LiteIndex is empty, just return end().
+    return empty() ? end()
+                   : const_iterator(hit_buffer_.array_cast<TermIdHitPair>(), 0,
+                                    header_->cur_size());
+  }
+
+  const_iterator end() const { return const_iterator(); }
+
   constexpr static uint32_t max_hit_buffer_size() {
-    return std::numeric_limits<uint32_t>::max() / sizeof(LiteIndex::Element);
+    return std::numeric_limits<uint32_t>::max() / sizeof(TermIdHitPair);
   }
 
   // We keep track of the last added document_id. This is always the largest
diff --git a/icing/index/lite/term-id-hit-pair.h b/icing/index/lite/term-id-hit-pair.h
new file mode 100644
index 0000000..191f766
--- /dev/null
+++ b/icing/index/lite/term-id-hit-pair.h
@@ -0,0 +1,80 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_TERM_ID_HIT_PAIR_H_
+#define ICING_INDEX_TERM_ID_HIT_PAIR_H_
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "icing/index/hit/hit.h"
+#include "icing/util/bit-util.h"
+
+namespace icing {
+namespace lib {
+
+class TermIdHitPair {
+ public:
+  // Layout bits: 24 termid + 32 hit value + 8 hit score.
+  using Value = uint64_t;
+
+  static constexpr int kTermIdBits = 24;
+  static constexpr int kHitValueBits = sizeof(Hit::Value) * 8;
+  static constexpr int kHitScoreBits = sizeof(Hit::Score) * 8;
+
+  static const Value kInvalidValue;
+
+  explicit TermIdHitPair(Value v = kInvalidValue) : value_(v) {}
+
+  TermIdHitPair(uint32_t term_id, const Hit& hit) {
+    static_assert(
+        kTermIdBits + kHitValueBits + kHitScoreBits <= sizeof(Value) * 8,
+        "TermIdHitPairTooBig");
+
+    value_ = 0;
+    // Term id goes into the most significant bits because it takes
+    // precedent in sorts.
+    bit_util::BitfieldSet(term_id, kHitValueBits + kHitScoreBits, kTermIdBits,
+                          &value_);
+    bit_util::BitfieldSet(hit.value(), kHitScoreBits, kHitValueBits, &value_);
+    bit_util::BitfieldSet(hit.score(), 0, kHitScoreBits, &value_);
+  }
+
+  uint32_t term_id() const {
+    return bit_util::BitfieldGet(value_, kHitValueBits + kHitScoreBits,
+                                 kTermIdBits);
+  }
+
+  Hit hit() const {
+    return Hit(bit_util::BitfieldGet(value_, kHitScoreBits, kHitValueBits),
+               bit_util::BitfieldGet(value_, 0, kHitScoreBits));
+  }
+
+  Value value() const { return value_; }
+
+  bool operator==(const TermIdHitPair& rhs) const {
+    return value_ == rhs.value_;
+  }
+
+ private:
+  Value value_;
+};
+
+}  // namespace lib
+}  // namespace icing
+
+#endif  // ICING_INDEX_TERM_ID_HIT_PAIR_H_
author	Terry Wang <tytytyww@google.com>	2020-10-01 18:53:44 -0700
committer	Terry Wang <tytytyww@google.com>	2020-10-01 18:53:44 -0700
commit	5abfe5bcac00f4f188d3d8041fa97bf77206b577 (patch)
tree	69376254e2e5f886cb0d26cdb547001f8e45f372 /icing/index/lite
parent	e15b6b66f871a71b73278c34d5c54f648f880c29 (diff)
download	icing-5abfe5bcac00f4f188d3d8041fa97bf77206b577.tar.gz