aboutsummaryrefslogtreecommitdiff
path: root/icing/index/lite
diff options
context:
space:
mode:
authorTerry Wang <tytytyww@google.com>2020-10-01 18:53:44 -0700
committerTerry Wang <tytytyww@google.com>2020-10-01 18:53:44 -0700
commit5abfe5bcac00f4f188d3d8041fa97bf77206b577 (patch)
tree69376254e2e5f886cb0d26cdb547001f8e45f372 /icing/index/lite
parente15b6b66f871a71b73278c34d5c54f648f880c29 (diff)
downloadicing-5abfe5bcac00f4f188d3d8041fa97bf77206b577.tar.gz
Pull upstream changes.
Change-Id: I794757716961569b5c02171cfc82785efb2cf106
Diffstat (limited to 'icing/index/lite')
-rw-r--r--icing/index/lite/doc-hit-info-iterator-term-lite.cc2
-rw-r--r--icing/index/lite/lite-index.cc50
-rw-r--r--icing/index/lite/lite-index.h126
-rw-r--r--icing/index/lite/term-id-hit-pair.h80
4 files changed, 182 insertions, 76 deletions
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
index a975f86..1f1c296 100644
--- a/icing/index/lite/doc-hit-info-iterator-term-lite.cc
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
@@ -61,7 +61,7 @@ libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() {
libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
// Exact match only. All hits in lite lexicon are exact.
- ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->FindTerm(term_));
+ ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->GetTermId(term_));
ICING_ASSIGN_OR_RETURN(uint32_t term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
lite_index_->AppendHits(term_id, section_restrict_mask_,
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
index a72402e..89240ee 100644
--- a/icing/index/lite/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -65,8 +65,8 @@ size_t header_size() { return sizeof(IcingLiteIndex_HeaderImpl::HeaderData); }
} // namespace
-const LiteIndex::Element::Value LiteIndex::Element::kInvalidValue =
- LiteIndex::Element(0, Hit()).value();
+const TermIdHitPair::Value TermIdHitPair::kInvalidValue =
+ TermIdHitPair(0, Hit()).value();
libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create(
const LiteIndex::Options& options, const IcingFilesystem* filesystem) {
@@ -163,7 +163,7 @@ libtextclassifier3::Status LiteIndex::Initialize() {
header_->Reset();
if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
- sizeof(Element::Value), header_->cur_size(),
+ sizeof(TermIdHitPair::Value), header_->cur_size(),
options_.hit_buffer_size, &hit_buffer_crc_, true)) {
status = absl_ports::InternalError("Failed to initialize new hit buffer");
goto error;
@@ -177,7 +177,7 @@ libtextclassifier3::Status LiteIndex::Initialize() {
header_mmap_.address()));
if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
- sizeof(Element::Value), header_->cur_size(),
+ sizeof(TermIdHitPair::Value), header_->cur_size(),
options_.hit_buffer_size, &hit_buffer_crc_, true)) {
status = absl_ports::InternalError(
"Failed to re-initialize existing hit buffer");
@@ -312,20 +312,21 @@ libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) {
header_->set_last_added_docid(hit.document_id());
- Element elt(term_id, hit);
+ TermIdHitPair term_id_hit_pair(term_id, hit);
uint32_t cur_size = header_->cur_size();
- Element::Value* valp = hit_buffer_.GetMutableMem<Element::Value>(cur_size, 1);
+ TermIdHitPair::Value* valp =
+ hit_buffer_.GetMutableMem<TermIdHitPair::Value>(cur_size, 1);
if (valp == nullptr) {
return absl_ports::ResourceExhaustedError(
"Allocating more space in hit buffer failed!");
}
- *valp = elt.value();
+ *valp = term_id_hit_pair.value();
header_->set_cur_size(cur_size + 1);
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::StatusOr<uint32_t> LiteIndex::FindTerm(
+libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId(
const std::string& term) const {
char dummy;
uint32_t tvi;
@@ -336,16 +337,17 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::FindTerm(
return tvi;
}
-uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
- bool only_from_prefix_sections,
- std::vector<DocHitInfo>* hits_out) {
- uint32_t count = 0;
+int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
+ bool only_from_prefix_sections,
+ std::vector<DocHitInfo>* hits_out) {
+ int count = 0;
DocumentId last_document_id = kInvalidDocumentId;
for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) {
- Element elt(hit_buffer_.array_cast<Element>()[idx]);
- if (elt.term_id() != term_id) break;
+ TermIdHitPair term_id_hit_pair(
+ hit_buffer_.array_cast<TermIdHitPair>()[idx]);
+ if (term_id_hit_pair.term_id() != term_id) break;
- const Hit& hit = elt.hit();
+ const Hit& hit = term_id_hit_pair.hit();
// Check sections.
if (((1u << hit.section_id()) & section_id_mask) == 0) {
continue;
@@ -356,7 +358,7 @@ uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
}
DocumentId document_id = hit.document_id();
if (document_id != last_document_id) {
- count++;
+ ++count;
if (hits_out != nullptr) {
hits_out->push_back(DocHitInfo(document_id));
}
@@ -369,7 +371,7 @@ uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
return count;
}
-uint32_t LiteIndex::CountHits(uint32_t term_id) {
+int LiteIndex::CountHits(uint32_t term_id) {
return AppendHits(term_id, kSectionIdMaskAll,
/*only_from_prefix_sections=*/false,
/*hits_out=*/nullptr);
@@ -421,8 +423,8 @@ uint32_t LiteIndex::Seek(uint32_t term_id) {
IcingTimer timer;
auto* array_start =
- hit_buffer_.GetMutableMem<Element::Value>(0, header_->cur_size());
- Element::Value* sort_start = array_start + header_->searchable_end();
+ hit_buffer_.GetMutableMem<TermIdHitPair::Value>(0, header_->cur_size());
+ TermIdHitPair::Value* sort_start = array_start + header_->searchable_end();
std::sort(sort_start, array_start + header_->cur_size());
// Now merge with previous region. Since the previous region is already
@@ -445,11 +447,13 @@ uint32_t LiteIndex::Seek(uint32_t term_id) {
// Binary search for our term_id. Make sure we get the first
// element. Using kBeginSortValue ensures this for the hit value.
- Element elt(term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore));
+ TermIdHitPair term_id_hit_pair(
+ term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore));
- const Element::Value* array = hit_buffer_.array_cast<Element::Value>();
- const Element::Value* ptr =
- std::lower_bound(array, array + header_->cur_size(), elt.value());
+ const TermIdHitPair::Value* array =
+ hit_buffer_.array_cast<TermIdHitPair::Value>();
+ const TermIdHitPair::Value* ptr = std::lower_bound(
+ array, array + header_->cur_size(), term_id_hit_pair.value());
return ptr - array;
}
diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h
index b60a947..27ccf33 100644
--- a/icing/index/lite/lite-index.h
+++ b/icing/index/lite/lite-index.h
@@ -30,6 +30,7 @@
#include "icing/file/filesystem.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/hit/hit.h"
+#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/legacy/index/icing-array-storage.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
@@ -49,49 +50,6 @@ namespace lib {
class LiteIndex {
public:
// An entry in the hit buffer.
- class Element {
- public:
- // Layout bits: 24 termid + 32 hit value + 8 hit score.
- using Value = uint64_t;
-
- static constexpr int kTermIdBits = 24;
- static constexpr int kHitValueBits = sizeof(Hit::Value) * 8;
- static constexpr int kHitScoreBits = sizeof(Hit::Score) * 8;
-
- static const Value kInvalidValue;
-
- explicit Element(Value v = kInvalidValue) : value_(v) {}
-
- Element(uint32_t term_id, const Hit& hit) {
- static_assert(
- kTermIdBits + kHitValueBits + kHitScoreBits <= sizeof(Value) * 8,
- "LiteIndexElementTooBig");
-
- value_ = 0;
- // Term id goes into the most significant bits because it takes
- // precedent in sorts.
- bit_util::BitfieldSet(term_id, kHitValueBits + kHitScoreBits, kTermIdBits,
- &value_);
- bit_util::BitfieldSet(hit.value(), kHitScoreBits, kHitValueBits, &value_);
- bit_util::BitfieldSet(hit.score(), 0, kHitScoreBits, &value_);
- }
-
- uint32_t term_id() const {
- return bit_util::BitfieldGet(value_, kHitValueBits + kHitScoreBits,
- kTermIdBits);
- }
-
- Hit hit() const {
- return Hit(bit_util::BitfieldGet(value_, kHitScoreBits, kHitValueBits),
- bit_util::BitfieldGet(value_, 0, kHitScoreBits));
- }
-
- Value value() const { return value_; }
-
- private:
- Value value_;
- };
-
using Options = IcingLiteIndexOptions;
// Updates checksum of subcomponents.
@@ -126,7 +84,7 @@ class LiteIndex {
Crc32 ComputeChecksum();
// Returns term_id if term found, NOT_FOUND otherwise.
- libtextclassifier3::StatusOr<uint32_t> FindTerm(
+ libtextclassifier3::StatusOr<uint32_t> GetTermId(
const std::string& term) const;
// Returns an iterator for all terms for which 'prefix' is a prefix.
@@ -170,25 +128,89 @@ class LiteIndex {
NamespaceId namespace_id);
// Append hit to buffer. term_id must be encoded using the same term_id_codec
- // supplied to the index constructor. Returns non-OK if hit cannot be added
- // (either due to hit buffer or file system capacity reached).
+ // supplied to the index constructor.
+ // RETURNS:
+ // - OK if hit was successfully added
+ // - RESOURCE_EXHAUSTED if hit could not be added (either due to hit buffer
+ // or file system capacity reached).
libtextclassifier3::Status AddHit(uint32_t term_id, const Hit& hit);
// Add all hits with term_id from the sections specified in section_id_mask,
// skipping hits in non-prefix sections if only_from_prefix_sections is true,
- // to hits_out.
- uint32_t AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
- bool only_from_prefix_sections,
- std::vector<DocHitInfo>* hits_out);
+ // to hits_out. If hits_out is nullptr, no hits will be added.
+ //
+ // Returns the number of hits that would be added to hits_out.
+ int AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
+ bool only_from_prefix_sections,
+ std::vector<DocHitInfo>* hits_out);
// Returns the hit count of the term.
- uint32_t CountHits(uint32_t term_id);
+ int CountHits(uint32_t term_id);
// Check if buffer has reached its capacity.
bool is_full() const;
+ bool empty() const { return size() == 0; }
+
+ uint32_t size() const { return header_->cur_size(); }
+
+ class const_iterator {
+ friend class LiteIndex;
+
+ public:
+ using iterator_category = std::forward_iterator_tag;
+ using value_type = TermIdHitPair;
+ using reference = const value_type&;
+ using pointer = const value_type*;
+
+ const_iterator() : const_iterator(nullptr, -1, -1) {}
+
+ reference operator*() const { return start_[position_]; }
+
+ pointer operator->() const { return start_ + position_; }
+
+ const_iterator& operator++() {
+ if (++position_ >= end_position_) {
+ start_ = nullptr;
+ position_ = -1;
+ end_position_ = -1;
+ }
+ return *this;
+ }
+
+ const_iterator operator++(int) {
+ auto tmp = *this;
+ ++*this;
+ return tmp;
+ }
+
+ bool operator!=(const const_iterator& rhs) { return !(*this == rhs); }
+
+ bool operator==(const const_iterator& rhs) {
+ return start_ == rhs.start_ && position_ == rhs.position_;
+ }
+
+ private:
+ explicit const_iterator(const TermIdHitPair* start, int position,
+ int end_position)
+ : start_(start), position_(position), end_position_(end_position) {}
+
+ const TermIdHitPair* start_;
+ int position_;
+ int end_position_;
+ };
+
+ const_iterator begin() const {
+ // If the LiteIndex is empty, just return end().
+ return empty() ? end()
+ : const_iterator(hit_buffer_.array_cast<TermIdHitPair>(), 0,
+ header_->cur_size());
+ }
+
+ const_iterator end() const { return const_iterator(); }
+
constexpr static uint32_t max_hit_buffer_size() {
- return std::numeric_limits<uint32_t>::max() / sizeof(LiteIndex::Element);
+ return std::numeric_limits<uint32_t>::max() / sizeof(TermIdHitPair);
}
// We keep track of the last added document_id. This is always the largest
diff --git a/icing/index/lite/term-id-hit-pair.h b/icing/index/lite/term-id-hit-pair.h
new file mode 100644
index 0000000..191f766
--- /dev/null
+++ b/icing/index/lite/term-id-hit-pair.h
@@ -0,0 +1,80 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_TERM_ID_HIT_PAIR_H_
+#define ICING_INDEX_TERM_ID_HIT_PAIR_H_
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "icing/index/hit/hit.h"
+#include "icing/util/bit-util.h"
+
+namespace icing {
+namespace lib {
+
+class TermIdHitPair {
+ public:
+ // Layout bits: 24 termid + 32 hit value + 8 hit score.
+ using Value = uint64_t;
+
+ static constexpr int kTermIdBits = 24;
+ static constexpr int kHitValueBits = sizeof(Hit::Value) * 8;
+ static constexpr int kHitScoreBits = sizeof(Hit::Score) * 8;
+
+ static const Value kInvalidValue;
+
+ explicit TermIdHitPair(Value v = kInvalidValue) : value_(v) {}
+
+ TermIdHitPair(uint32_t term_id, const Hit& hit) {
+ static_assert(
+ kTermIdBits + kHitValueBits + kHitScoreBits <= sizeof(Value) * 8,
+ "TermIdHitPairTooBig");
+
+ value_ = 0;
+ // Term id goes into the most significant bits because it takes
+ // precedent in sorts.
+ bit_util::BitfieldSet(term_id, kHitValueBits + kHitScoreBits, kTermIdBits,
+ &value_);
+ bit_util::BitfieldSet(hit.value(), kHitScoreBits, kHitValueBits, &value_);
+ bit_util::BitfieldSet(hit.score(), 0, kHitScoreBits, &value_);
+ }
+
+ uint32_t term_id() const {
+ return bit_util::BitfieldGet(value_, kHitValueBits + kHitScoreBits,
+ kTermIdBits);
+ }
+
+ Hit hit() const {
+ return Hit(bit_util::BitfieldGet(value_, kHitScoreBits, kHitValueBits),
+ bit_util::BitfieldGet(value_, 0, kHitScoreBits));
+ }
+
+ Value value() const { return value_; }
+
+ bool operator==(const TermIdHitPair& rhs) const {
+ return value_ == rhs.value_;
+ }
+
+ private:
+ Value value_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_TERM_ID_HIT_PAIR_H_