diff options
author | Terry Wang <tytytyww@google.com> | 2020-10-01 18:53:44 -0700 |
---|---|---|
committer | Terry Wang <tytytyww@google.com> | 2020-10-01 18:53:44 -0700 |
commit | 5abfe5bcac00f4f188d3d8041fa97bf77206b577 (patch) | |
tree | 69376254e2e5f886cb0d26cdb547001f8e45f372 /icing/index/lite | |
parent | e15b6b66f871a71b73278c34d5c54f648f880c29 (diff) | |
download | icing-5abfe5bcac00f4f188d3d8041fa97bf77206b577.tar.gz |
Pull upstream changes.
Change-Id: I794757716961569b5c02171cfc82785efb2cf106
Diffstat (limited to 'icing/index/lite')
-rw-r--r-- | icing/index/lite/doc-hit-info-iterator-term-lite.cc | 2 | ||||
-rw-r--r-- | icing/index/lite/lite-index.cc | 50 | ||||
-rw-r--r-- | icing/index/lite/lite-index.h | 126 | ||||
-rw-r--r-- | icing/index/lite/term-id-hit-pair.h | 80 |
4 files changed, 182 insertions, 76 deletions
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc index a975f86..1f1c296 100644 --- a/icing/index/lite/doc-hit-info-iterator-term-lite.cc +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc @@ -61,7 +61,7 @@ libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() { libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() { // Exact match only. All hits in lite lexicon are exact. - ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->FindTerm(term_)); + ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->GetTermId(term_)); ICING_ASSIGN_OR_RETURN(uint32_t term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); lite_index_->AppendHits(term_id, section_restrict_mask_, diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc index a72402e..89240ee 100644 --- a/icing/index/lite/lite-index.cc +++ b/icing/index/lite/lite-index.cc @@ -65,8 +65,8 @@ size_t header_size() { return sizeof(IcingLiteIndex_HeaderImpl::HeaderData); } } // namespace -const LiteIndex::Element::Value LiteIndex::Element::kInvalidValue = - LiteIndex::Element(0, Hit()).value(); +const TermIdHitPair::Value TermIdHitPair::kInvalidValue = + TermIdHitPair(0, Hit()).value(); libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create( const LiteIndex::Options& options, const IcingFilesystem* filesystem) { @@ -163,7 +163,7 @@ libtextclassifier3::Status LiteIndex::Initialize() { header_->Reset(); if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true, - sizeof(Element::Value), header_->cur_size(), + sizeof(TermIdHitPair::Value), header_->cur_size(), options_.hit_buffer_size, &hit_buffer_crc_, true)) { status = absl_ports::InternalError("Failed to initialize new hit buffer"); goto error; @@ -177,7 +177,7 @@ libtextclassifier3::Status LiteIndex::Initialize() { header_mmap_.address())); if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true, - sizeof(Element::Value), header_->cur_size(), + sizeof(TermIdHitPair::Value), header_->cur_size(), options_.hit_buffer_size, &hit_buffer_crc_, true)) { status = absl_ports::InternalError( "Failed to re-initialize existing hit buffer"); @@ -312,20 +312,21 @@ libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) { header_->set_last_added_docid(hit.document_id()); - Element elt(term_id, hit); + TermIdHitPair term_id_hit_pair(term_id, hit); uint32_t cur_size = header_->cur_size(); - Element::Value* valp = hit_buffer_.GetMutableMem<Element::Value>(cur_size, 1); + TermIdHitPair::Value* valp = + hit_buffer_.GetMutableMem<TermIdHitPair::Value>(cur_size, 1); if (valp == nullptr) { return absl_ports::ResourceExhaustedError( "Allocating more space in hit buffer failed!"); } - *valp = elt.value(); + *valp = term_id_hit_pair.value(); header_->set_cur_size(cur_size + 1); return libtextclassifier3::Status::OK; } -libtextclassifier3::StatusOr<uint32_t> LiteIndex::FindTerm( +libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId( const std::string& term) const { char dummy; uint32_t tvi; @@ -336,16 +337,17 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::FindTerm( return tvi; } -uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, - bool only_from_prefix_sections, - std::vector<DocHitInfo>* hits_out) { - uint32_t count = 0; +int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, + bool only_from_prefix_sections, + std::vector<DocHitInfo>* hits_out) { + int count = 0; DocumentId last_document_id = kInvalidDocumentId; for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) { - Element elt(hit_buffer_.array_cast<Element>()[idx]); - if (elt.term_id() != term_id) break; + TermIdHitPair term_id_hit_pair( + hit_buffer_.array_cast<TermIdHitPair>()[idx]); + if (term_id_hit_pair.term_id() != term_id) break; - const Hit& hit = elt.hit(); + const Hit& hit = term_id_hit_pair.hit(); // Check sections. if (((1u << hit.section_id()) & section_id_mask) == 0) { continue; @@ -356,7 +358,7 @@ uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, } DocumentId document_id = hit.document_id(); if (document_id != last_document_id) { - count++; + ++count; if (hits_out != nullptr) { hits_out->push_back(DocHitInfo(document_id)); } @@ -369,7 +371,7 @@ uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, return count; } -uint32_t LiteIndex::CountHits(uint32_t term_id) { +int LiteIndex::CountHits(uint32_t term_id) { return AppendHits(term_id, kSectionIdMaskAll, /*only_from_prefix_sections=*/false, /*hits_out=*/nullptr); @@ -421,8 +423,8 @@ uint32_t LiteIndex::Seek(uint32_t term_id) { IcingTimer timer; auto* array_start = - hit_buffer_.GetMutableMem<Element::Value>(0, header_->cur_size()); - Element::Value* sort_start = array_start + header_->searchable_end(); + hit_buffer_.GetMutableMem<TermIdHitPair::Value>(0, header_->cur_size()); + TermIdHitPair::Value* sort_start = array_start + header_->searchable_end(); std::sort(sort_start, array_start + header_->cur_size()); // Now merge with previous region. Since the previous region is already @@ -445,11 +447,13 @@ uint32_t LiteIndex::Seek(uint32_t term_id) { // Binary search for our term_id. Make sure we get the first // element. Using kBeginSortValue ensures this for the hit value. - Element elt(term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore)); + TermIdHitPair term_id_hit_pair( + term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore)); - const Element::Value* array = hit_buffer_.array_cast<Element::Value>(); - const Element::Value* ptr = - std::lower_bound(array, array + header_->cur_size(), elt.value()); + const TermIdHitPair::Value* array = + hit_buffer_.array_cast<TermIdHitPair::Value>(); + const TermIdHitPair::Value* ptr = std::lower_bound( + array, array + header_->cur_size(), term_id_hit_pair.value()); return ptr - array; } diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h index b60a947..27ccf33 100644 --- a/icing/index/lite/lite-index.h +++ b/icing/index/lite/lite-index.h @@ -30,6 +30,7 @@ #include "icing/file/filesystem.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/hit/hit.h" +#include "icing/index/lite/term-id-hit-pair.h" #include "icing/legacy/index/icing-array-storage.h" #include "icing/legacy/index/icing-dynamic-trie.h" #include "icing/legacy/index/icing-filesystem.h" @@ -49,49 +50,6 @@ namespace lib { class LiteIndex { public: // An entry in the hit buffer. - class Element { - public: - // Layout bits: 24 termid + 32 hit value + 8 hit score. - using Value = uint64_t; - - static constexpr int kTermIdBits = 24; - static constexpr int kHitValueBits = sizeof(Hit::Value) * 8; - static constexpr int kHitScoreBits = sizeof(Hit::Score) * 8; - - static const Value kInvalidValue; - - explicit Element(Value v = kInvalidValue) : value_(v) {} - - Element(uint32_t term_id, const Hit& hit) { - static_assert( - kTermIdBits + kHitValueBits + kHitScoreBits <= sizeof(Value) * 8, - "LiteIndexElementTooBig"); - - value_ = 0; - // Term id goes into the most significant bits because it takes - // precedent in sorts. - bit_util::BitfieldSet(term_id, kHitValueBits + kHitScoreBits, kTermIdBits, - &value_); - bit_util::BitfieldSet(hit.value(), kHitScoreBits, kHitValueBits, &value_); - bit_util::BitfieldSet(hit.score(), 0, kHitScoreBits, &value_); - } - - uint32_t term_id() const { - return bit_util::BitfieldGet(value_, kHitValueBits + kHitScoreBits, - kTermIdBits); - } - - Hit hit() const { - return Hit(bit_util::BitfieldGet(value_, kHitScoreBits, kHitValueBits), - bit_util::BitfieldGet(value_, 0, kHitScoreBits)); - } - - Value value() const { return value_; } - - private: - Value value_; - }; - using Options = IcingLiteIndexOptions; // Updates checksum of subcomponents. @@ -126,7 +84,7 @@ class LiteIndex { Crc32 ComputeChecksum(); // Returns term_id if term found, NOT_FOUND otherwise. - libtextclassifier3::StatusOr<uint32_t> FindTerm( + libtextclassifier3::StatusOr<uint32_t> GetTermId( const std::string& term) const; // Returns an iterator for all terms for which 'prefix' is a prefix. @@ -170,25 +128,89 @@ class LiteIndex { NamespaceId namespace_id); // Append hit to buffer. term_id must be encoded using the same term_id_codec - // supplied to the index constructor. Returns non-OK if hit cannot be added - // (either due to hit buffer or file system capacity reached). + // supplied to the index constructor. + // RETURNS: + // - OK if hit was successfully added + // - RESOURCE_EXHAUSTED if hit could not be added (either due to hit buffer + // or file system capacity reached). libtextclassifier3::Status AddHit(uint32_t term_id, const Hit& hit); // Add all hits with term_id from the sections specified in section_id_mask, // skipping hits in non-prefix sections if only_from_prefix_sections is true, - // to hits_out. - uint32_t AppendHits(uint32_t term_id, SectionIdMask section_id_mask, - bool only_from_prefix_sections, - std::vector<DocHitInfo>* hits_out); + // to hits_out. If hits_out is nullptr, no hits will be added. + // + // Returns the number of hits that would be added to hits_out. + int AppendHits(uint32_t term_id, SectionIdMask section_id_mask, + bool only_from_prefix_sections, + std::vector<DocHitInfo>* hits_out); // Returns the hit count of the term. - uint32_t CountHits(uint32_t term_id); + int CountHits(uint32_t term_id); // Check if buffer has reached its capacity. bool is_full() const; + bool empty() const { return size() == 0; } + + uint32_t size() const { return header_->cur_size(); } + + class const_iterator { + friend class LiteIndex; + + public: + using iterator_category = std::forward_iterator_tag; + using value_type = TermIdHitPair; + using reference = const value_type&; + using pointer = const value_type*; + + const_iterator() : const_iterator(nullptr, -1, -1) {} + + reference operator*() const { return start_[position_]; } + + pointer operator->() const { return start_ + position_; } + + const_iterator& operator++() { + if (++position_ >= end_position_) { + start_ = nullptr; + position_ = -1; + end_position_ = -1; + } + return *this; + } + + const_iterator operator++(int) { + auto tmp = *this; + ++*this; + return tmp; + } + + bool operator!=(const const_iterator& rhs) { return !(*this == rhs); } + + bool operator==(const const_iterator& rhs) { + return start_ == rhs.start_ && position_ == rhs.position_; + } + + private: + explicit const_iterator(const TermIdHitPair* start, int position, + int end_position) + : start_(start), position_(position), end_position_(end_position) {} + + const TermIdHitPair* start_; + int position_; + int end_position_; + }; + + const_iterator begin() const { + // If the LiteIndex is empty, just return end(). + return empty() ? end() + : const_iterator(hit_buffer_.array_cast<TermIdHitPair>(), 0, + header_->cur_size()); + } + + const_iterator end() const { return const_iterator(); } + constexpr static uint32_t max_hit_buffer_size() { - return std::numeric_limits<uint32_t>::max() / sizeof(LiteIndex::Element); + return std::numeric_limits<uint32_t>::max() / sizeof(TermIdHitPair); } // We keep track of the last added document_id. This is always the largest diff --git a/icing/index/lite/term-id-hit-pair.h b/icing/index/lite/term-id-hit-pair.h new file mode 100644 index 0000000..191f766 --- /dev/null +++ b/icing/index/lite/term-id-hit-pair.h @@ -0,0 +1,80 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_INDEX_TERM_ID_HIT_PAIR_H_ +#define ICING_INDEX_TERM_ID_HIT_PAIR_H_ + +#include <cstdint> +#include <limits> +#include <memory> +#include <string> +#include <vector> + +#include "icing/index/hit/hit.h" +#include "icing/util/bit-util.h" + +namespace icing { +namespace lib { + +class TermIdHitPair { + public: + // Layout bits: 24 termid + 32 hit value + 8 hit score. + using Value = uint64_t; + + static constexpr int kTermIdBits = 24; + static constexpr int kHitValueBits = sizeof(Hit::Value) * 8; + static constexpr int kHitScoreBits = sizeof(Hit::Score) * 8; + + static const Value kInvalidValue; + + explicit TermIdHitPair(Value v = kInvalidValue) : value_(v) {} + + TermIdHitPair(uint32_t term_id, const Hit& hit) { + static_assert( + kTermIdBits + kHitValueBits + kHitScoreBits <= sizeof(Value) * 8, + "TermIdHitPairTooBig"); + + value_ = 0; + // Term id goes into the most significant bits because it takes + // precedent in sorts. + bit_util::BitfieldSet(term_id, kHitValueBits + kHitScoreBits, kTermIdBits, + &value_); + bit_util::BitfieldSet(hit.value(), kHitScoreBits, kHitValueBits, &value_); + bit_util::BitfieldSet(hit.score(), 0, kHitScoreBits, &value_); + } + + uint32_t term_id() const { + return bit_util::BitfieldGet(value_, kHitValueBits + kHitScoreBits, + kTermIdBits); + } + + Hit hit() const { + return Hit(bit_util::BitfieldGet(value_, kHitScoreBits, kHitValueBits), + bit_util::BitfieldGet(value_, 0, kHitScoreBits)); + } + + Value value() const { return value_; } + + bool operator==(const TermIdHitPair& rhs) const { + return value_ == rhs.value_; + } + + private: + Value value_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_TERM_ID_HIT_PAIR_H_ |