author    Tim Barron <tjbarron@google.com>  2021-01-14 20:53:07 +0000
committer Tim Barron <tjbarron@google.com>  2021-01-14 20:53:07 +0000
commit    a34db390d80f862bfaaa49dea3605c5fec3bca3d (patch)
tree      67a4a87803cf2b31619c3ddff3674967fc1461ce /icing/index
parent    59c2caa38fd8dca3760dad751f4f8e5de8be25f5 (diff)
Update Icing from upstream.
Change-Id: I43038a59e7170fb8ecbaf6098a37221b3682ce09
Diffstat (limited to 'icing/index')
-rw-r--r--  icing/index/hit/doc-hit-info.h                                   4
-rw-r--r--  icing/index/hit/doc-hit-info_test.cc                             4
-rw-r--r--  icing/index/hit/hit.h                                            1
-rw-r--r--  icing/index/index-processor.cc                                  83
-rw-r--r--  icing/index/index-processor.h                                   24
-rw-r--r--  icing/index/index-processor_benchmark.cc                        44
-rw-r--r--  icing/index/index-processor_test.cc                            187
-rw-r--r--  icing/index/iterator/doc-hit-info-iterator-and.h                21
-rw-r--r--  icing/index/iterator/doc-hit-info-iterator-and_test.cc         205
-rw-r--r--  icing/index/iterator/doc-hit-info-iterator-filter.h              5
-rw-r--r--  icing/index/iterator/doc-hit-info-iterator-or.cc                 3
-rw-r--r--  icing/index/iterator/doc-hit-info-iterator-or.h                 31
-rw-r--r--  icing/index/iterator/doc-hit-info-iterator-or_test.cc          275
-rw-r--r--  icing/index/iterator/doc-hit-info-iterator-section-restrict.h    9
-rw-r--r--  icing/index/iterator/doc-hit-info-iterator-test-util.h          37
-rw-r--r--  icing/index/iterator/doc-hit-info-iterator.h                    29
-rw-r--r--  icing/index/lite/doc-hit-info-iterator-term-lite.h              28
-rw-r--r--  icing/index/main/doc-hit-info-iterator-term-main.h              28
18 files changed, 874 insertions, 144 deletions
diff --git a/icing/index/hit/doc-hit-info.h b/icing/index/hit/doc-hit-info.h
index 8171960..0be87d6 100644
--- a/icing/index/hit/doc-hit-info.h
+++ b/icing/index/hit/doc-hit-info.h
@@ -25,7 +25,7 @@
namespace icing {
namespace lib {
-// DocHitInfo provides a collapsed view of all hits for a specific term and doc.
+// DocHitInfo provides a collapsed view of all hits for a specific doc.
// Hits contain a document_id, section_id and a term frequency. The
// information in multiple hits is collapse into a DocHitInfo by providing a
// SectionIdMask of all sections that contained a hit for this term as well as
@@ -36,7 +36,7 @@ class DocHitInfo {
SectionIdMask hit_section_ids_mask = kSectionIdMaskNone)
: document_id_(document_id_in),
hit_section_ids_mask_(hit_section_ids_mask) {
- memset(hit_term_frequency_, Hit::kDefaultTermFrequency,
+ memset(hit_term_frequency_, Hit::kNoTermFrequency,
sizeof(hit_term_frequency_));
}
diff --git a/icing/index/hit/doc-hit-info_test.cc b/icing/index/hit/doc-hit-info_test.cc
index 15c0de9..36c1a06 100644
--- a/icing/index/hit/doc-hit-info_test.cc
+++ b/icing/index/hit/doc-hit-info_test.cc
@@ -34,13 +34,13 @@ constexpr DocumentId kSomeOtherDocumentId = 54;
TEST(DocHitInfoTest, InitialMaxHitTermFrequencies) {
DocHitInfo info(kSomeDocumentId);
for (SectionId i = 0; i <= kMaxSectionId; ++i) {
- EXPECT_THAT(info.hit_term_frequency(i), Eq(Hit::kDefaultTermFrequency));
+ EXPECT_THAT(info.hit_term_frequency(i), Eq(Hit::kNoTermFrequency));
}
}
TEST(DocHitInfoTest, UpdateHitTermFrequenciesForTheFirstTime) {
DocHitInfo info(kSomeDocumentId);
- ASSERT_THAT(info.hit_term_frequency(3), Eq(Hit::kDefaultTermFrequency));
+ ASSERT_THAT(info.hit_term_frequency(3), Eq(Hit::kNoTermFrequency));
// Updating a section for the first time, should change its hit
// term_frequency
diff --git a/icing/index/hit/hit.h b/icing/index/hit/hit.h
index 525a5e5..ee1f64b 100644
--- a/icing/index/hit/hit.h
+++ b/icing/index/hit/hit.h
@@ -58,6 +58,7 @@ class Hit {
static constexpr TermFrequency kMaxTermFrequency =
std::numeric_limits<TermFrequency>::max();
static constexpr TermFrequency kDefaultTermFrequency = 1;
+ static constexpr TermFrequency kNoTermFrequency = 0;
explicit Hit(Value value = kInvalidValue,
TermFrequency term_frequency = kDefaultTermFrequency)
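Note: the new Hit::kNoTermFrequency constant, together with the memset change in doc-hit-info.h above, lets a zero frequency mean "this term has no hit in this section", while kDefaultTermFrequency (1) stays the value for a hit whose frequency was not recorded. A minimal sketch of what that distinction buys a reader of the per-section frequencies; the aliases and helper below are illustrative stand-ins, not part of the library:

    #include <cstdint>
    #include <iostream>

    // Illustrative aliases mirroring hit.h / doc-hit-info.h.
    using TermFrequency = uint8_t;
    using SectionId = int8_t;
    constexpr TermFrequency kNoTermFrequency = 0;       // new: no hit in the section
    constexpr TermFrequency kDefaultTermFrequency = 1;  // existing: hit, default frequency
    constexpr SectionId kMaxSectionId = 15;

    // Because every slot now starts at kNoTermFrequency, a zero entry reliably
    // means "no hit", and any non-zero entry is a real frequency.
    void PrintSectionsWithHits(const TermFrequency (&frequencies)[kMaxSectionId + 1]) {
      for (int i = 0; i <= kMaxSectionId; ++i) {
        if (frequencies[i] != kNoTermFrequency) {
          std::cout << "section " << i << ": "
                    << static_cast<int>(frequencies[i]) << " hit(s)\n";
        }
      }
    }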
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc
index 892263b..d2f9d41 100644
--- a/icing/index/index-processor.cc
+++ b/icing/index/index-processor.cc
@@ -31,34 +31,30 @@
#include "icing/schema/section-manager.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
-#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/token.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
#include "icing/transform/normalizer.h"
#include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
namespace icing {
namespace lib {
libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>>
-IndexProcessor::Create(const SchemaStore* schema_store,
- const LanguageSegmenter* lang_segmenter,
- const Normalizer* normalizer, Index* index,
+IndexProcessor::Create(const Normalizer* normalizer, Index* index,
const IndexProcessor::Options& options,
const Clock* clock) {
- ICING_RETURN_ERROR_IF_NULL(schema_store);
- ICING_RETURN_ERROR_IF_NULL(lang_segmenter);
ICING_RETURN_ERROR_IF_NULL(normalizer);
ICING_RETURN_ERROR_IF_NULL(index);
ICING_RETURN_ERROR_IF_NULL(clock);
- return std::unique_ptr<IndexProcessor>(new IndexProcessor(
- schema_store, lang_segmenter, normalizer, index, options, clock));
+ return std::unique_ptr<IndexProcessor>(
+ new IndexProcessor(normalizer, index, options, clock));
}
libtextclassifier3::Status IndexProcessor::IndexDocument(
- const DocumentProto& document, DocumentId document_id,
+ const TokenizedDocument& tokenized_document, DocumentId document_id,
NativePutDocumentStats* put_document_stats) {
std::unique_ptr<Timer> index_timer = clock_.GetNewTimer();
@@ -68,54 +64,45 @@ libtextclassifier3::Status IndexProcessor::IndexDocument(
"DocumentId %d must be greater than last added document_id %d",
document_id, index_->last_added_document_id()));
}
- ICING_ASSIGN_OR_RETURN(std::vector<Section> sections,
- schema_store_.ExtractSections(document));
uint32_t num_tokens = 0;
libtextclassifier3::Status overall_status;
- for (const Section& section : sections) {
+ for (const TokenizedSection& section : tokenized_document.sections()) {
// TODO(b/152934343): pass real namespace ids in
Index::Editor editor =
index_->Edit(document_id, section.metadata.id,
section.metadata.term_match_type, /*namespace_id=*/0);
- for (std::string_view subcontent : section.content) {
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- section.metadata.tokenizer, &lang_segmenter_));
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> itr,
- tokenizer->Tokenize(subcontent));
- while (itr->Advance()) {
- if (++num_tokens > options_.max_tokens_per_document) {
- // Index all tokens buffered so far.
- editor.IndexAllBufferedTerms();
- if (put_document_stats != nullptr) {
- put_document_stats->mutable_tokenization_stats()
- ->set_exceeded_max_token_num(true);
- put_document_stats->mutable_tokenization_stats()
- ->set_num_tokens_indexed(options_.max_tokens_per_document);
- }
- switch (options_.token_limit_behavior) {
- case Options::TokenLimitBehavior::kReturnError:
- return absl_ports::ResourceExhaustedError(
- "Max number of tokens reached!");
- case Options::TokenLimitBehavior::kSuppressError:
- return overall_status;
- }
+ for (std::string_view token : section.token_sequence) {
+ if (++num_tokens > options_.max_tokens_per_document) {
+ // Index all tokens buffered so far.
+ editor.IndexAllBufferedTerms();
+ if (put_document_stats != nullptr) {
+ put_document_stats->mutable_tokenization_stats()
+ ->set_exceeded_max_token_num(true);
+ put_document_stats->mutable_tokenization_stats()
+ ->set_num_tokens_indexed(options_.max_tokens_per_document);
}
- std::string term = normalizer_.NormalizeTerm(itr->GetToken().text);
- // Add this term to Hit buffer. Even if adding this hit fails, we keep
- // trying to add more hits because it's possible that future hits could
- // still be added successfully. For instance if the lexicon is full, we
- // might fail to add a hit for a new term, but should still be able to
- // add hits for terms that are already in the index.
- auto status = editor.BufferTerm(term.c_str());
- if (overall_status.ok() && !status.ok()) {
- // If we've succeeded to add everything so far, set overall_status to
- // represent this new failure. If we've already failed, no need to
- // update the status - we're already going to return a resource
- // exhausted error.
- overall_status = status;
+ switch (options_.token_limit_behavior) {
+ case Options::TokenLimitBehavior::kReturnError:
+ return absl_ports::ResourceExhaustedError(
+ "Max number of tokens reached!");
+ case Options::TokenLimitBehavior::kSuppressError:
+ return overall_status;
}
}
+ std::string term = normalizer_.NormalizeTerm(token);
+ // Add this term to Hit buffer. Even if adding this hit fails, we keep
+ // trying to add more hits because it's possible that future hits could
+ // still be added successfully. For instance if the lexicon is full, we
+ // might fail to add a hit for a new term, but should still be able to
+ // add hits for terms that are already in the index.
+ auto status = editor.BufferTerm(term.c_str());
+ if (overall_status.ok() && !status.ok()) {
+ // If we've succeeded to add everything so far, set overall_status to
+ // represent this new failure. If we've already failed, no need to
+ // update the status - we're already going to return a resource
+ // exhausted error.
+ overall_status = status;
+ }
}
// Add all the seen terms to the index with their term frequency.
auto status = editor.IndexAllBufferedTerms();
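Note: with tokenization moved out of IndexProcessor and into TokenizedDocument, IndexDocument now just walks pre-tokenized sections, normalizes each token, and buffers it; the remaining knobs are the token cap and what to do when it is exceeded. A hedged configuration sketch using the Options fields that appear in this file (the cap value is an arbitrary example):

    IndexProcessor::Options options;
    options.max_tokens_per_document = 64 * 1024;  // arbitrary example cap
    // kReturnError: return RESOURCE_EXHAUSTED once the cap is exceeded.
    // kSuppressError: keep what was indexed up to the cap and return the
    // status accumulated so far (OK if every BufferTerm call succeeded).
    options.token_limit_behavior =
        IndexProcessor::Options::TokenLimitBehavior::kReturnError;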
diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h
index 2eb4ad8..9fc7c46 100644
--- a/icing/index/index-processor.h
+++ b/icing/index/index-processor.h
@@ -21,12 +21,11 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/index/index.h"
#include "icing/proto/document.pb.h"
-#include "icing/schema/schema-store.h"
#include "icing/schema/section-manager.h"
#include "icing/store/document-id.h"
-#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/token.h"
#include "icing/transform/normalizer.h"
+#include "icing/util/tokenized-document.h"
namespace icing {
namespace lib {
@@ -58,14 +57,13 @@ class IndexProcessor {
// An IndexProcessor on success
// FAILED_PRECONDITION if any of the pointers is null.
static libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>> Create(
- const SchemaStore* schema_store, const LanguageSegmenter* lang_segmenter,
const Normalizer* normalizer, Index* index, const Options& options,
const Clock* clock);
- // Add document to the index, associated with document_id. If the number of
- // tokens in the document exceeds max_tokens_per_document, then only the first
- // max_tokens_per_document will be added to the index. All tokens of length
- // exceeding max_token_length will be shortened to max_token_length.
+ // Add tokenized document to the index, associated with document_id. If the
+ // number of tokens in the document exceeds max_tokens_per_document, then only
+ // the first max_tokens_per_document will be added to the index. All tokens of
+ // length exceeding max_token_length will be shortened to max_token_length.
//
// Indexing a document *may* trigger an index merge. If a merge fails, then
// all content in the index will be lost.
@@ -82,25 +80,19 @@ class IndexProcessor {
// NOT_FOUND if there is no definition for the document's schema type.
// INTERNAL_ERROR if any other errors occur
libtextclassifier3::Status IndexDocument(
- const DocumentProto& document, DocumentId document_id,
+ const TokenizedDocument& tokenized_document, DocumentId document_id,
NativePutDocumentStats* put_document_stats = nullptr);
private:
- IndexProcessor(const SchemaStore* schema_store,
- const LanguageSegmenter* lang_segmenter,
- const Normalizer* normalizer, Index* index,
+ IndexProcessor(const Normalizer* normalizer, Index* index,
const Options& options, const Clock* clock)
- : schema_store_(*schema_store),
- lang_segmenter_(*lang_segmenter),
- normalizer_(*normalizer),
+ : normalizer_(*normalizer),
index_(index),
options_(options),
clock_(*clock) {}
std::string NormalizeToken(const Token& token);
- const SchemaStore& schema_store_;
- const LanguageSegmenter& lang_segmenter_;
const Normalizer& normalizer_;
Index* const index_;
const Options options_;
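Note: taken together, these header changes mean callers tokenize a document first and hand the result to a slimmer IndexProcessor. A minimal sketch of the new call pattern, pieced together from the test and benchmark changes below (creation of the schema store, language segmenter, normalizer, index and clock is elided, and error handling is simplified):

    // Tokenization happens up front, outside IndexProcessor.
    ICING_ASSIGN_OR_RETURN(
        TokenizedDocument tokenized_document,
        TokenizedDocument::Create(schema_store.get(), lang_segmenter.get(),
                                  document));

    // IndexProcessor no longer needs the schema store or segmenter.
    ICING_ASSIGN_OR_RETURN(
        std::unique_ptr<IndexProcessor> index_processor,
        IndexProcessor::Create(normalizer.get(), index.get(), options, &clock));

    libtextclassifier3::Status status =
        index_processor->IndexDocument(tokenized_document, document_id);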
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 96a390b..afeac4d 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -31,6 +31,7 @@
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
#include "icing/util/logging.h"
+#include "icing/util/tokenized-document.h"
#include "unicode/uloc.h"
// Run on a Linux workstation:
@@ -168,16 +169,13 @@ void CleanUp(const Filesystem& filesystem, const std::string& index_dir) {
}
std::unique_ptr<IndexProcessor> CreateIndexProcessor(
- const SchemaStore* schema_store,
- const LanguageSegmenter* language_segmenter, const Normalizer* normalizer,
- Index* index, const Clock* clock) {
+ const Normalizer* normalizer, Index* index, const Clock* clock) {
IndexProcessor::Options processor_options{};
processor_options.max_tokens_per_document = 1024 * 1024 * 10;
processor_options.token_limit_behavior =
IndexProcessor::Options::TokenLimitBehavior::kReturnError;
- return IndexProcessor::Create(schema_store, language_segmenter, normalizer,
- index, processor_options, clock)
+ return IndexProcessor::Create(normalizer, index, processor_options, clock)
.ValueOrDie();
}
@@ -203,15 +201,18 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) {
Clock clock;
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
std::unique_ptr<IndexProcessor> index_processor =
- CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
- normalizer.get(), index.get(), &clock);
+ CreateIndexProcessor(normalizer.get(), index.get(), &clock);
DocumentProto input_document = CreateDocumentWithOneProperty(state.range(0));
+ TokenizedDocument tokenized_document(std::move(
+ TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+ input_document)
+ .ValueOrDie()));
DocumentId document_id = 0;
for (auto _ : state) {
ICING_ASSERT_OK(
- index_processor->IndexDocument(input_document, document_id++));
+ index_processor->IndexDocument(tokenized_document, document_id++));
}
CleanUp(filesystem, index_dir);
@@ -254,16 +255,19 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) {
Clock clock;
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
std::unique_ptr<IndexProcessor> index_processor =
- CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
- normalizer.get(), index.get(), &clock);
+ CreateIndexProcessor(normalizer.get(), index.get(), &clock);
DocumentProto input_document =
CreateDocumentWithTenProperties(state.range(0));
+ TokenizedDocument tokenized_document(std::move(
+ TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+ input_document)
+ .ValueOrDie()));
DocumentId document_id = 0;
for (auto _ : state) {
ICING_ASSERT_OK(
- index_processor->IndexDocument(input_document, document_id++));
+ index_processor->IndexDocument(tokenized_document, document_id++));
}
CleanUp(filesystem, index_dir);
@@ -306,16 +310,19 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) {
Clock clock;
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
std::unique_ptr<IndexProcessor> index_processor =
- CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
- normalizer.get(), index.get(), &clock);
+ CreateIndexProcessor(normalizer.get(), index.get(), &clock);
DocumentProto input_document =
CreateDocumentWithDiacriticLetters(state.range(0));
+ TokenizedDocument tokenized_document(std::move(
+ TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+ input_document)
+ .ValueOrDie()));
DocumentId document_id = 0;
for (auto _ : state) {
ICING_ASSERT_OK(
- index_processor->IndexDocument(input_document, document_id++));
+ index_processor->IndexDocument(tokenized_document, document_id++));
}
CleanUp(filesystem, index_dir);
@@ -358,15 +365,18 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) {
Clock clock;
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
std::unique_ptr<IndexProcessor> index_processor =
- CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
- normalizer.get(), index.get(), &clock);
+ CreateIndexProcessor(normalizer.get(), index.get(), &clock);
DocumentProto input_document = CreateDocumentWithHiragana(state.range(0));
+ TokenizedDocument tokenized_document(std::move(
+ TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+ input_document)
+ .ValueOrDie()));
DocumentId document_id = 0;
for (auto _ : state) {
ICING_ASSERT_OK(
- index_processor->IndexDocument(input_document, document_id++));
+ index_processor->IndexDocument(tokenized_document, document_id++));
}
CleanUp(filesystem, index_dir);
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index bdd9575..e6bb615 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -53,6 +53,7 @@
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "icing/util/tokenized-document.h"
#include "unicode/uloc.h"
namespace icing {
@@ -140,8 +141,7 @@ class IndexProcessorTest : public Test {
ICING_ASSERT_OK_AND_ASSIGN(
index_processor_,
- IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer_.get(), index_.get(),
+ IndexProcessor::Create(normalizer_.get(), index_.get(),
processor_options, &fake_clock_));
mock_icing_filesystem_ = std::make_unique<IcingMockFilesystem>();
}
@@ -195,7 +195,7 @@ class IndexProcessorTest : public Test {
type_config->set_schema_type(std::string(kFakeType));
AddStringProperty(std::string(kExactProperty), DataType::STRING,
- Cardinality::REQUIRED, TermMatchType::EXACT_ONLY,
+ Cardinality::OPTIONAL, TermMatchType::EXACT_ONLY,
type_config);
AddStringProperty(std::string(kPrefixedProperty), DataType::STRING,
@@ -244,25 +244,11 @@ TEST_F(IndexProcessorTest, CreationWithNullPointerShouldFail) {
processor_options.token_limit_behavior =
IndexProcessor::Options::TokenLimitBehavior::kReturnError;
- EXPECT_THAT(
- IndexProcessor::Create(/*schema_store=*/nullptr, lang_segmenter_.get(),
- normalizer_.get(), index_.get(), processor_options,
- &fake_clock_),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-
- EXPECT_THAT(
- IndexProcessor::Create(schema_store_.get(), /*lang_segmenter=*/nullptr,
- normalizer_.get(), index_.get(), processor_options,
- &fake_clock_),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-
- EXPECT_THAT(IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- /*normalizer=*/nullptr, index_.get(),
+ EXPECT_THAT(IndexProcessor::Create(/*normalizer=*/nullptr, index_.get(),
processor_options, &fake_clock_),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
- EXPECT_THAT(IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer_.get(), /*index=*/nullptr,
+ EXPECT_THAT(IndexProcessor::Create(normalizer_.get(), /*index=*/nullptr,
processor_options, &fake_clock_),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
@@ -276,7 +262,12 @@ TEST_F(IndexProcessorTest, NoTermMatchTypeContent) {
.AddBytesProperty(std::string(kUnindexedProperty2),
"attachment bytes")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
}
@@ -287,7 +278,12 @@ TEST_F(IndexProcessorTest, OneDoc) {
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kExactProperty), "hello world")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
@@ -313,7 +309,12 @@ TEST_F(IndexProcessorTest, MultipleDocs) {
.AddStringProperty(std::string(kExactProperty), "hello world")
.AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
std::string coffeeRepeatedString = "coffee";
@@ -329,7 +330,12 @@ TEST_F(IndexProcessorTest, MultipleDocs) {
.AddStringProperty(std::string(kPrefixedProperty),
"mr. world world wide")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
@@ -372,11 +378,18 @@ TEST_F(IndexProcessorTest, DocWithNestedProperty) {
.AddDocumentProperty(
std::string(kSubProperty),
DocumentBuilder()
+ .SetKey("icing", "nested_type/1")
+ .SetSchema(std::string(kNestedType))
.AddStringProperty(std::string(kNestedProperty),
"rocky raccoon")
.Build())
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
@@ -396,7 +409,12 @@ TEST_F(IndexProcessorTest, DocWithRepeatedProperty) {
.AddStringProperty(std::string(kRepeatedProperty), "rocky",
"italian stallion")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
@@ -417,8 +435,7 @@ TEST_F(IndexProcessorTest, TooManyTokensReturnError) {
ICING_ASSERT_OK_AND_ASSIGN(
index_processor_,
- IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer_.get(), index_.get(), options,
+ IndexProcessor::Create(normalizer_.get(), index_.get(), options,
&fake_clock_));
DocumentProto document =
@@ -428,7 +445,11 @@ TEST_F(IndexProcessorTest, TooManyTokensReturnError) {
.AddStringProperty(std::string(kExactProperty), "hello world")
.AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
@@ -457,8 +478,7 @@ TEST_F(IndexProcessorTest, TooManyTokensSuppressError) {
ICING_ASSERT_OK_AND_ASSIGN(
index_processor_,
- IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer_.get(), index_.get(), options,
+ IndexProcessor::Create(normalizer_.get(), index_.get(), options,
&fake_clock_));
DocumentProto document =
@@ -468,7 +488,12 @@ TEST_F(IndexProcessorTest, TooManyTokensSuppressError) {
.AddStringProperty(std::string(kExactProperty), "hello world")
.AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
// "night" should have been indexed.
@@ -498,8 +523,7 @@ TEST_F(IndexProcessorTest, TooLongTokens) {
ICING_ASSERT_OK_AND_ASSIGN(
index_processor_,
- IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer.get(), index_.get(), options,
+ IndexProcessor::Create(normalizer.get(), index_.get(), options,
&fake_clock_));
DocumentProto document =
@@ -509,7 +533,12 @@ TEST_F(IndexProcessorTest, TooLongTokens) {
.AddStringProperty(std::string(kExactProperty), "hello world")
.AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
// "good" should have been indexed normally.
@@ -542,7 +571,12 @@ TEST_F(IndexProcessorTest, NonPrefixedContentPrefixQuery) {
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kExactProperty), "best rocky movies")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
document =
@@ -551,7 +585,12 @@ TEST_F(IndexProcessorTest, NonPrefixedContentPrefixQuery) {
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kPrefixedProperty), "rocky raccoon")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
// Only document_id 1 should surface in a prefix query for "Rock"
@@ -570,7 +609,12 @@ TEST_F(IndexProcessorTest, TokenNormalization) {
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
document =
@@ -579,7 +623,12 @@ TEST_F(IndexProcessorTest, TokenNormalization) {
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kExactProperty), "all lower case")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
@@ -600,7 +649,12 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) {
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
// Indexing a document with document_id < last_added_document_id should cause
@@ -611,11 +665,15 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) {
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kExactProperty), "all lower case")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
// As should indexing a document document_id == last_added_document_id.
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
@@ -635,8 +693,7 @@ TEST_F(IndexProcessorTest, NonAsciiIndexing) {
ICING_ASSERT_OK_AND_ASSIGN(
index_processor_,
- IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer_.get(), index_.get(),
+ IndexProcessor::Create(normalizer_.get(), index_.get(),
processor_options, &fake_clock_));
DocumentProto document =
@@ -646,7 +703,12 @@ TEST_F(IndexProcessorTest, NonAsciiIndexing) {
.AddStringProperty(std::string(kExactProperty),
"你好,世界!你好:世界。“你好”世界?")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
@@ -666,8 +728,7 @@ TEST_F(IndexProcessorTest,
ICING_ASSERT_OK_AND_ASSIGN(
index_processor_,
- IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer_.get(), index_.get(), processor_options,
+ IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options,
&fake_clock_));
// This is the maximum token length that an empty lexicon constructed for a
@@ -684,7 +745,11 @@ TEST_F(IndexProcessorTest,
absl_ports::StrCat(enormous_string, " foo"))
.AddStringProperty(std::string(kPrefixedProperty), "bar baz")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
@@ -715,6 +780,10 @@ TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) {
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kExactProperty), kIpsumText)
.Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
Index::Options options(index_dir_,
/*index_merge_size=*/document.ByteSizeLong() * 100);
ICING_ASSERT_OK_AND_ASSIGN(
@@ -727,8 +796,7 @@ TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) {
ICING_ASSERT_OK_AND_ASSIGN(
index_processor_,
- IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer_.get(), index_.get(), processor_options,
+ IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options,
&fake_clock_));
DocumentId doc_id = 0;
// Have determined experimentally that indexing 3373 documents with this text
@@ -737,10 +805,12 @@ TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) {
// empties the LiteIndex.
constexpr int kNumDocsLiteIndexExhaustion = 3373;
for (; doc_id < kNumDocsLiteIndexExhaustion; ++doc_id) {
- EXPECT_THAT(index_processor_->IndexDocument(document, doc_id), IsOk());
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
}
- EXPECT_THAT(index_processor_->IndexDocument(document, doc_id), IsOk());
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
}
@@ -768,6 +838,10 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) {
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kPrefixedProperty), kIpsumText)
.Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
// 2. Recreate the index with the mock filesystem and a merge size that will
// only allow one document to be added before requiring a merge.
@@ -784,25 +858,26 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) {
ICING_ASSERT_OK_AND_ASSIGN(
index_processor_,
- IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer_.get(), index_.get(), processor_options,
+ IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options,
&fake_clock_));
// 3. Index one document. This should fit in the LiteIndex without requiring a
// merge.
DocumentId doc_id = 0;
- EXPECT_THAT(index_processor_->IndexDocument(document, doc_id), IsOk());
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
// 4. Add one more document to trigger a merge, which should fail and result
// in a Reset.
++doc_id;
- EXPECT_THAT(index_processor_->IndexDocument(document, doc_id),
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
StatusIs(libtextclassifier3::StatusCode::DATA_LOSS));
EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
// 5. Indexing a new document should succeed.
- EXPECT_THAT(index_processor_->IndexDocument(document, doc_id), IsOk());
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
}
diff --git a/icing/index/iterator/doc-hit-info-iterator-and.h b/icing/index/iterator/doc-hit-info-iterator-and.h
index 4618fb9..faca785 100644
--- a/icing/index/iterator/doc-hit-info-iterator-and.h
+++ b/icing/index/iterator/doc-hit-info-iterator-and.h
@@ -46,6 +46,16 @@ class DocHitInfoIteratorAnd : public DocHitInfoIterator {
std::string ToString() const override;
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo> *matched_terms_stats) const override {
+ if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ short_->PopulateMatchedTermsStats(matched_terms_stats);
+ long_->PopulateMatchedTermsStats(matched_terms_stats);
+ }
+
private:
std::unique_ptr<DocHitInfoIterator> short_;
std::unique_ptr<DocHitInfoIterator> long_;
@@ -67,6 +77,17 @@ class DocHitInfoIteratorAndNary : public DocHitInfoIterator {
std::string ToString() const override;
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo> *matched_terms_stats) const override {
+ if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ for (size_t i = 0; i < iterators_.size(); ++i) {
+ iterators_.at(i)->PopulateMatchedTermsStats(matched_terms_stats);
+ }
+ }
+
private:
std::vector<std::unique_ptr<DocHitInfoIterator>> iterators_;
};
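Note: the AND iterators now surface per-term match statistics for the current document by delegating to each child. A condensed usage sketch mirroring the tests below; TermMatchInfo's term, section_ids_mask and term_frequencies fields are the ones the tests inspect:

    std::vector<TermMatchInfo> matched_terms_stats;
    and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
    // Before a successful Advance() there is no current hit, so the vector stays empty.

    ICING_EXPECT_OK(and_iter.Advance());
    and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
    for (const TermMatchInfo& info : matched_terms_stats) {
      // info.term matched the current document in the sections recorded by
      // info.section_ids_mask, with per-section counts in info.term_frequencies.
    }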
diff --git a/icing/index/iterator/doc-hit-info-iterator-and_test.cc b/icing/index/iterator/doc-hit-info-iterator-and_test.cc
index 35574b7..783e937 100644
--- a/icing/index/iterator/doc-hit-info-iterator-and_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-and_test.cc
@@ -32,8 +32,10 @@ namespace lib {
namespace {
using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
using ::testing::Eq;
using ::testing::IsEmpty;
+using ::testing::SizeIs;
TEST(CreateAndIteratorTest, And) {
// Basic test that we can create a working And iterator. Further testing of
@@ -196,6 +198,125 @@ TEST(DocHitInfoIteratorAndTest, SectionIdMask) {
EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
}
+TEST(DocHitInfoIteratorAndTest, PopulateMatchedTermsStats) {
+ {
+ // Arbitrary section ids for the documents in the DocHitInfoIterators.
+ // Created to test correct section_id_mask behavior.
+ SectionIdMask section_id_mask1 = 0b01010101; // hits in sections 0, 2, 4, 6
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+ 1, 0, 2, 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0};
+ SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2{
+ 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+ DocHitInfo doc_hit_info1 = DocHitInfo(4);
+ doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+ doc_hit_info1.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/3);
+ doc_hit_info1.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+
+ DocHitInfo doc_hit_info2 = DocHitInfo(4);
+ doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+ doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+ std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+ std::vector<DocHitInfo> second_vector = {doc_hit_info2};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+ second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+
+ DocHitInfoIteratorAnd and_iter(std::move(first_iter),
+ std::move(second_iter));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(and_iter.Advance());
+ EXPECT_THAT(and_iter.doc_hit_info().document_id(), Eq(4));
+
+ and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ ASSERT_THAT(matched_terms_stats, SizeIs(2)); // 2 terms
+ EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+ EXPECT_EQ(matched_terms_stats.at(1).term, "hello");
+ EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+ ElementsAreArray(term_frequencies1));
+ EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1);
+ EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+ ElementsAreArray(term_frequencies2));
+ EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask2);
+
+ EXPECT_FALSE(and_iter.Advance().ok());
+ }
+ {
+ // Arbitrary section ids for the documents in the DocHitInfoIterators.
+ // Created to test correct section_id_mask behavior.
+ SectionIdMask section_id_mask1 = 0b00000101; // hits in sections 0, 2
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+ 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+ DocHitInfo doc_hit_info1 = DocHitInfo(4);
+ doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+
+ std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+ std::vector<DocHitInfo> second_vector = {doc_hit_info1};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hi");
+ second_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+ DocHitInfoIteratorAnd and_iter(std::move(first_iter),
+ std::move(second_iter));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(and_iter.Advance());
+ EXPECT_THAT(and_iter.doc_hit_info().document_id(), Eq(4));
+
+ and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ ASSERT_THAT(matched_terms_stats, SizeIs(1)); // 1 term
+ EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+ EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+ ElementsAreArray(term_frequencies1));
+ EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1);
+
+ EXPECT_FALSE(and_iter.Advance().ok());
+ }
+}
+
+TEST(DocHitInfoIteratorAndTest, PopulateMatchedTermsStats_NoMatchingDocument) {
+ DocHitInfo doc_hit_info1 = DocHitInfo(4);
+ doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+
+ DocHitInfo doc_hit_info2 = DocHitInfo(5);
+ doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+ doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+ std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+ std::vector<DocHitInfo> second_vector = {doc_hit_info2};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+
+ DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+ EXPECT_FALSE(and_iter.Advance().ok());
+}
+
TEST(DocHitInfoIteratorAndNaryTest, Initialize) {
std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>());
@@ -345,6 +466,90 @@ TEST(DocHitInfoIteratorAndNaryTest, SectionIdMask) {
EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
}
+TEST(DocHitInfoIteratorAndNaryTest, PopulateMatchedTermsStats) {
+ // Arbitrary section ids/term frequencies for the documents in the
+ // DocHitInfoIterators.
+ // For term "hi", document 10 and 8
+ SectionIdMask section_id_mask1_hi = 0b01000101; // hits in sections 0, 2, 6
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hi{
+ 1, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0};
+ DocHitInfo doc_hit_info1_hi = DocHitInfo(10);
+ doc_hit_info1_hi.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+ doc_hit_info1_hi.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+
+ DocHitInfo doc_hit_info2_hi = DocHitInfo(8);
+ doc_hit_info2_hi.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+ doc_hit_info2_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+ // For term "hello", document 10 and 9
+ SectionIdMask section_id_mask1_hello = 0b00001001; // hits in sections 0, 3
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hello{
+ 2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ DocHitInfo doc_hit_info1_hello = DocHitInfo(10);
+ doc_hit_info1_hello.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2);
+ doc_hit_info1_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3);
+
+ DocHitInfo doc_hit_info2_hello = DocHitInfo(9);
+ doc_hit_info2_hello.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/3);
+ doc_hit_info2_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/2);
+
+ // For term "ciao", document 10 and 9
+ SectionIdMask section_id_mask1_ciao = 0b00000011; // hits in sections 0, 1
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_ciao{
+ 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ DocHitInfo doc_hit_info1_ciao = DocHitInfo(10);
+ doc_hit_info1_ciao.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2);
+ doc_hit_info1_ciao.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/3);
+
+ DocHitInfo doc_hit_info2_ciao = DocHitInfo(9);
+ doc_hit_info2_ciao.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3);
+ doc_hit_info2_ciao.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/2);
+
+ std::vector<DocHitInfo> first_vector = {doc_hit_info1_hi, doc_hit_info2_hi};
+ std::vector<DocHitInfo> second_vector = {doc_hit_info1_hello,
+ doc_hit_info2_hello};
+ std::vector<DocHitInfo> third_vector = {doc_hit_info1_ciao,
+ doc_hit_info2_ciao};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+ auto third_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(third_vector, "ciao");
+
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ iterators.push_back(std::move(first_iter));
+ iterators.push_back(std::move(second_iter));
+ iterators.push_back(std::move(third_iter));
+
+ DocHitInfoIteratorAndNary and_iter(std::move(iterators));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(and_iter.Advance());
+ EXPECT_THAT(and_iter.doc_hit_info().document_id(), Eq(10));
+
+ and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ ASSERT_THAT(matched_terms_stats, SizeIs(3)); // 3 terms
+ EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+ EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+ ElementsAreArray(term_frequencies1_hi));
+ EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1_hi);
+ EXPECT_EQ(matched_terms_stats.at(1).term, "hello");
+ EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+ ElementsAreArray(term_frequencies1_hello));
+ EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask1_hello);
+ EXPECT_EQ(matched_terms_stats.at(2).term, "ciao");
+ EXPECT_THAT(matched_terms_stats.at(2).term_frequencies,
+ ElementsAreArray(term_frequencies1_ciao));
+ EXPECT_EQ(matched_terms_stats.at(2).section_ids_mask, section_id_mask1_ciao);
+
+ EXPECT_FALSE(and_iter.Advance().ok());
+}
+
} // namespace
} // namespace lib
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.h b/icing/index/iterator/doc-hit-info-iterator-filter.h
index 9119610..fb60e38 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter.h
+++ b/icing/index/iterator/doc-hit-info-iterator-filter.h
@@ -67,6 +67,11 @@ class DocHitInfoIteratorFilter : public DocHitInfoIterator {
std::string ToString() const override;
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats) const override {
+ delegate_->PopulateMatchedTermsStats(matched_terms_stats);
+ }
+
private:
std::unique_ptr<DocHitInfoIterator> delegate_;
const DocumentStore& document_store_;
diff --git a/icing/index/iterator/doc-hit-info-iterator-or.cc b/icing/index/iterator/doc-hit-info-iterator-or.cc
index 8f00f88..b4234e0 100644
--- a/icing/index/iterator/doc-hit-info-iterator-or.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-or.cc
@@ -108,6 +108,7 @@ libtextclassifier3::Status DocHitInfoIteratorOr::Advance() {
} else {
chosen = left_.get();
}
+ current_ = chosen;
doc_hit_info_ = chosen->doc_hit_info();
hit_intersect_section_ids_mask_ = chosen->hit_intersect_section_ids_mask();
@@ -139,6 +140,7 @@ DocHitInfoIteratorOrNary::DocHitInfoIteratorOrNary(
: iterators_(std::move(iterators)) {}
libtextclassifier3::Status DocHitInfoIteratorOrNary::Advance() {
+ current_iterators_.clear();
if (iterators_.size() < 2) {
return absl_ports::InvalidArgumentError(
"Not enough iterators to OR together");
@@ -187,6 +189,7 @@ libtextclassifier3::Status DocHitInfoIteratorOrNary::Advance() {
hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
for (const auto& iterator : iterators_) {
if (iterator->doc_hit_info().document_id() == next_document_id) {
+ current_iterators_.push_back(iterator.get());
if (doc_hit_info_.document_id() == kInvalidDocumentId) {
doc_hit_info_ = iterator->doc_hit_info();
hit_intersect_section_ids_mask_ =
diff --git a/icing/index/iterator/doc-hit-info-iterator-or.h b/icing/index/iterator/doc-hit-info-iterator-or.h
index 4128e0f..2f49430 100644
--- a/icing/index/iterator/doc-hit-info-iterator-or.h
+++ b/icing/index/iterator/doc-hit-info-iterator-or.h
@@ -42,9 +42,26 @@ class DocHitInfoIteratorOr : public DocHitInfoIterator {
std::string ToString() const override;
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo> *matched_terms_stats) const override {
+ if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ current_->PopulateMatchedTermsStats(matched_terms_stats);
+ // If equal, then current_ == left_. Combine with results from right_.
+ if (left_document_id_ == right_document_id_) {
+ right_->PopulateMatchedTermsStats(matched_terms_stats);
+ }
+ }
+
private:
std::unique_ptr<DocHitInfoIterator> left_;
std::unique_ptr<DocHitInfoIterator> right_;
+ // Pointer to the chosen iterator that points to the current doc_hit_info_. If
+ // both left_ and right_ point to the same docid, then chosen_ == left.
+ // chosen_ does not own the iterator it points to.
+ DocHitInfoIterator *current_;
DocumentId left_document_id_ = kMaxDocumentId;
DocumentId right_document_id_ = kMaxDocumentId;
};
@@ -65,8 +82,22 @@ class DocHitInfoIteratorOrNary : public DocHitInfoIterator {
std::string ToString() const override;
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo> *matched_terms_stats) const override {
+ if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ for (size_t i = 0; i < current_iterators_.size(); i++) {
+ current_iterators_.at(i)->PopulateMatchedTermsStats(matched_terms_stats);
+ }
+ }
+
private:
std::vector<std::unique_ptr<DocHitInfoIterator>> iterators_;
+ // Pointers to the iterators that point to the current doc_hit_info_.
+ // current_iterators_ does not own the iterators it points to.
+ std::vector<DocHitInfoIterator *> current_iterators_;
};
} // namespace lib
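Note: unlike AND, an OR iterator has to remember which branch produced the current document: the binary version records it in current_ when Advance() picks a side (see doc-hit-info-iterator-or.cc above), and the n-ary version collects the matching children in current_iterators_, so PopulateMatchedTermsStats only consults iterators that actually matched. Condensed from the tests below, assuming two branches over the terms "hi" and "hello" that match different documents:

    DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
    std::vector<TermMatchInfo> matched_terms_stats;

    ICING_EXPECT_OK(or_iter.Advance());  // document matched only by "hello"
    or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
    // matched_terms_stats holds a single entry for "hello".

    matched_terms_stats.clear();
    ICING_EXPECT_OK(or_iter.Advance());  // document matched only by "hi"
    or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
    // matched_terms_stats now holds a single entry for "hi".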
diff --git a/icing/index/iterator/doc-hit-info-iterator-or_test.cc b/icing/index/iterator/doc-hit-info-iterator-or_test.cc
index 3faa5ab..3f00a39 100644
--- a/icing/index/iterator/doc-hit-info-iterator-or_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-or_test.cc
@@ -32,7 +32,10 @@ namespace lib {
namespace {
using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
TEST(CreateAndIteratorTest, Or) {
// Basic test that we can create a working Or iterator. Further testing of
@@ -175,6 +178,159 @@ TEST(DocHitInfoIteratorOrTest, SectionIdMask) {
EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
}
+TEST(DocHitInfoIteratorOrTest, PopulateMatchedTermsStats) {
+ {
+ // Arbitrary section ids for the documents in the DocHitInfoIterators.
+ // Created to test correct section_id_mask behavior.
+ SectionIdMask section_id_mask1 = 0b01010101; // hits in sections 0, 2, 4, 6
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+ 1, 0, 2, 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0};
+ SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2{
+ 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+ DocHitInfo doc_hit_info1 = DocHitInfo(4);
+ doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+ doc_hit_info1.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/3);
+ doc_hit_info1.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+
+ DocHitInfo doc_hit_info2 = DocHitInfo(4);
+ doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+ doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+ std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+ std::vector<DocHitInfo> second_vector = {doc_hit_info2};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+ second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+
+ DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(or_iter.Advance());
+ EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(4));
+
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ ASSERT_THAT(matched_terms_stats, SizeIs(2)); // 2 terms
+ EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+ EXPECT_EQ(matched_terms_stats.at(1).term, "hello");
+ EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+ ElementsAreArray(term_frequencies1));
+ EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1);
+ EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+ ElementsAreArray(term_frequencies2));
+ EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask2);
+
+ EXPECT_FALSE(or_iter.Advance().ok());
+ }
+ {
+ // Arbitrary section ids for the documents in the DocHitInfoIterators.
+ // Created to test correct section_id_mask behavior.
+ SectionIdMask section_id_mask1 = 0b00000101; // hits in sections 0, 2
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+ 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+ DocHitInfo doc_hit_info1 = DocHitInfo(4);
+ doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+
+ std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+ std::vector<DocHitInfo> second_vector = {doc_hit_info1};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hi");
+ second_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+ DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(or_iter.Advance());
+ EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(4));
+
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ ASSERT_THAT(matched_terms_stats, SizeIs(1)); // 1 term
+ EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+ EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+ ElementsAreArray(term_frequencies1));
+ EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1);
+
+ EXPECT_FALSE(or_iter.Advance().ok());
+ }
+ {
+ // Arbitrary section ids for the documents in the DocHitInfoIterators.
+ // Created to test correct section_id_mask behavior.
+ SectionIdMask section_id_mask1 = 0b01010101; // hits in sections 0, 2, 4, 6
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+ 1, 0, 2, 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0};
+ SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2{
+ 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+ DocHitInfo doc_hit_info1 = DocHitInfo(4);
+ doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+ doc_hit_info1.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/3);
+ doc_hit_info1.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+
+ DocHitInfo doc_hit_info2 = DocHitInfo(5);
+ doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+ doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+ std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+ std::vector<DocHitInfo> second_vector = {doc_hit_info2};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+ second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+
+ DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(or_iter.Advance());
+ EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(5));
+
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ ASSERT_THAT(matched_terms_stats, SizeIs(1)); // 1 term
+ EXPECT_EQ(matched_terms_stats.at(0).term, "hello");
+ EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+ ElementsAreArray(term_frequencies2));
+ EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask2);
+
+ ICING_EXPECT_OK(or_iter.Advance());
+ EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(4));
+
+ matched_terms_stats.clear();
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ ASSERT_THAT(matched_terms_stats, SizeIs(1)); // 1 term
+ EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+ EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+ ElementsAreArray(term_frequencies1));
+ EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1);
+
+ EXPECT_FALSE(or_iter.Advance().ok());
+ }
+}
+
TEST(DocHitInfoIteratorOrNaryTest, Initialize) {
std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>());
@@ -316,6 +472,125 @@ TEST(DocHitInfoIteratorOrNaryTest, SectionIdMask) {
EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
}
+TEST(DocHitInfoIteratorOrNaryTest, PopulateMatchedTermsStats) {
+ // Arbitrary section ids/term frequencies for the documents in the
+ // DocHitInfoIterators.
+ // For term "hi", document 10 and 8
+ SectionIdMask section_id_mask1_hi = 0b01000101; // hits in sections 0, 2, 6
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hi{
+ 1, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0};
+ DocHitInfo doc_hit_info1_hi = DocHitInfo(10);
+ doc_hit_info1_hi.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+ doc_hit_info1_hi.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+
+ SectionIdMask section_id_mask2_hi = 0b00000110; // hits in sections 1, 2
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2_hi{
+ 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ DocHitInfo doc_hit_info2_hi = DocHitInfo(8);
+ doc_hit_info2_hi.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+ doc_hit_info2_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+ // For term "hello", document 10 and 9
+ SectionIdMask section_id_mask1_hello = 0b00001001; // hits in sections 0, 3
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hello{
+ 2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ DocHitInfo doc_hit_info1_hello = DocHitInfo(10);
+ doc_hit_info1_hello.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2);
+ doc_hit_info1_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3);
+
+ SectionIdMask section_id_mask2_hello = 0b00001100; // hits in sections 2, 3
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2_hello{
+ 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ DocHitInfo doc_hit_info2_hello = DocHitInfo(9);
+ doc_hit_info2_hello.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/3);
+ doc_hit_info2_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/2);
+
+ // For term "ciao", document 9 and 8
+ SectionIdMask section_id_mask1_ciao = 0b00000011; // hits in sections 0, 1
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_ciao{
+ 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ DocHitInfo doc_hit_info1_ciao = DocHitInfo(9);
+ doc_hit_info1_ciao.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2);
+ doc_hit_info1_ciao.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/3);
+
+ SectionIdMask section_id_mask2_ciao = 0b00011000; // hits in sections 3, 4
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2_ciao{
+ 0, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ DocHitInfo doc_hit_info2_ciao = DocHitInfo(8);
+ doc_hit_info2_ciao.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3);
+ doc_hit_info2_ciao.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/2);
+
+ std::vector<DocHitInfo> first_vector = {doc_hit_info1_hi, doc_hit_info2_hi};
+ std::vector<DocHitInfo> second_vector = {doc_hit_info1_hello,
+ doc_hit_info2_hello};
+ std::vector<DocHitInfo> third_vector = {doc_hit_info1_ciao,
+ doc_hit_info2_ciao};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+ auto third_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(third_vector, "ciao");
+
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ iterators.push_back(std::move(first_iter));
+ iterators.push_back(std::move(second_iter));
+ iterators.push_back(std::move(third_iter));
+
+ DocHitInfoIteratorOrNary or_iter(std::move(iterators));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(or_iter.Advance());
+ EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(10));
+
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ ASSERT_THAT(matched_terms_stats, SizeIs(2)); // 2 terms
+ EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+ EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+ ElementsAreArray(term_frequencies1_hi));
+ EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1_hi);
+ EXPECT_EQ(matched_terms_stats.at(1).term, "hello");
+ EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+ ElementsAreArray(term_frequencies1_hello));
+ EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask1_hello);
+
+ ICING_EXPECT_OK(or_iter.Advance());
+ EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(9));
+
+ matched_terms_stats.clear();
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ ASSERT_THAT(matched_terms_stats, SizeIs(2)); // 2 terms
+ EXPECT_EQ(matched_terms_stats.at(0).term, "hello");
+ EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+ ElementsAreArray(term_frequencies2_hello));
+ EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask2_hello);
+ EXPECT_EQ(matched_terms_stats.at(1).term, "ciao");
+ EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+ ElementsAreArray(term_frequencies1_ciao));
+ EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask1_ciao);
+
+ ICING_EXPECT_OK(or_iter.Advance());
+ EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(8));
+
+ matched_terms_stats.clear();
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ ASSERT_THAT(matched_terms_stats, SizeIs(2)); // 2 terms
+ EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+ EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+ ElementsAreArray(term_frequencies2_hi));
+ EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask2_hi);
+ EXPECT_EQ(matched_terms_stats.at(1).term, "ciao");
+ EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+ ElementsAreArray(term_frequencies2_ciao));
+ EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask2_ciao);
+
+ EXPECT_FALSE(or_iter.Advance().ok());
+}
+
} // namespace
} // namespace lib
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h
index ae5a896..ba74384 100644
--- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h
@@ -52,6 +52,15 @@ class DocHitInfoIteratorSectionRestrict : public DocHitInfoIterator {
std::string ToString() const override;
+  // NOTE: currently, a section restrict does decide which documents to
+  // return, but doesn't impact the relevance score of a document.
+ // TODO(b/173156803): decide whether we want to filter the matched_terms_stats
+ // for the restricted sections.
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats) const override {
+ delegate_->PopulateMatchedTermsStats(matched_terms_stats);
+ }
+
private:
std::unique_ptr<DocHitInfoIterator> delegate_;
const DocumentStore& document_store_;
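If the TODO above is resolved in favor of filtering, one possible shape for it
(a hypothetical sketch only, assuming a restricted_section_mask_ member that
does not exist in this patch) would be to mask the delegate's output before
returning it:

    void PopulateMatchedTermsStats(
        std::vector<TermMatchInfo>* matched_terms_stats) const override {
      delegate_->PopulateMatchedTermsStats(matched_terms_stats);
      for (TermMatchInfo& stats : *matched_terms_stats) {
        // Keep only the sections that survive the restrict ...
        stats.section_ids_mask &= restricted_section_mask_;
        // ... and zero out frequencies for the sections that were dropped.
        for (SectionId id = 0; id < kMaxSectionId; ++id) {
          if ((stats.section_ids_mask & (1u << id)) == 0) {
            stats.term_frequencies.at(id) = Hit::kNoTermFrequency;
          }
        }
      }
    }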
diff --git a/icing/index/iterator/doc-hit-info-iterator-test-util.h b/icing/index/iterator/doc-hit-info-iterator-test-util.h
index c4d7aa7..913696a 100644
--- a/icing/index/iterator/doc-hit-info-iterator-test-util.h
+++ b/icing/index/iterator/doc-hit-info-iterator-test-util.h
@@ -15,7 +15,6 @@
#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TEST_UTIL_H_
#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TEST_UTIL_H_
-#include <cstdint>
#include <string>
#include <utility>
#include <vector>
@@ -40,8 +39,9 @@ namespace lib {
class DocHitInfoIteratorDummy : public DocHitInfoIterator {
public:
DocHitInfoIteratorDummy() = default;
- explicit DocHitInfoIteratorDummy(std::vector<DocHitInfo> doc_hit_infos)
- : doc_hit_infos_(std::move(doc_hit_infos)) {}
+ explicit DocHitInfoIteratorDummy(std::vector<DocHitInfo> doc_hit_infos,
+ std::string term = "")
+ : doc_hit_infos_(std::move(doc_hit_infos)), term_(std::move(term)) {}
libtextclassifier3::Status Advance() override {
if (index_ < doc_hit_infos_.size()) {
@@ -54,6 +54,36 @@ class DocHitInfoIteratorDummy : public DocHitInfoIterator {
"No more DocHitInfos in iterator");
}
+ // Imitates behavior of DocHitInfoIteratorTermMain/DocHitInfoIteratorTermLite
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats) const override {
+ if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask();
+ std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = {
+ Hit::kNoTermFrequency};
+
+ while (section_mask) {
+ SectionId section_id = __builtin_ctz(section_mask);
+ section_term_frequencies.at(section_id) =
+ doc_hit_info_.hit_term_frequency(section_id);
+ section_mask &= ~(1u << section_id);
+ }
+ TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(),
+ section_term_frequencies);
+
+ for (auto& cur_term_stats : *matched_terms_stats) {
+ if (cur_term_stats.term == term_stats.term) {
+        // Same doc id and same term: the term is already recorded, and its
+        // term frequencies should always be the same, so skip it.
+ return;
+ }
+ }
+ matched_terms_stats->push_back(term_stats);
+ }
+
void set_hit_intersect_section_ids_mask(
SectionIdMask hit_intersect_section_ids_mask) {
hit_intersect_section_ids_mask_ = hit_intersect_section_ids_mask;
@@ -91,6 +121,7 @@ class DocHitInfoIteratorDummy : public DocHitInfoIterator {
int32_t num_blocks_inspected_ = 0;
int32_t num_leaf_advance_calls_ = 0;
std::vector<DocHitInfo> doc_hit_infos_;
+ std::string term_;
};
inline std::vector<DocumentId> GetDocumentIds(DocHitInfoIterator* iterator) {
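For reference, the PopulateMatchedTermsStats loop above walks the section
bitmask one set bit at a time with __builtin_ctz. A minimal standalone sketch
of that walk, using a plain uint32_t in place of SectionIdMask and a mask
taken from the tests above:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t section_mask = 0b01000101;  // hits in sections 0, 2 and 6
      while (section_mask) {
        // __builtin_ctz returns the index of the lowest set bit.
        int section_id = __builtin_ctz(section_mask);
        std::printf("section %d has a hit\n", section_id);
        section_mask &= ~(1u << section_id);  // clear that bit and continue
      }
      return 0;  // prints sections 0, 2 and 6, in ascending order
    }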
diff --git a/icing/index/iterator/doc-hit-info-iterator.h b/icing/index/iterator/doc-hit-info-iterator.h
index bcc2b6e..c4d9901 100644
--- a/icing/index/iterator/doc-hit-info-iterator.h
+++ b/icing/index/iterator/doc-hit-info-iterator.h
@@ -17,6 +17,7 @@
#include <cstdint>
#include <string>
+#include <string_view>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
@@ -28,6 +29,26 @@
namespace icing {
namespace lib {
+// Data structure that maps a single matched query term to its section mask
+// and the list of term frequencies.
+// TODO(b/158603837): add stat on whether the matched terms are prefix matched
+// or not. This information will be used to boost exact match.
+struct TermMatchInfo {
+ std::string_view term;
+ // SectionIdMask associated to the term.
+ SectionIdMask section_ids_mask;
+  // Array with fixed size kMaxSectionId. For every section id, i.e. array
+  // index, it stores the term frequency of the term in that section.
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies;
+
+ explicit TermMatchInfo(
+ std::string_view term, SectionIdMask section_ids_mask,
+ std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies)
+ : term(term),
+ section_ids_mask(section_ids_mask),
+ term_frequencies(std::move(term_frequencies)) {}
+};
+
// Iterator over DocHitInfos (collapsed Hits) in REVERSE document_id order.
//
// NOTE: You must call Advance() before calling hit_info() or
@@ -70,6 +91,14 @@ class DocHitInfoIterator {
// A string representing the iterator.
virtual std::string ToString() const = 0;
+  // For the last hit doc id, retrieves all the matched query terms and other
+  // stats; see TermMatchInfo.
+  // If Advance() wasn't called after construction, if Advance() returned
+  // false, or if the concrete iterator didn't override this method,
+  // matched_terms_stats is left unpopulated.
+ virtual void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats) const {}
+
protected:
DocHitInfo doc_hit_info_;
SectionIdMask hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
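A short caller-side sketch of the new hook (assuming `iterator` points at a
DocHitInfoIterator whose concrete type overrides PopulateMatchedTermsStats;
the base-class default is a no-op that leaves the vector empty):

    std::vector<TermMatchInfo> matched_terms_stats;
    while (iterator->Advance().ok()) {
      matched_terms_stats.clear();
      iterator->PopulateMatchedTermsStats(&matched_terms_stats);
      for (const TermMatchInfo& stats : matched_terms_stats) {
        // stats.term matched the current document in the sections flagged by
        // stats.section_ids_mask; stats.term_frequencies holds the per-section
        // counts (Hit::kNoTermFrequency for sections without a hit).
      }
    }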
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h
index bd2de6d..ac5e97f 100644
--- a/icing/index/lite/doc-hit-info-iterator-term-lite.h
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h
@@ -49,6 +49,34 @@ class DocHitInfoIteratorTermLite : public DocHitInfoIterator {
}
int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats) const override {
+ if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask();
+ std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = {
+ Hit::kNoTermFrequency};
+ while (section_mask) {
+ SectionId section_id = __builtin_ctz(section_mask);
+ section_term_frequencies.at(section_id) =
+ doc_hit_info_.hit_term_frequency(section_id);
+ section_mask &= ~(1u << section_id);
+ }
+ TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(),
+ std::move(section_term_frequencies));
+
+ for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) {
+ if (cur_term_stats.term == term_stats.term) {
+        // Same doc id and same term: the term is already recorded, and its
+        // term frequencies should always be the same, so skip it.
+ return;
+ }
+ }
+ matched_terms_stats->push_back(std::move(term_stats));
+ }
+
protected:
// Add DocHitInfos corresponding to term_ to cached_hits_.
virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
diff --git a/icing/index/main/doc-hit-info-iterator-term-main.h b/icing/index/main/doc-hit-info-iterator-term-main.h
index 1f77226..d626d7a 100644
--- a/icing/index/main/doc-hit-info-iterator-term-main.h
+++ b/icing/index/main/doc-hit-info-iterator-term-main.h
@@ -49,6 +49,34 @@ class DocHitInfoIteratorTermMain : public DocHitInfoIterator {
}
int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats) const override {
+ if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask();
+ std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = {
+ Hit::kNoTermFrequency};
+ while (section_mask) {
+ SectionId section_id = __builtin_ctz(section_mask);
+ section_term_frequencies.at(section_id) =
+ doc_hit_info_.hit_term_frequency(section_id);
+ section_mask &= ~(1u << section_id);
+ }
+ TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(),
+ std::move(section_term_frequencies));
+
+ for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) {
+ if (cur_term_stats.term == term_stats.term) {
+        // Same doc id and same term: the term is already recorded, and its
+        // term frequencies should always be the same, so skip it.
+ return;
+ }
+ }
+ matched_terms_stats->push_back(std::move(term_stats));
+ }
+
protected:
// Add DocHitInfos corresponding to term_ to cached_doc_hit_infos_.
virtual libtextclassifier3::Status RetrieveMoreHits() = 0;