author | Tim Barron <tjbarron@google.com> | 2021-01-14 20:53:07 +0000
committer | Tim Barron <tjbarron@google.com> | 2021-01-14 20:53:07 +0000
commit | a34db390d80f862bfaaa49dea3605c5fec3bca3d (patch)
tree | 67a4a87803cf2b31619c3ddff3674967fc1461ce /icing/index
parent | 59c2caa38fd8dca3760dad751f4f8e5de8be25f5 (diff)
download | icing-a34db390d80f862bfaaa49dea3605c5fec3bca3d.tar.gz
Update Icing from upstream.
Change-Id: I43038a59e7170fb8ecbaf6098a37221b3682ce09
Diffstat (limited to 'icing/index')
18 files changed, 874 insertions, 144 deletions
diff --git a/icing/index/hit/doc-hit-info.h b/icing/index/hit/doc-hit-info.h
index 8171960..0be87d6 100644
--- a/icing/index/hit/doc-hit-info.h
+++ b/icing/index/hit/doc-hit-info.h
@@ -25,7 +25,7 @@ namespace icing {
 namespace lib {
 
-// DocHitInfo provides a collapsed view of all hits for a specific term and doc.
+// DocHitInfo provides a collapsed view of all hits for a specific doc.
 // Hits contain a document_id, section_id and a term frequency. The
 // information in multiple hits is collapsed into a DocHitInfo by providing a
 // SectionIdMask of all sections that contained a hit for this term as well as
@@ -36,7 +36,7 @@ class DocHitInfo {
       SectionIdMask hit_section_ids_mask = kSectionIdMaskNone)
       : document_id_(document_id_in),
         hit_section_ids_mask_(hit_section_ids_mask) {
-    memset(hit_term_frequency_, Hit::kDefaultTermFrequency,
+    memset(hit_term_frequency_, Hit::kNoTermFrequency,
           sizeof(hit_term_frequency_));
  }
diff --git a/icing/index/hit/doc-hit-info_test.cc b/icing/index/hit/doc-hit-info_test.cc
index 15c0de9..36c1a06 100644
--- a/icing/index/hit/doc-hit-info_test.cc
+++ b/icing/index/hit/doc-hit-info_test.cc
@@ -34,13 +34,13 @@ constexpr DocumentId kSomeOtherDocumentId = 54;
 TEST(DocHitInfoTest, InitialMaxHitTermFrequencies) {
   DocHitInfo info(kSomeDocumentId);
   for (SectionId i = 0; i <= kMaxSectionId; ++i) {
-    EXPECT_THAT(info.hit_term_frequency(i), Eq(Hit::kDefaultTermFrequency));
+    EXPECT_THAT(info.hit_term_frequency(i), Eq(Hit::kNoTermFrequency));
   }
 }
 
 TEST(DocHitInfoTest, UpdateHitTermFrequenciesForTheFirstTime) {
   DocHitInfo info(kSomeDocumentId);
-  ASSERT_THAT(info.hit_term_frequency(3), Eq(Hit::kDefaultTermFrequency));
+  ASSERT_THAT(info.hit_term_frequency(3), Eq(Hit::kNoTermFrequency));
 
   // Updating a section for the first time, should change its hit
   // term_frequency
diff --git a/icing/index/hit/hit.h b/icing/index/hit/hit.h
index 525a5e5..ee1f64b 100644
--- a/icing/index/hit/hit.h
+++ b/icing/index/hit/hit.h
@@ -58,6 +58,7 @@ class Hit {
   static constexpr TermFrequency kMaxTermFrequency =
       std::numeric_limits<TermFrequency>::max();
   static constexpr TermFrequency kDefaultTermFrequency = 1;
+  static constexpr TermFrequency kNoTermFrequency = 0;
 
   explicit Hit(Value value = kInvalidValue,
                TermFrequency term_frequency = kDefaultTermFrequency)
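The three edits above work together: a DocHitInfo now initializes every per-section frequency to the new Hit::kNoTermFrequency (0), so "this section had no hit" is distinguishable from kDefaultTermFrequency (1), which is a real frequency value. A minimal self-contained sketch of the collapsing idea, with simplified stand-in types (the real class lives in icing/index/hit/doc-hit-info.h):

    #include <array>
    #include <cstdint>

    using SectionId = int8_t;
    using SectionIdMask = uint16_t;
    using TermFrequency = uint8_t;

    constexpr SectionId kMaxSectionId = 15;
    constexpr TermFrequency kNoTermFrequency = 0;       // section had no hit
    constexpr TermFrequency kDefaultTermFrequency = 1;  // a real hit, count unspecified

    class DocHitInfoSketch {
     public:
      DocHitInfoSketch() {
        // Start every section at kNoTermFrequency. With the old default of 1,
        // an untouched section was indistinguishable from one real hit.
        frequencies_.fill(kNoTermFrequency);
      }

      // Collapse one hit into the per-document view: mark the section in the
      // mask and record the hit's term frequency for that section.
      void UpdateSection(SectionId section_id, TermFrequency term_frequency) {
        section_ids_mask_ |= (SectionIdMask{1} << section_id);
        frequencies_[section_id] = term_frequency;
      }

      TermFrequency hit_term_frequency(SectionId section_id) const {
        return frequencies_[section_id];
      }

     private:
      SectionIdMask section_ids_mask_ = 0;
      std::array<TermFrequency, kMaxSectionId + 1> frequencies_{};
    };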
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc
index 892263b..d2f9d41 100644
--- a/icing/index/index-processor.cc
+++ b/icing/index/index-processor.cc
@@ -31,34 +31,30 @@
 #include "icing/schema/section-manager.h"
 #include "icing/schema/section.h"
 #include "icing/store/document-id.h"
-#include "icing/tokenization/language-segmenter.h"
 #include "icing/tokenization/token.h"
 #include "icing/tokenization/tokenizer-factory.h"
 #include "icing/tokenization/tokenizer.h"
 #include "icing/transform/normalizer.h"
 #include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
 
 namespace icing {
 namespace lib {
 
 libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>>
-IndexProcessor::Create(const SchemaStore* schema_store,
-                       const LanguageSegmenter* lang_segmenter,
-                       const Normalizer* normalizer, Index* index,
+IndexProcessor::Create(const Normalizer* normalizer, Index* index,
                        const IndexProcessor::Options& options,
                        const Clock* clock) {
-  ICING_RETURN_ERROR_IF_NULL(schema_store);
-  ICING_RETURN_ERROR_IF_NULL(lang_segmenter);
   ICING_RETURN_ERROR_IF_NULL(normalizer);
   ICING_RETURN_ERROR_IF_NULL(index);
   ICING_RETURN_ERROR_IF_NULL(clock);
 
-  return std::unique_ptr<IndexProcessor>(new IndexProcessor(
-      schema_store, lang_segmenter, normalizer, index, options, clock));
+  return std::unique_ptr<IndexProcessor>(
+      new IndexProcessor(normalizer, index, options, clock));
 }
 
 libtextclassifier3::Status IndexProcessor::IndexDocument(
-    const DocumentProto& document, DocumentId document_id,
+    const TokenizedDocument& tokenized_document, DocumentId document_id,
     NativePutDocumentStats* put_document_stats) {
   std::unique_ptr<Timer> index_timer = clock_.GetNewTimer();
 
@@ -68,54 +64,45 @@ libtextclassifier3::Status IndexProcessor::IndexDocument(
         "DocumentId %d must be greater than last added document_id %d",
         document_id, index_->last_added_document_id()));
   }
-  ICING_ASSIGN_OR_RETURN(std::vector<Section> sections,
-                         schema_store_.ExtractSections(document));
 
   uint32_t num_tokens = 0;
   libtextclassifier3::Status overall_status;
-  for (const Section& section : sections) {
+  for (const TokenizedSection& section : tokenized_document.sections()) {
     // TODO(b/152934343): pass real namespace ids in
     Index::Editor editor =
         index_->Edit(document_id, section.metadata.id,
                      section.metadata.term_match_type, /*namespace_id=*/0);
-    for (std::string_view subcontent : section.content) {
-      ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer,
-                             tokenizer_factory::CreateIndexingTokenizer(
-                                 section.metadata.tokenizer, &lang_segmenter_));
-      ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> itr,
-                             tokenizer->Tokenize(subcontent));
-      while (itr->Advance()) {
-        if (++num_tokens > options_.max_tokens_per_document) {
-          // Index all tokens buffered so far.
-          editor.IndexAllBufferedTerms();
-          if (put_document_stats != nullptr) {
-            put_document_stats->mutable_tokenization_stats()
-                ->set_exceeded_max_token_num(true);
-            put_document_stats->mutable_tokenization_stats()
-                ->set_num_tokens_indexed(options_.max_tokens_per_document);
-          }
-          switch (options_.token_limit_behavior) {
-            case Options::TokenLimitBehavior::kReturnError:
-              return absl_ports::ResourceExhaustedError(
-                  "Max number of tokens reached!");
-            case Options::TokenLimitBehavior::kSuppressError:
-              return overall_status;
-          }
+    for (std::string_view token : section.token_sequence) {
+      if (++num_tokens > options_.max_tokens_per_document) {
+        // Index all tokens buffered so far.
+        editor.IndexAllBufferedTerms();
+        if (put_document_stats != nullptr) {
+          put_document_stats->mutable_tokenization_stats()
+              ->set_exceeded_max_token_num(true);
+          put_document_stats->mutable_tokenization_stats()
+              ->set_num_tokens_indexed(options_.max_tokens_per_document);
        }
-        std::string term = normalizer_.NormalizeTerm(itr->GetToken().text);
-        // Add this term to Hit buffer. Even if adding this hit fails, we keep
-        // trying to add more hits because it's possible that future hits could
-        // still be added successfully. For instance if the lexicon is full, we
-        // might fail to add a hit for a new term, but should still be able to
-        // add hits for terms that are already in the index.
-        auto status = editor.BufferTerm(term.c_str());
-        if (overall_status.ok() && !status.ok()) {
-          // If we've succeeded to add everything so far, set overall_status to
-          // represent this new failure. If we've already failed, no need to
-          // update the status - we're already going to return a resource
-          // exhausted error.
-          overall_status = status;
+        switch (options_.token_limit_behavior) {
+          case Options::TokenLimitBehavior::kReturnError:
+            return absl_ports::ResourceExhaustedError(
+                "Max number of tokens reached!");
+          case Options::TokenLimitBehavior::kSuppressError:
+            return overall_status;
        }
      }
+      std::string term = normalizer_.NormalizeTerm(token);
+      // Add this term to Hit buffer. Even if adding this hit fails, we keep
+      // trying to add more hits because it's possible that future hits could
+      // still be added successfully. For instance if the lexicon is full, we
+      // might fail to add a hit for a new term, but should still be able to
+      // add hits for terms that are already in the index.
+      auto status = editor.BufferTerm(term.c_str());
+      if (overall_status.ok() && !status.ok()) {
+        // If we've succeeded to add everything so far, set overall_status to
+        // represent this new failure. If we've already failed, no need to
+        // update the status - we're already going to return a resource
+        // exhausted error.
+        overall_status = status;
+      }
    }
 
     // Add all the seen terms to the index with their term frequency.
     auto status = editor.IndexAllBufferedTerms();
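The rewritten IndexDocument above no longer tokenizes; it walks pre-computed token sequences and only enforces the per-document token budget. A rough, self-contained sketch of that control flow, with stand-in types (the real loop flushes through editor.IndexAllBufferedTerms() and maps the limit to RESOURCE_EXHAUSTED):

    #include <string>
    #include <vector>

    enum class TokenLimitBehavior { kReturnError, kSuppressError };

    struct Options {
      int max_tokens_per_document = 0;
      TokenLimitBehavior token_limit_behavior = TokenLimitBehavior::kReturnError;
    };

    // Returns false only when the limit is hit under kReturnError.
    // buffered_terms stands in for the index editor's hit buffer.
    bool IndexSection(const std::vector<std::string>& token_sequence,
                      const Options& options, int* num_tokens,
                      std::vector<std::string>* buffered_terms) {
      for (const std::string& token : token_sequence) {
        if (++(*num_tokens) > options.max_tokens_per_document) {
          // Everything buffered so far still gets indexed before bailing out
          // (the real code calls editor.IndexAllBufferedTerms() here).
          switch (options.token_limit_behavior) {
            case TokenLimitBehavior::kReturnError:
              return false;  // RESOURCE_EXHAUSTED in the real code
            case TokenLimitBehavior::kSuppressError:
              return true;   // silently drop the remaining tokens
          }
        }
        buffered_terms->push_back(token);  // stands in for editor.BufferTerm()
      }
      return true;
    }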
diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h
index 2eb4ad8..9fc7c46 100644
--- a/icing/index/index-processor.h
+++ b/icing/index/index-processor.h
@@ -21,12 +21,11 @@
 #include "icing/text_classifier/lib3/utils/base/status.h"
 #include "icing/index/index.h"
 #include "icing/proto/document.pb.h"
-#include "icing/schema/schema-store.h"
 #include "icing/schema/section-manager.h"
 #include "icing/store/document-id.h"
-#include "icing/tokenization/language-segmenter.h"
 #include "icing/tokenization/token.h"
 #include "icing/transform/normalizer.h"
+#include "icing/util/tokenized-document.h"
 
 namespace icing {
 namespace lib {
@@ -58,14 +57,13 @@ class IndexProcessor {
   //   An IndexProcessor on success
   //   FAILED_PRECONDITION if any of the pointers is null.
   static libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>> Create(
-      const SchemaStore* schema_store, const LanguageSegmenter* lang_segmenter,
       const Normalizer* normalizer, Index* index, const Options& options,
       const Clock* clock);
 
-  // Add document to the index, associated with document_id. If the number of
-  // tokens in the document exceeds max_tokens_per_document, then only the first
-  // max_tokens_per_document will be added to the index. All tokens of length
-  // exceeding max_token_length will be shortened to max_token_length.
+  // Add tokenized document to the index, associated with document_id. If the
+  // number of tokens in the document exceeds max_tokens_per_document, then only
+  // the first max_tokens_per_document will be added to the index. All tokens of
+  // length exceeding max_token_length will be shortened to max_token_length.
   //
   // Indexing a document *may* trigger an index merge. If a merge fails, then
   // all content in the index will be lost.
@@ -82,25 +80,19 @@ class IndexProcessor {
   //   NOT_FOUND if there is no definition for the document's schema type.
   //   INTERNAL_ERROR if any other errors occur
   libtextclassifier3::Status IndexDocument(
-      const DocumentProto& document, DocumentId document_id,
+      const TokenizedDocument& tokenized_document, DocumentId document_id,
       NativePutDocumentStats* put_document_stats = nullptr);
 
  private:
-  IndexProcessor(const SchemaStore* schema_store,
-                 const LanguageSegmenter* lang_segmenter,
-                 const Normalizer* normalizer, Index* index,
+  IndexProcessor(const Normalizer* normalizer, Index* index,
                  const Options& options, const Clock* clock)
-      : schema_store_(*schema_store),
-        lang_segmenter_(*lang_segmenter),
-        normalizer_(*normalizer),
+      : normalizer_(*normalizer),
        index_(index),
        options_(options),
        clock_(*clock) {}
 
   std::string NormalizeToken(const Token& token);
 
-  const SchemaStore& schema_store_;
-  const LanguageSegmenter& lang_segmenter_;
   const Normalizer& normalizer_;
   Index* const index_;
   const Options options_;
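Taken together, the .cc and .h changes split indexing into two steps: tokenize once, then index. A caller-side sketch assembled only from the signatures visible in this diff (the concrete setup of schema_store, lang_segmenter, normalizer, index, options and clock is omitted; error propagation uses the codebase's ICING_* macros):

    // Step 1: extract and tokenize sections. This is where SchemaStore and
    // LanguageSegmenter are needed now.
    ICING_ASSIGN_OR_RETURN(
        TokenizedDocument tokenized_document,
        TokenizedDocument::Create(schema_store, lang_segmenter, document));

    // Step 2: the IndexProcessor only normalizes terms and writes hits.
    ICING_ASSIGN_OR_RETURN(
        std::unique_ptr<IndexProcessor> index_processor,
        IndexProcessor::Create(normalizer, index, options, clock));
    ICING_RETURN_IF_ERROR(
        index_processor->IndexDocument(tokenized_document, document_id));

One payoff of the split: the same TokenizedDocument can be indexed repeatedly without re-running segmentation, which is exactly what the benchmarks below exploit.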
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 96a390b..afeac4d 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -31,6 +31,7 @@
 #include "icing/transform/normalizer-factory.h"
 #include "icing/transform/normalizer.h"
 #include "icing/util/logging.h"
+#include "icing/util/tokenized-document.h"
 #include "unicode/uloc.h"
 
 // Run on a Linux workstation:
@@ -168,16 +169,13 @@ void CleanUp(const Filesystem& filesystem, const std::string& index_dir) {
 }
 
 std::unique_ptr<IndexProcessor> CreateIndexProcessor(
-    const SchemaStore* schema_store,
-    const LanguageSegmenter* language_segmenter, const Normalizer* normalizer,
-    Index* index, const Clock* clock) {
+    const Normalizer* normalizer, Index* index, const Clock* clock) {
   IndexProcessor::Options processor_options{};
   processor_options.max_tokens_per_document = 1024 * 1024 * 10;
   processor_options.token_limit_behavior =
       IndexProcessor::Options::TokenLimitBehavior::kReturnError;
 
-  return IndexProcessor::Create(schema_store, language_segmenter, normalizer,
-                                index, processor_options, clock)
+  return IndexProcessor::Create(normalizer, index, processor_options, clock)
       .ValueOrDie();
 }
 
@@ -203,15 +201,18 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) {
   Clock clock;
   std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
   std::unique_ptr<IndexProcessor> index_processor =
-      CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
-                           normalizer.get(), index.get(), &clock);
+      CreateIndexProcessor(normalizer.get(), index.get(), &clock);
 
   DocumentProto input_document = CreateDocumentWithOneProperty(state.range(0));
+  TokenizedDocument tokenized_document(std::move(
+      TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+                                input_document)
+          .ValueOrDie()));
 
   DocumentId document_id = 0;
   for (auto _ : state) {
     ICING_ASSERT_OK(
-        index_processor->IndexDocument(input_document, document_id++));
+        index_processor->IndexDocument(tokenized_document, document_id++));
   }
 
   CleanUp(filesystem, index_dir);
@@ -254,16 +255,19 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) {
   Clock clock;
   std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
   std::unique_ptr<IndexProcessor> index_processor =
-      CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
-                           normalizer.get(), index.get(), &clock);
+      CreateIndexProcessor(normalizer.get(), index.get(), &clock);
 
   DocumentProto input_document =
       CreateDocumentWithTenProperties(state.range(0));
+  TokenizedDocument tokenized_document(std::move(
+      TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+                                input_document)
+          .ValueOrDie()));
 
   DocumentId document_id = 0;
   for (auto _ : state) {
     ICING_ASSERT_OK(
-        index_processor->IndexDocument(input_document, document_id++));
+        index_processor->IndexDocument(tokenized_document, document_id++));
   }
 
   CleanUp(filesystem, index_dir);
@@ -306,16 +310,19 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) {
   Clock clock;
   std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
   std::unique_ptr<IndexProcessor> index_processor =
-      CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
-                           normalizer.get(), index.get(), &clock);
+      CreateIndexProcessor(normalizer.get(), index.get(), &clock);
 
   DocumentProto input_document =
       CreateDocumentWithDiacriticLetters(state.range(0));
+  TokenizedDocument tokenized_document(std::move(
+      TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+                                input_document)
+          .ValueOrDie()));
 
   DocumentId document_id = 0;
   for (auto _ : state) {
     ICING_ASSERT_OK(
-        index_processor->IndexDocument(input_document, document_id++));
+        index_processor->IndexDocument(tokenized_document, document_id++));
   }
 
   CleanUp(filesystem, index_dir);
@@ -358,15 +365,18 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) {
   Clock clock;
   std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
   std::unique_ptr<IndexProcessor> index_processor =
-      CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
-                           normalizer.get(), index.get(), &clock);
+      CreateIndexProcessor(normalizer.get(), index.get(), &clock);
 
   DocumentProto input_document = CreateDocumentWithHiragana(state.range(0));
+  TokenizedDocument tokenized_document(std::move(
+      TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+                                input_document)
+          .ValueOrDie()));
 
   DocumentId document_id = 0;
   for (auto _ : state) {
     ICING_ASSERT_OK(
-        index_processor->IndexDocument(input_document, document_id++));
+        index_processor->IndexDocument(tokenized_document, document_id++));
   }
 
   CleanUp(filesystem, index_dir);
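Note the recurring benchmark edit: TokenizedDocument::Create() moves out of the timed loop, so the measurement now covers only IndexDocument. A small runnable sketch of that general pattern with the Google Benchmark API (ExpensiveSetup and MeasuredOperation are hypothetical placeholders standing in for tokenization and indexing):

    #include <string>
    #include <benchmark/benchmark.h>

    static std::string ExpensiveSetup() { return std::string(1 << 20, 'x'); }
    static void MeasuredOperation(const std::string& s) {
      benchmark::DoNotOptimize(s.size());
    }

    static void BM_MeasuredOperation(benchmark::State& state) {
      // One-time setup outside the loop is excluded from the measurement,
      // just as the benchmarks above now tokenize before the timed loop.
      std::string input = ExpensiveSetup();
      for (auto _ : state) {
        MeasuredOperation(input);
      }
    }
    BENCHMARK(BM_MeasuredOperation);
    BENCHMARK_MAIN();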
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index bdd9575..e6bb615 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -53,6 +53,7 @@
 #include "icing/tokenization/language-segmenter.h"
 #include "icing/transform/normalizer-factory.h"
 #include "icing/transform/normalizer.h"
+#include "icing/util/tokenized-document.h"
 #include "unicode/uloc.h"
 
 namespace icing {
@@ -140,8 +141,7 @@ class IndexProcessorTest : public Test {
     ICING_ASSERT_OK_AND_ASSIGN(
         index_processor_,
-        IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                               normalizer_.get(), index_.get(),
+        IndexProcessor::Create(normalizer_.get(), index_.get(),
                                processor_options, &fake_clock_));
     mock_icing_filesystem_ = std::make_unique<IcingMockFilesystem>();
   }
@@ -195,7 +195,7 @@ class IndexProcessorTest : public Test {
     type_config->set_schema_type(std::string(kFakeType));
 
     AddStringProperty(std::string(kExactProperty), DataType::STRING,
-                      Cardinality::REQUIRED, TermMatchType::EXACT_ONLY,
+                      Cardinality::OPTIONAL, TermMatchType::EXACT_ONLY,
                      type_config);
 
     AddStringProperty(std::string(kPrefixedProperty), DataType::STRING,
@@ -244,25 +244,11 @@ TEST_F(IndexProcessorTest, CreationWithNullPointerShouldFail) {
   processor_options.token_limit_behavior =
       IndexProcessor::Options::TokenLimitBehavior::kReturnError;
 
-  EXPECT_THAT(
-      IndexProcessor::Create(/*schema_store=*/nullptr, lang_segmenter_.get(),
-                             normalizer_.get(), index_.get(), processor_options,
-                             &fake_clock_),
-      StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-
-  EXPECT_THAT(
-      IndexProcessor::Create(schema_store_.get(), /*lang_segmenter=*/nullptr,
-                             normalizer_.get(), index_.get(), processor_options,
-                             &fake_clock_),
-      StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-
-  EXPECT_THAT(IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                                     /*normalizer=*/nullptr, index_.get(),
+  EXPECT_THAT(IndexProcessor::Create(/*normalizer=*/nullptr, index_.get(),
                                      processor_options, &fake_clock_),
               StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
 
-  EXPECT_THAT(IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                                     normalizer_.get(), /*index=*/nullptr,
+  EXPECT_THAT(IndexProcessor::Create(normalizer_.get(), /*index=*/nullptr,
                                      processor_options, &fake_clock_),
               StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
 }
@@ -276,7 +262,12 @@ TEST_F(IndexProcessorTest, NoTermMatchTypeContent) {
           .AddBytesProperty(std::string(kUnindexedProperty2),
                             "attachment bytes")
           .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
 }
 
@@ -287,7 +278,12 @@ TEST_F(IndexProcessorTest, OneDoc) {
           .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "hello world")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
@@ -313,7 +309,12 @@ TEST_F(IndexProcessorTest, MultipleDocs) {
          .AddStringProperty(std::string(kExactProperty), "hello world")
          .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
   std::string coffeeRepeatedString = "coffee";
@@ -329,7 +330,12 @@ TEST_F(IndexProcessorTest, MultipleDocs) {
world world wide") .Build(); - EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk()); + ICING_ASSERT_OK_AND_ASSIGN( + tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1), + IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, @@ -372,11 +378,18 @@ TEST_F(IndexProcessorTest, DocWithNestedProperty) { .AddDocumentProperty( std::string(kSubProperty), DocumentBuilder() + .SetKey("icing", "nested_type/1") + .SetSchema(std::string(kNestedType)) .AddStringProperty(std::string(kNestedProperty), "rocky raccoon") .Build()) .Build(); - EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk()); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, @@ -396,7 +409,12 @@ TEST_F(IndexProcessorTest, DocWithRepeatedProperty) { .AddStringProperty(std::string(kRepeatedProperty), "rocky", "italian stallion") .Build(); - EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk()); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, @@ -417,8 +435,7 @@ TEST_F(IndexProcessorTest, TooManyTokensReturnError) { ICING_ASSERT_OK_AND_ASSIGN( index_processor_, - IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(), - normalizer_.get(), index_.get(), options, + IndexProcessor::Create(normalizer_.get(), index_.get(), options, &fake_clock_)); DocumentProto document = @@ -428,7 +445,11 @@ TEST_F(IndexProcessorTest, TooManyTokensReturnError) { .AddStringProperty(std::string(kExactProperty), "hello world") .AddStringProperty(std::string(kPrefixedProperty), "good night moon!") .Build(); - EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); @@ -457,8 +478,7 @@ TEST_F(IndexProcessorTest, TooManyTokensSuppressError) { ICING_ASSERT_OK_AND_ASSIGN( index_processor_, - IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(), - normalizer_.get(), index_.get(), options, + IndexProcessor::Create(normalizer_.get(), index_.get(), options, &fake_clock_)); DocumentProto document = @@ -468,7 +488,12 @@ TEST_F(IndexProcessorTest, TooManyTokensSuppressError) { .AddStringProperty(std::string(kExactProperty), "hello world") .AddStringProperty(std::string(kPrefixedProperty), "good night moon!") .Build(); - EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk()); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument 
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
   // "night" should have been indexed.
@@ -498,8 +523,7 @@ TEST_F(IndexProcessorTest, TooLongTokens) {
   ICING_ASSERT_OK_AND_ASSIGN(
       index_processor_,
-      IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                             normalizer.get(), index_.get(), options,
+      IndexProcessor::Create(normalizer.get(), index_.get(), options,
                              &fake_clock_));
 
   DocumentProto document =
@@ -509,7 +533,12 @@ TEST_F(IndexProcessorTest, TooLongTokens) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "hello world")
          .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
   // "good" should have been indexed normally.
@@ -542,7 +571,12 @@ TEST_F(IndexProcessorTest, NonPrefixedContentPrefixQuery) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "best rocky movies")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
   document =
@@ -551,7 +585,12 @@ TEST_F(IndexProcessorTest, NonPrefixedContentPrefixQuery) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPrefixedProperty), "rocky raccoon")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
 
   // Only document_id 1 should surface in a prefix query for "Rock"
@@ -570,7 +609,12 @@ TEST_F(IndexProcessorTest, TokenNormalization) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
   document =
@@ -579,7 +623,12 @@ TEST_F(IndexProcessorTest, TokenNormalization) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "all lower case")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
 
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
@@ -600,7 +649,12 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
 
   // Indexing a document with document_id < last_added_document_id should cause
@@ -611,11 +665,15 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "all lower case")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+  ICING_ASSERT_OK_AND_ASSIGN(
+      tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
 
   // As should indexing a document with document_id == last_added_document_id.
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
 
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
@@ -635,8 +693,7 @@ TEST_F(IndexProcessorTest, NonAsciiIndexing) {
   ICING_ASSERT_OK_AND_ASSIGN(
       index_processor_,
-      IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                             normalizer_.get(), index_.get(),
+      IndexProcessor::Create(normalizer_.get(), index_.get(),
                              processor_options, &fake_clock_));
 
   DocumentProto document =
@@ -646,7 +703,12 @@ TEST_F(IndexProcessorTest, NonAsciiIndexing) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty),
                             "你好,世界!你好:世界。“你好”世界?")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
@@ -666,8 +728,7 @@ TEST_F(IndexProcessorTest,
   ICING_ASSERT_OK_AND_ASSIGN(
       index_processor_,
-      IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                             normalizer_.get(), index_.get(), processor_options,
+      IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options,
                              &fake_clock_));
 
   // This is the maximum token length that an empty lexicon constructed for a
@@ -684,7 +745,11 @@
                            absl_ports::StrCat(enormous_string, " foo"))
          .AddStringProperty(std::string(kPrefixedProperty), "bar baz")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
              StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
 
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
@@ -715,6 +780,10 @@ TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), kIpsumText)
          .Build();
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
   Index::Options options(index_dir_,
                          /*index_merge_size=*/document.ByteSizeLong() * 100);
   ICING_ASSERT_OK_AND_ASSIGN(
@@ -727,8 +796,7 @@ TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) {
   ICING_ASSERT_OK_AND_ASSIGN(
       index_processor_,
-      IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                             normalizer_.get(), index_.get(), processor_options,
+      IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options,
                              &fake_clock_));
   DocumentId doc_id = 0;
   // Have determined experimentally that indexing 3373 documents with this text
   // will cause the LiteIndex to fill up. Further indexing will fail unless the
   // index processor properly merges the LiteIndex into the MainIndex and
   // empties the LiteIndex.
   constexpr int kNumDocsLiteIndexExhaustion = 3373;
   for (; doc_id < kNumDocsLiteIndexExhaustion; ++doc_id) {
-    EXPECT_THAT(index_processor_->IndexDocument(document, doc_id), IsOk());
+    EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+                IsOk());
     EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
   }
-  EXPECT_THAT(index_processor_->IndexDocument(document, doc_id), IsOk());
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
 }
 
@@ -768,6 +838,10 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPrefixedProperty), kIpsumText)
          .Build();
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
 
   // 2. Recreate the index with the mock filesystem and a merge size that will
   // only allow one document to be added before requiring a merge.
@@ -784,25 +858,26 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) {
 
   ICING_ASSERT_OK_AND_ASSIGN(
       index_processor_,
-      IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                             normalizer_.get(), index_.get(), processor_options,
+      IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options,
                              &fake_clock_));
 
   // 3. Index one document. This should fit in the LiteIndex without requiring a
   // merge.
   DocumentId doc_id = 0;
-  EXPECT_THAT(index_processor_->IndexDocument(document, doc_id), IsOk());
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
 
   // 4. Add one more document to trigger a merge, which should fail and result
   // in a Reset.
   ++doc_id;
-  EXPECT_THAT(index_processor_->IndexDocument(document, doc_id),
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
              StatusIs(libtextclassifier3::StatusCode::DATA_LOSS));
   EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
 
   // 5. Indexing a new document should succeed.
-  EXPECT_THAT(index_processor_->IndexDocument(document, doc_id), IsOk());
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
 }
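The two merge tests above pin down the recovery contract: a failed merge surfaces as DATA_LOSS, the index is reset (last_added_document_id() reverts to kInvalidDocumentId), and indexing can continue afterwards. A hedged caller-side sketch of reacting to that contract, assuming libtextclassifier3::Status exposes CanonicalCode() as it does elsewhere in this codebase (re-indexing strategy not shown):

    libtextclassifier3::Status status =
        index_processor->IndexDocument(tokenized_document, document_id);
    if (status.CanonicalCode() == libtextclassifier3::StatusCode::DATA_LOSS) {
      // The merge failed and the index was reset: previously indexed content
      // is gone and would have to be replayed from the document store.
    }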
diff --git a/icing/index/iterator/doc-hit-info-iterator-and.h b/icing/index/iterator/doc-hit-info-iterator-and.h
index 4618fb9..faca785 100644
--- a/icing/index/iterator/doc-hit-info-iterator-and.h
+++ b/icing/index/iterator/doc-hit-info-iterator-and.h
@@ -46,6 +46,16 @@ class DocHitInfoIteratorAnd : public DocHitInfoIterator {
 
   std::string ToString() const override;
 
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo> *matched_terms_stats) const override {
+    if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+      // Current hit isn't valid, return.
+      return;
+    }
+    short_->PopulateMatchedTermsStats(matched_terms_stats);
+    long_->PopulateMatchedTermsStats(matched_terms_stats);
+  }
+
  private:
   std::unique_ptr<DocHitInfoIterator> short_;
   std::unique_ptr<DocHitInfoIterator> long_;
@@ -67,6 +77,17 @@ class DocHitInfoIteratorAndNary : public DocHitInfoIterator {
 
   std::string ToString() const override;
 
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo> *matched_terms_stats) const override {
+    if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+      // Current hit isn't valid, return.
+      return;
+    }
+    for (size_t i = 0; i < iterators_.size(); ++i) {
+      iterators_.at(i)->PopulateMatchedTermsStats(matched_terms_stats);
+    }
+  }
+
  private:
   std::vector<std::unique_ptr<DocHitInfoIterator>> iterators_;
 };
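The new PopulateMatchedTermsStats overrides share one pattern: with no valid current hit the output is left untouched, otherwise every child appends its term's stats (under AND semantics, all children matched the current document by construction). A simplified self-contained sketch of that composite pattern, with stand-in types rather than the real iterator interface:

    #include <memory>
    #include <string>
    #include <vector>

    struct TermMatchInfo {
      std::string term;
      // section mask and per-section frequencies omitted for brevity
    };

    class IteratorSketch {
     public:
      virtual ~IteratorSketch() = default;
      virtual void PopulateMatchedTermsStats(
          std::vector<TermMatchInfo>* stats) const = 0;
    };

    class AndIteratorSketch : public IteratorSketch {
     public:
      void PopulateMatchedTermsStats(
          std::vector<TermMatchInfo>* stats) const override {
        if (!has_valid_hit_) return;  // nothing matched yet: leave stats empty
        for (const auto& child : children_) {
          // AND: every child matched the current document, so every child
          // contributes its term's stats.
          child->PopulateMatchedTermsStats(stats);
        }
      }

     private:
      bool has_valid_hit_ = false;
      std::vector<std::unique_ptr<IteratorSketch>> children_;
    };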
diff --git a/icing/index/iterator/doc-hit-info-iterator-and_test.cc b/icing/index/iterator/doc-hit-info-iterator-and_test.cc
index 35574b7..783e937 100644
--- a/icing/index/iterator/doc-hit-info-iterator-and_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-and_test.cc
@@ -32,8 +32,10 @@ namespace lib {
 namespace {
 
 using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
 using ::testing::Eq;
 using ::testing::IsEmpty;
+using ::testing::SizeIs;
 
 TEST(CreateAndIteratorTest, And) {
   // Basic test that we can create a working And iterator. Further testing of
@@ -196,6 +198,125 @@ TEST(DocHitInfoIteratorAndTest, SectionIdMask) {
   EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
 }
 
+TEST(DocHitInfoIteratorAndTest, PopulateMatchedTermsStats) {
+  {
+    // Arbitrary section ids for the documents in the DocHitInfoIterators.
+    // Created to test correct section_id_mask behavior.
+    SectionIdMask section_id_mask1 = 0b01010101;  // hits in sections 0, 2, 4, 6
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+        1, 0, 2, 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0};
+    SectionIdMask section_id_mask2 = 0b00000110;  // hits in sections 1, 2
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2{
+        0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+    DocHitInfo doc_hit_info1 = DocHitInfo(4);
+    doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+    doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+    doc_hit_info1.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/3);
+    doc_hit_info1.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+
+    DocHitInfo doc_hit_info2 = DocHitInfo(4);
+    doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+    doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+    std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+    std::vector<DocHitInfo> second_vector = {doc_hit_info2};
+
+    auto first_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+    first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+    auto second_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+    second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+
+    DocHitInfoIteratorAnd and_iter(std::move(first_iter),
+                                   std::move(second_iter));
+    std::vector<TermMatchInfo> matched_terms_stats;
+    and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+    ICING_EXPECT_OK(and_iter.Advance());
+    EXPECT_THAT(and_iter.doc_hit_info().document_id(), Eq(4));
+
+    and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    ASSERT_THAT(matched_terms_stats, SizeIs(2));  // 2 terms
+    EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+    EXPECT_EQ(matched_terms_stats.at(1).term, "hello");
+    EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+                ElementsAreArray(term_frequencies1));
+    EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1);
+    EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+                ElementsAreArray(term_frequencies2));
+    EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask2);
+
+    EXPECT_FALSE(and_iter.Advance().ok());
+  }
+  {
+    // Arbitrary section ids for the documents in the DocHitInfoIterators.
+    // Created to test correct section_id_mask behavior.
+    SectionIdMask section_id_mask1 = 0b00000101;  // hits in sections 0, 2
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+        1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+    DocHitInfo doc_hit_info1 = DocHitInfo(4);
+    doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+    doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+
+    std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+    std::vector<DocHitInfo> second_vector = {doc_hit_info1};
+
+    auto first_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+    first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+    auto second_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hi");
+    second_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+    DocHitInfoIteratorAnd and_iter(std::move(first_iter),
+                                   std::move(second_iter));
+    std::vector<TermMatchInfo> matched_terms_stats;
+    and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+    ICING_EXPECT_OK(and_iter.Advance());
+    EXPECT_THAT(and_iter.doc_hit_info().document_id(), Eq(4));
+
+    and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    ASSERT_THAT(matched_terms_stats, SizeIs(1));  // 1 term
+    EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+    EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+                ElementsAreArray(term_frequencies1));
+    EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1);
+
+    EXPECT_FALSE(and_iter.Advance().ok());
+  }
+}
+
+TEST(DocHitInfoIteratorAndTest, PopulateMatchedTermsStats_NoMatchingDocument) {
+  DocHitInfo doc_hit_info1 = DocHitInfo(4);
+  doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+
+  DocHitInfo doc_hit_info2 = DocHitInfo(5);
+  doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+  doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+  std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+  std::vector<DocHitInfo> second_vector = {doc_hit_info2};
+
+  auto first_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+  auto second_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+
+  DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter));
+  std::vector<TermMatchInfo> matched_terms_stats;
+  and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+  EXPECT_THAT(matched_terms_stats, IsEmpty());
+  EXPECT_FALSE(and_iter.Advance().ok());
+}
+
 TEST(DocHitInfoIteratorAndNaryTest, Initialize) {
   std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
   iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>());
@@ -345,6 +466,90 @@ TEST(DocHitInfoIteratorAndNaryTest, SectionIdMask) {
   EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
 }
 
+TEST(DocHitInfoIteratorAndNaryTest, PopulateMatchedTermsStats) {
+  // Arbitrary section ids/term frequencies for the documents in the
+  // DocHitInfoIterators.
+ // For term "hi", document 10 and 8 + SectionIdMask section_id_mask1_hi = 0b01000101; // hits in sections 0, 2, 6 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hi{ + 1, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info1_hi = DocHitInfo(10); + doc_hit_info1_hi.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1); + doc_hit_info1_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2); + doc_hit_info1_hi.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4); + + DocHitInfo doc_hit_info2_hi = DocHitInfo(8); + doc_hit_info2_hi.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2); + doc_hit_info2_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6); + + // For term "hello", document 10 and 9 + SectionIdMask section_id_mask1_hello = 0b00001001; // hits in sections 0, 3 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hello{ + 2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info1_hello = DocHitInfo(10); + doc_hit_info1_hello.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2); + doc_hit_info1_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3); + + DocHitInfo doc_hit_info2_hello = DocHitInfo(9); + doc_hit_info2_hello.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/3); + doc_hit_info2_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/2); + + // For term "ciao", document 10 and 9 + SectionIdMask section_id_mask1_ciao = 0b00000011; // hits in sections 0, 1 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_ciao{ + 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info1_ciao = DocHitInfo(10); + doc_hit_info1_ciao.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2); + doc_hit_info1_ciao.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/3); + + DocHitInfo doc_hit_info2_ciao = DocHitInfo(9); + doc_hit_info2_ciao.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3); + doc_hit_info2_ciao.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/2); + + std::vector<DocHitInfo> first_vector = {doc_hit_info1_hi, doc_hit_info2_hi}; + std::vector<DocHitInfo> second_vector = {doc_hit_info1_hello, + doc_hit_info2_hello}; + std::vector<DocHitInfo> third_vector = {doc_hit_info1_ciao, + doc_hit_info2_ciao}; + + auto first_iter = + std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi"); + auto second_iter = + std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello"); + auto third_iter = + std::make_unique<DocHitInfoIteratorDummy>(third_vector, "ciao"); + + std::vector<std::unique_ptr<DocHitInfoIterator>> iterators; + iterators.push_back(std::move(first_iter)); + iterators.push_back(std::move(second_iter)); + iterators.push_back(std::move(third_iter)); + + DocHitInfoIteratorAndNary and_iter(std::move(iterators)); + std::vector<TermMatchInfo> matched_terms_stats; + and_iter.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); + + ICING_EXPECT_OK(and_iter.Advance()); + EXPECT_THAT(and_iter.doc_hit_info().document_id(), Eq(10)); + + and_iter.PopulateMatchedTermsStats(&matched_terms_stats); + ASSERT_THAT(matched_terms_stats, SizeIs(3)); // 3 terms + EXPECT_EQ(matched_terms_stats.at(0).term, "hi"); + EXPECT_THAT(matched_terms_stats.at(0).term_frequencies, + ElementsAreArray(term_frequencies1_hi)); + EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1_hi); + EXPECT_EQ(matched_terms_stats.at(1).term, "hello"); + EXPECT_THAT(matched_terms_stats.at(1).term_frequencies, 
+              ElementsAreArray(term_frequencies1_hello));
+  EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask1_hello);
+  EXPECT_EQ(matched_terms_stats.at(2).term, "ciao");
+  EXPECT_THAT(matched_terms_stats.at(2).term_frequencies,
+              ElementsAreArray(term_frequencies1_ciao));
+  EXPECT_EQ(matched_terms_stats.at(2).section_ids_mask, section_id_mask1_ciao);
+
+  EXPECT_FALSE(and_iter.Advance().ok());
+}
+
 }  // namespace
 
 }  // namespace lib
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.h b/icing/index/iterator/doc-hit-info-iterator-filter.h
index 9119610..fb60e38 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter.h
+++ b/icing/index/iterator/doc-hit-info-iterator-filter.h
@@ -67,6 +67,11 @@ class DocHitInfoIteratorFilter : public DocHitInfoIterator {
 
   std::string ToString() const override;
 
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo>* matched_terms_stats) const override {
+    delegate_->PopulateMatchedTermsStats(matched_terms_stats);
+  }
+
  private:
   std::unique_ptr<DocHitInfoIterator> delegate_;
   const DocumentStore& document_store_;
diff --git a/icing/index/iterator/doc-hit-info-iterator-or.cc b/icing/index/iterator/doc-hit-info-iterator-or.cc
index 8f00f88..b4234e0 100644
--- a/icing/index/iterator/doc-hit-info-iterator-or.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-or.cc
@@ -108,6 +108,7 @@ libtextclassifier3::Status DocHitInfoIteratorOr::Advance() {
   } else {
     chosen = left_.get();
   }
+  current_ = chosen;
 
   doc_hit_info_ = chosen->doc_hit_info();
   hit_intersect_section_ids_mask_ = chosen->hit_intersect_section_ids_mask();
@@ -139,6 +140,7 @@ DocHitInfoIteratorOrNary::DocHitInfoIteratorOrNary(
     : iterators_(std::move(iterators)) {}
 
 libtextclassifier3::Status DocHitInfoIteratorOrNary::Advance() {
+  current_iterators_.clear();
   if (iterators_.size() < 2) {
     return absl_ports::InvalidArgumentError(
         "Not enough iterators to OR together");
@@ -187,6 +189,7 @@ libtextclassifier3::Status DocHitInfoIteratorOrNary::Advance() {
   hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
   for (const auto& iterator : iterators_) {
     if (iterator->doc_hit_info().document_id() == next_document_id) {
+      current_iterators_.push_back(iterator.get());
       if (doc_hit_info_.document_id() == kInvalidDocumentId) {
         doc_hit_info_ = iterator->doc_hit_info();
         hit_intersect_section_ids_mask_ =
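The Advance() edits above exist because OR differs from AND: only the child iterators currently positioned on the chosen document may report term stats, so each Advance() records that set (current_ in the binary case, current_iterators_ in the n-ary case). A self-contained sketch of the n-ary bookkeeping, with stand-in types:

    #include <vector>

    struct ChildSketch {
      int document_id;
    };

    // Non-owning pointers to this round's matching children, mirroring
    // current_iterators_ in DocHitInfoIteratorOrNary::Advance().
    std::vector<const ChildSketch*> CollectCurrent(
        const std::vector<ChildSketch>& children, int next_document_id) {
      std::vector<const ChildSketch*> current;
      for (const ChildSketch& child : children) {
        if (child.document_id == next_document_id) {
          current.push_back(&child);  // this child matched; it may report stats
        }
      }
      return current;
    }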
diff --git a/icing/index/iterator/doc-hit-info-iterator-or.h b/icing/index/iterator/doc-hit-info-iterator-or.h
index 4128e0f..2f49430 100644
--- a/icing/index/iterator/doc-hit-info-iterator-or.h
+++ b/icing/index/iterator/doc-hit-info-iterator-or.h
@@ -42,9 +42,26 @@ class DocHitInfoIteratorOr : public DocHitInfoIterator {
 
   std::string ToString() const override;
 
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo> *matched_terms_stats) const override {
+    if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+      // Current hit isn't valid, return.
+      return;
+    }
+    current_->PopulateMatchedTermsStats(matched_terms_stats);
+    // If equal, then current_ == left_. Combine with results from right_.
+    if (left_document_id_ == right_document_id_) {
+      right_->PopulateMatchedTermsStats(matched_terms_stats);
+    }
+  }
+
  private:
   std::unique_ptr<DocHitInfoIterator> left_;
   std::unique_ptr<DocHitInfoIterator> right_;
+  // Pointer to the chosen iterator that points to the current doc_hit_info_. If
+  // both left_ and right_ point to the same docid, then current_ == left_.
+  // current_ does not own the iterator it points to.
+  DocHitInfoIterator *current_;
   DocumentId left_document_id_ = kMaxDocumentId;
   DocumentId right_document_id_ = kMaxDocumentId;
 };
@@ -65,8 +82,22 @@ class DocHitInfoIteratorOrNary : public DocHitInfoIterator {
 
   std::string ToString() const override;
 
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo> *matched_terms_stats) const override {
+    if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+      // Current hit isn't valid, return.
+      return;
+    }
+    for (size_t i = 0; i < current_iterators_.size(); i++) {
+      current_iterators_.at(i)->PopulateMatchedTermsStats(matched_terms_stats);
+    }
+  }
+
  private:
   std::vector<std::unique_ptr<DocHitInfoIterator>> iterators_;
+  // Pointers to the iterators that point to the current doc_hit_info_.
+  // current_iterators_ does not own the iterators it points to.
+  std::vector<DocHitInfoIterator *> current_iterators_;
 };
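A side note on the new members: the composite owns its children through unique_ptr, while current_/current_iterators_ hold non-owning raw pointers that are refreshed on every Advance(), so they stay valid for as long as the composite itself does. A minimal sketch of that ownership pattern, with stand-in types:

    #include <memory>
    #include <utility>

    struct Child {
      int doc_id = -1;
    };

    class OrSketch {
     public:
      OrSketch(std::unique_ptr<Child> left, std::unique_ptr<Child> right)
          : left_(std::move(left)), right_(std::move(right)) {}

      void Advance(int next_doc_id) {
        // Re-point current_ each round; it never outlives left_/right_.
        current_ = (left_->doc_id == next_doc_id) ? left_.get() : right_.get();
      }

     private:
      std::unique_ptr<Child> left_, right_;
      Child* current_ = nullptr;  // non-owning, refreshed by Advance()
    };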
diff --git a/icing/index/iterator/doc-hit-info-iterator-or_test.cc b/icing/index/iterator/doc-hit-info-iterator-or_test.cc
index 3faa5ab..3f00a39 100644
--- a/icing/index/iterator/doc-hit-info-iterator-or_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-or_test.cc
@@ -32,7 +32,10 @@ namespace lib {
 namespace {
 
 using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
 using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
 
 TEST(CreateAndIteratorTest, Or) {
   // Basic test that we can create a working Or iterator. Further testing of
@@ -175,6 +178,159 @@ TEST(DocHitInfoIteratorOrTest, SectionIdMask) {
   EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
 }
 
+TEST(DocHitInfoIteratorOrTest, PopulateMatchedTermsStats) {
+  {
+    // Arbitrary section ids for the documents in the DocHitInfoIterators.
+    // Created to test correct section_id_mask behavior.
+    SectionIdMask section_id_mask1 = 0b01010101;  // hits in sections 0, 2, 4, 6
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+        1, 0, 2, 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0};
+    SectionIdMask section_id_mask2 = 0b00000110;  // hits in sections 1, 2
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2{
+        0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+    DocHitInfo doc_hit_info1 = DocHitInfo(4);
+    doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+    doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+    doc_hit_info1.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/3);
+    doc_hit_info1.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+
+    DocHitInfo doc_hit_info2 = DocHitInfo(4);
+    doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+    doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+    std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+    std::vector<DocHitInfo> second_vector = {doc_hit_info2};
+
+    auto first_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+    first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+    auto second_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+    second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+
+    DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+    std::vector<TermMatchInfo> matched_terms_stats;
+    or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+    ICING_EXPECT_OK(or_iter.Advance());
+    EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(4));
+
+    or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    ASSERT_THAT(matched_terms_stats, SizeIs(2));  // 2 terms
+    EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+    EXPECT_EQ(matched_terms_stats.at(1).term, "hello");
+    EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+                ElementsAreArray(term_frequencies1));
+    EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1);
+    EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+                ElementsAreArray(term_frequencies2));
+    EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask2);
+
+    EXPECT_FALSE(or_iter.Advance().ok());
+  }
+  {
+    // Arbitrary section ids for the documents in the DocHitInfoIterators.
+    // Created to test correct section_id_mask behavior.
+    SectionIdMask section_id_mask1 = 0b00000101;  // hits in sections 0, 2
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+        1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+    DocHitInfo doc_hit_info1 = DocHitInfo(4);
+    doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+    doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+
+    std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+    std::vector<DocHitInfo> second_vector = {doc_hit_info1};
+
+    auto first_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+    first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+    auto second_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hi");
+    second_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+    DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+    std::vector<TermMatchInfo> matched_terms_stats;
+    or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+    ICING_EXPECT_OK(or_iter.Advance());
+    EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(4));
+
+    or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    ASSERT_THAT(matched_terms_stats, SizeIs(1));  // 1 term
+    EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+    EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+                ElementsAreArray(term_frequencies1));
+    EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1);
+
+    EXPECT_FALSE(or_iter.Advance().ok());
+  }
+  {
+    // Arbitrary section ids for the documents in the DocHitInfoIterators.
+    // Created to test correct section_id_mask behavior.
+    SectionIdMask section_id_mask1 = 0b01010101;  // hits in sections 0, 2, 4, 6
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+        1, 0, 2, 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0};
+    SectionIdMask section_id_mask2 = 0b00000110;  // hits in sections 1, 2
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2{
+        0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+    DocHitInfo doc_hit_info1 = DocHitInfo(4);
+    doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+    doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+    doc_hit_info1.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/3);
+    doc_hit_info1.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+
+    DocHitInfo doc_hit_info2 = DocHitInfo(5);
+    doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+    doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+    std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+    std::vector<DocHitInfo> second_vector = {doc_hit_info2};
+
+    auto first_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+    first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+    auto second_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+    second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+
+    DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+    std::vector<TermMatchInfo> matched_terms_stats;
+    or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+    ICING_EXPECT_OK(or_iter.Advance());
+    EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(5));
+
+    or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    ASSERT_THAT(matched_terms_stats, SizeIs(1));  // 1 term
+    EXPECT_EQ(matched_terms_stats.at(0).term, "hello");
EXPECT_THAT(matched_terms_stats.at(0).term_frequencies, + ElementsAreArray(term_frequencies2)); + EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask2); + + ICING_EXPECT_OK(or_iter.Advance()); + EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(4)); + + matched_terms_stats.clear(); + or_iter.PopulateMatchedTermsStats(&matched_terms_stats); + ASSERT_THAT(matched_terms_stats, SizeIs(1)); // 1 term + EXPECT_EQ(matched_terms_stats.at(0).term, "hi"); + EXPECT_THAT(matched_terms_stats.at(0).term_frequencies, + ElementsAreArray(term_frequencies1)); + EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1); + + EXPECT_FALSE(or_iter.Advance().ok()); + } +} + TEST(DocHitInfoIteratorOrNaryTest, Initialize) { std::vector<std::unique_ptr<DocHitInfoIterator>> iterators; iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>()); @@ -316,6 +472,125 @@ TEST(DocHitInfoIteratorOrNaryTest, SectionIdMask) { EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result)); } +TEST(DocHitInfoIteratorOrNaryTest, PopulateMatchedTermsStats) { + // Arbitrary section ids/term frequencies for the documents in the + // DocHitInfoIterators. + // For term "hi", document 10 and 8 + SectionIdMask section_id_mask1_hi = 0b01000101; // hits in sections 0, 2, 6 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hi{ + 1, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info1_hi = DocHitInfo(10); + doc_hit_info1_hi.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1); + doc_hit_info1_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2); + doc_hit_info1_hi.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4); + + SectionIdMask section_id_mask2_hi = 0b00000110; // hits in sections 1, 2 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2_hi{ + 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info2_hi = DocHitInfo(8); + doc_hit_info2_hi.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2); + doc_hit_info2_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6); + + // For term "hello", document 10 and 9 + SectionIdMask section_id_mask1_hello = 0b00001001; // hits in sections 0, 3 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hello{ + 2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info1_hello = DocHitInfo(10); + doc_hit_info1_hello.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2); + doc_hit_info1_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3); + + SectionIdMask section_id_mask2_hello = 0b00001100; // hits in sections 2, 3 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2_hello{ + 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info2_hello = DocHitInfo(9); + doc_hit_info2_hello.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/3); + doc_hit_info2_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/2); + + // For term "ciao", document 9 and 8 + SectionIdMask section_id_mask1_ciao = 0b00000011; // hits in sections 0, 1 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_ciao{ + 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info1_ciao = DocHitInfo(9); + doc_hit_info1_ciao.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2); + doc_hit_info1_ciao.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/3); + + SectionIdMask section_id_mask2_ciao = 0b00011000; // hits in sections 3, 4 + std::array<Hit::TermFrequency, kMaxSectionId> 
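The cases above all exercise the same protocol: Advance() moves the iterator to the next (lower) document id, and only then does PopulateMatchedTermsStats() report stats for the current document. A minimal caller-side sketch of that loop (illustrative only, not part of this change; it assumes an already-constructed or_iter like the ones in the tests):

    // Drain an OR iterator, inspecting per-term stats for each document.
    std::vector<TermMatchInfo> stats;
    while (or_iter.Advance().ok()) {
      stats.clear();
      or_iter.PopulateMatchedTermsStats(&stats);
      for (const TermMatchInfo& info : stats) {
        // info.term hit the current document in the sections set in
        // info.section_ids_mask; info.term_frequencies[s] holds its count
        // in section s.
      }
    }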
+
 TEST(DocHitInfoIteratorOrNaryTest, Initialize) {
   std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
   iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>());
@@ -316,6 +472,125 @@ TEST(DocHitInfoIteratorOrNaryTest, SectionIdMask) {
   EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
 }
 
+TEST(DocHitInfoIteratorOrNaryTest, PopulateMatchedTermsStats) {
+  // Arbitrary section ids/term frequencies for the documents in the
+  // DocHitInfoIterators.
+  // For term "hi", documents 10 and 8
+  SectionIdMask section_id_mask1_hi = 0b01000101;  // hits in sections 0, 2, 6
+  std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hi{
+      1, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0};
+  DocHitInfo doc_hit_info1_hi = DocHitInfo(10);
+  doc_hit_info1_hi.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+  doc_hit_info1_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+  doc_hit_info1_hi.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+
+  SectionIdMask section_id_mask2_hi = 0b00000110;  // hits in sections 1, 2
+  std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2_hi{
+      0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  DocHitInfo doc_hit_info2_hi = DocHitInfo(8);
+  doc_hit_info2_hi.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+  doc_hit_info2_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+  // For term "hello", documents 10 and 9
+  SectionIdMask section_id_mask1_hello = 0b00001001;  // hits in sections 0, 3
+  std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hello{
+      2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  DocHitInfo doc_hit_info1_hello = DocHitInfo(10);
+  doc_hit_info1_hello.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2);
+  doc_hit_info1_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3);
+
+  SectionIdMask section_id_mask2_hello = 0b00001100;  // hits in sections 2, 3
+  std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2_hello{
+      0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  DocHitInfo doc_hit_info2_hello = DocHitInfo(9);
+  doc_hit_info2_hello.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/3);
+  doc_hit_info2_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/2);
+
+  // For term "ciao", documents 9 and 8
+  SectionIdMask section_id_mask1_ciao = 0b00000011;  // hits in sections 0, 1
+  std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_ciao{
+      2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  DocHitInfo doc_hit_info1_ciao = DocHitInfo(9);
+  doc_hit_info1_ciao.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2);
+  doc_hit_info1_ciao.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/3);
+
+  SectionIdMask section_id_mask2_ciao = 0b00011000;  // hits in sections 3, 4
+  std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2_ciao{
+      0, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  DocHitInfo doc_hit_info2_ciao = DocHitInfo(8);
+  doc_hit_info2_ciao.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3);
+  doc_hit_info2_ciao.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/2);
+
+  std::vector<DocHitInfo> first_vector = {doc_hit_info1_hi, doc_hit_info2_hi};
+  std::vector<DocHitInfo> second_vector = {doc_hit_info1_hello,
+                                           doc_hit_info2_hello};
+  std::vector<DocHitInfo> third_vector = {doc_hit_info1_ciao,
+                                          doc_hit_info2_ciao};
+
+  auto first_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+  auto second_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+  auto third_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(third_vector, "ciao");
+
+  std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+  iterators.push_back(std::move(first_iter));
+  iterators.push_back(std::move(second_iter));
+  iterators.push_back(std::move(third_iter));
+
+  DocHitInfoIteratorOrNary or_iter(std::move(iterators));
+  std::vector<TermMatchInfo> matched_terms_stats;
+  or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+  EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+  ICING_EXPECT_OK(or_iter.Advance());
+  EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(10));
+
+  or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+  ASSERT_THAT(matched_terms_stats, SizeIs(2));  // 2 terms
+  EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+  EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+              ElementsAreArray(term_frequencies1_hi));
+  EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1_hi);
+  EXPECT_EQ(matched_terms_stats.at(1).term, "hello");
+  EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+              ElementsAreArray(term_frequencies1_hello));
+  EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask1_hello);
+
+  ICING_EXPECT_OK(or_iter.Advance());
+  EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(9));
+
+  matched_terms_stats.clear();
+  or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+  ASSERT_THAT(matched_terms_stats, SizeIs(2));  // 2 terms
+  EXPECT_EQ(matched_terms_stats.at(0).term, "hello");
+  EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+              ElementsAreArray(term_frequencies2_hello));
+  EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask2_hello);
+  EXPECT_EQ(matched_terms_stats.at(1).term, "ciao");
+  EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+              ElementsAreArray(term_frequencies1_ciao));
+  EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask1_ciao);
+
+  ICING_EXPECT_OK(or_iter.Advance());
+  EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(8));
+
+  matched_terms_stats.clear();
+  or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+  ASSERT_THAT(matched_terms_stats, SizeIs(2));  // 2 terms
+  EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+  EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+              ElementsAreArray(term_frequencies2_hi));
+  EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask2_hi);
+  EXPECT_EQ(matched_terms_stats.at(1).term, "ciao");
+  EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+              ElementsAreArray(term_frequencies2_ciao));
+  EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask2_ciao);
+
+  EXPECT_FALSE(or_iter.Advance().ok());
+}
+
 }  // namespace
 
 }  // namespace lib
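Reading these fixtures: bit i of a SectionIdMask corresponds to section id i, so 0b01000101 marks sections 0, 2, and 6, and each expected term_frequencies array places a term's count at the index of its section. A small self-checking sketch of that correspondence (illustrative only; it assumes kMaxSectionId is 15, as the 15-element arrays in these tests imply):

    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
      constexpr int kMaxSectionId = 15;  // assumption, matching the tests
      uint16_t mask = 0b01000101;        // hits in sections 0, 2, 6
      std::array<uint8_t, kMaxSectionId> tf{1, 0, 2, 0, 0, 0, 4, 0,
                                            0, 0, 0, 0, 0, 0, 0};
      for (int id = 0; id < kMaxSectionId; ++id) {
        bool in_mask = (mask >> id) & 1;
        assert(in_mask == (tf[id] != 0));  // a count is present iff the bit is set
      }
      return 0;
    }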
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h
index ae5a896..ba74384 100644
--- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h
@@ -52,6 +52,15 @@ class DocHitInfoIteratorSectionRestrict : public DocHitInfoIterator {
 
   std::string ToString() const override;
 
+  // NOTE: currently, the section restrict decides which documents to
+  // return, but doesn't impact the relevance score of a document.
+  // TODO(b/173156803): decide whether we want to filter the
+  // matched_terms_stats for the restricted sections.
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo>* matched_terms_stats) const override {
+    delegate_->PopulateMatchedTermsStats(matched_terms_stats);
+  }
+
  private:
   std::unique_ptr<DocHitInfoIterator> delegate_;
   const DocumentStore& document_store_;
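Concretely, the pass-through above means a section-restricted query filters which documents come back while the reported stats still cover every section the delegate saw. A hypothetical illustration (construction of the iterator is elided; names follow the header above):

    // Hypothetical: `restricted` is some DocHitInfoIteratorSectionRestrict
    // limiting hits to a single section (construction elided).
    if (restricted->Advance().ok()) {
      std::vector<TermMatchInfo> stats;
      restricted->PopulateMatchedTermsStats(&stats);
      // Because the call is delegated unfiltered, stats[i].section_ids_mask
      // may still include sections outside the restrict; only document
      // selection honored the filter (see TODO(b/173156803)).
    }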
diff --git a/icing/index/iterator/doc-hit-info-iterator-test-util.h b/icing/index/iterator/doc-hit-info-iterator-test-util.h
index c4d7aa7..913696a 100644
--- a/icing/index/iterator/doc-hit-info-iterator-test-util.h
+++ b/icing/index/iterator/doc-hit-info-iterator-test-util.h
@@ -15,7 +15,6 @@
 #ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TEST_UTIL_H_
 #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TEST_UTIL_H_
 
-#include <cstdint>
 #include <string>
 #include <utility>
 #include <vector>
@@ -40,8 +39,9 @@ namespace lib {
 class DocHitInfoIteratorDummy : public DocHitInfoIterator {
  public:
   DocHitInfoIteratorDummy() = default;
-  explicit DocHitInfoIteratorDummy(std::vector<DocHitInfo> doc_hit_infos)
-      : doc_hit_infos_(std::move(doc_hit_infos)) {}
+  explicit DocHitInfoIteratorDummy(std::vector<DocHitInfo> doc_hit_infos,
+                                   std::string term = "")
+      : doc_hit_infos_(std::move(doc_hit_infos)), term_(std::move(term)) {}
 
   libtextclassifier3::Status Advance() override {
     if (index_ < doc_hit_infos_.size()) {
@@ -54,6 +54,36 @@ class DocHitInfoIteratorDummy : public DocHitInfoIterator {
                                         "No more DocHitInfos in iterator");
   }
 
+  // Imitates behavior of DocHitInfoIteratorTermMain/DocHitInfoIteratorTermLite
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo>* matched_terms_stats) const override {
+    if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+      // Current hit isn't valid; return.
+      return;
+    }
+    SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask();
+    std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = {
+        Hit::kNoTermFrequency};
+
+    while (section_mask) {
+      SectionId section_id = __builtin_ctz(section_mask);
+      section_term_frequencies.at(section_id) =
+          doc_hit_info_.hit_term_frequency(section_id);
+      section_mask &= ~(1u << section_id);
+    }
+    TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(),
+                             section_term_frequencies);
+
+    for (auto& cur_term_stats : *matched_terms_stats) {
+      if (cur_term_stats.term == term_stats.term) {
+        // Same docId and same term: no need to add the term again, and the
+        // term frequency should always be the same.
+        return;
+      }
+    }
+    matched_terms_stats->push_back(term_stats);
+  }
+
   void set_hit_intersect_section_ids_mask(
       SectionIdMask hit_intersect_section_ids_mask) {
     hit_intersect_section_ids_mask_ = hit_intersect_section_ids_mask;
@@ -91,6 +121,7 @@ class DocHitInfoIteratorDummy : public DocHitInfoIterator {
   int32_t num_blocks_inspected_ = 0;
   int32_t num_leaf_advance_calls_ = 0;
   std::vector<DocHitInfo> doc_hit_infos_;
+  std::string term_;
 };
 
 inline std::vector<DocumentId> GetDocumentIds(DocHitInfoIterator* iterator) {
diff --git a/icing/index/iterator/doc-hit-info-iterator.h b/icing/index/iterator/doc-hit-info-iterator.h
index bcc2b6e..c4d9901 100644
--- a/icing/index/iterator/doc-hit-info-iterator.h
+++ b/icing/index/iterator/doc-hit-info-iterator.h
@@ -17,6 +17,7 @@
 
 #include <cstdint>
 #include <string>
+#include <string_view>
 
 #include "icing/text_classifier/lib3/utils/base/status.h"
 #include "icing/text_classifier/lib3/utils/base/statusor.h"
@@ -28,6 +29,26 @@
 namespace icing {
 namespace lib {
 
+// Data structure that maps a single matched query term to its section mask
+// and the list of term frequencies.
+// TODO(b/158603837): add stat on whether the matched terms are prefix matched
+// or not. This information will be used to boost exact match.
+struct TermMatchInfo {
+  std::string_view term;
+  // SectionIdMask associated to the term.
+  SectionIdMask section_ids_mask;
+  // Array with fixed size kMaxSectionId. For every section id (the array
+  // index), it stores the term frequency of the term in that section.
+  std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies;
+
+  explicit TermMatchInfo(
+      std::string_view term, SectionIdMask section_ids_mask,
+      std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies)
+      : term(term),
+        section_ids_mask(section_ids_mask),
+        term_frequencies(std::move(term_frequencies)) {}
+};
+
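A concrete reading of the struct: term is a non-owning string_view into the iterator's term, so a TermMatchInfo must not outlive the iterator that produced it. A minimal construction sketch (values borrowed from the tests earlier in this change):

    // Stats for "hello" hitting section 2 six times (mask bit 2 set).
    std::array<Hit::TermFrequency, kMaxSectionId> tf = {};  // all kNoTermFrequency
    tf.at(2) = 6;
    TermMatchInfo info("hello", /*section_ids_mask=*/0b00000100, tf);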
 // Iterator over DocHitInfos (collapsed Hits) in REVERSE document_id order.
 //
 // NOTE: You must call Advance() before calling hit_info() or
@@ -70,6 +91,14 @@ class DocHitInfoIterator {
   // A string representing the iterator.
   virtual std::string ToString() const = 0;
 
+  // For the last hit docid, retrieves all the matched query terms and other
+  // stats; see TermMatchInfo.
+  // If Advance() wasn't called after construction, if the last Advance()
+  // returned false, or if the concrete HitIterator didn't override this
+  // method, the vector isn't populated.
+  virtual void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo>* matched_terms_stats) const {}
+
  protected:
   DocHitInfo doc_hit_info_;
   SectionIdMask hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h
index bd2de6d..ac5e97f 100644
--- a/icing/index/lite/doc-hit-info-iterator-term-lite.h
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h
@@ -49,6 +49,34 @@ class DocHitInfoIteratorTermLite : public DocHitInfoIterator {
   }
   int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }
 
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo>* matched_terms_stats) const override {
+    if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+      // Current hit isn't valid; return.
+      return;
+    }
+    SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask();
+    std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = {
+        Hit::kNoTermFrequency};
+    while (section_mask) {
+      SectionId section_id = __builtin_ctz(section_mask);
+      section_term_frequencies.at(section_id) =
+          doc_hit_info_.hit_term_frequency(section_id);
+      section_mask &= ~(1u << section_id);
+    }
+    TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(),
+                             std::move(section_term_frequencies));
+
+    for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) {
+      if (cur_term_stats.term == term_stats.term) {
+        // Same docId and same term: no need to add the term again, and the
+        // term frequency should always be the same.
+        return;
+      }
+    }
+    matched_terms_stats->push_back(std::move(term_stats));
+  }
+
  protected:
   // Add DocHitInfos corresponding to term_ to cached_hits_.
   virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
diff --git a/icing/index/main/doc-hit-info-iterator-term-main.h b/icing/index/main/doc-hit-info-iterator-term-main.h
index 1f77226..d626d7a 100644
--- a/icing/index/main/doc-hit-info-iterator-term-main.h
+++ b/icing/index/main/doc-hit-info-iterator-term-main.h
@@ -49,6 +49,34 @@ class DocHitInfoIteratorTermMain : public DocHitInfoIterator {
   }
   int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }
 
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo>* matched_terms_stats) const override {
+    if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+      // Current hit isn't valid; return.
+      return;
+    }
+    SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask();
+    std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = {
+        Hit::kNoTermFrequency};
+    while (section_mask) {
+      SectionId section_id = __builtin_ctz(section_mask);
+      section_term_frequencies.at(section_id) =
+          doc_hit_info_.hit_term_frequency(section_id);
+      section_mask &= ~(1u << section_id);
+    }
+    TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(),
+                             std::move(section_term_frequencies));
+
+    for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) {
+      if (cur_term_stats.term == term_stats.term) {
+        // Same docId and same term: no need to add the term again, and the
+        // term frequency should always be the same.
+        return;
+      }
+    }
+    matched_terms_stats->push_back(std::move(term_stats));
+  }
+
  protected:
   // Add DocHitInfos corresponding to term_ to cached_doc_hit_infos_.
   virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
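The loop shared by the three PopulateMatchedTermsStats overrides visits each hit section by repeatedly isolating the lowest set bit of the mask with count-trailing-zeros. The same technique in isolation (a runnable sketch; __builtin_ctz is a GCC/Clang builtin):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t section_mask = 0b01010101;  // hits in sections 0, 2, 4, 6
      while (section_mask) {
        int section_id = __builtin_ctz(section_mask);  // index of lowest set bit
        std::printf("hit in section %d\n", section_id);
        section_mask &= ~(1u << section_id);           // clear it and continue
      }
      return 0;
    }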