author | Tim Barron <tjbarron@google.com> | 2021-01-14 20:53:07 +0000
committer | Tim Barron <tjbarron@google.com> | 2021-01-14 20:53:07 +0000
commit | a34db390d80f862bfaaa49dea3605c5fec3bca3d (patch)
tree | 67a4a87803cf2b31619c3ddff3674967fc1461ce /icing/index
parent | 59c2caa38fd8dca3760dad751f4f8e5de8be25f5 (diff)
download | icing-a34db390d80f862bfaaa49dea3605c5fec3bca3d.tar.gz
Update Icing from upstream.
Change-Id: I43038a59e7170fb8ecbaf6098a37221b3682ce09
Diffstat (limited to 'icing/index')
18 files changed, 874 insertions, 144 deletions
diff --git a/icing/index/hit/doc-hit-info.h b/icing/index/hit/doc-hit-info.h
index 8171960..0be87d6 100644
--- a/icing/index/hit/doc-hit-info.h
+++ b/icing/index/hit/doc-hit-info.h
@@ -25,7 +25,7 @@ namespace icing {
 namespace lib {
 
-// DocHitInfo provides a collapsed view of all hits for a specific term and doc.
+// DocHitInfo provides a collapsed view of all hits for a specific doc.
 // Hits contain a document_id, section_id and a term frequency. The
 // information in multiple hits is collapsed into a DocHitInfo by providing a
 // SectionIdMask of all sections that contained a hit for this term as well as
@@ -36,7 +36,7 @@ class DocHitInfo {
       SectionIdMask hit_section_ids_mask = kSectionIdMaskNone)
       : document_id_(document_id_in),
         hit_section_ids_mask_(hit_section_ids_mask) {
-    memset(hit_term_frequency_, Hit::kDefaultTermFrequency,
+    memset(hit_term_frequency_, Hit::kNoTermFrequency,
           sizeof(hit_term_frequency_));
  }
diff --git a/icing/index/hit/doc-hit-info_test.cc b/icing/index/hit/doc-hit-info_test.cc
index 15c0de9..36c1a06 100644
--- a/icing/index/hit/doc-hit-info_test.cc
+++ b/icing/index/hit/doc-hit-info_test.cc
@@ -34,13 +34,13 @@ constexpr DocumentId kSomeOtherDocumentId = 54;
 TEST(DocHitInfoTest, InitialMaxHitTermFrequencies) {
   DocHitInfo info(kSomeDocumentId);
   for (SectionId i = 0; i <= kMaxSectionId; ++i) {
-    EXPECT_THAT(info.hit_term_frequency(i), Eq(Hit::kDefaultTermFrequency));
+    EXPECT_THAT(info.hit_term_frequency(i), Eq(Hit::kNoTermFrequency));
   }
 }
 
 TEST(DocHitInfoTest, UpdateHitTermFrequenciesForTheFirstTime) {
   DocHitInfo info(kSomeDocumentId);
-  ASSERT_THAT(info.hit_term_frequency(3), Eq(Hit::kDefaultTermFrequency));
+  ASSERT_THAT(info.hit_term_frequency(3), Eq(Hit::kNoTermFrequency));
 
   // Updating a section for the first time, should change its hit
   // term_frequency
diff --git a/icing/index/hit/hit.h b/icing/index/hit/hit.h
index 525a5e5..ee1f64b 100644
--- a/icing/index/hit/hit.h
+++ b/icing/index/hit/hit.h
@@ -58,6 +58,7 @@ class Hit {
   static constexpr TermFrequency kMaxTermFrequency =
       std::numeric_limits<TermFrequency>::max();
   static constexpr TermFrequency kDefaultTermFrequency = 1;
+  static constexpr TermFrequency kNoTermFrequency = 0;
 
   explicit Hit(Value value = kInvalidValue,
                TermFrequency term_frequency = kDefaultTermFrequency)
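The three edits above work together: a DocHitInfo now initializes every per-section frequency to the new Hit::kNoTermFrequency (0), so "this section had no hit" is distinguishable from kDefaultTermFrequency (1), which is a real frequency value. A minimal self-contained sketch of the collapsing idea, with simplified stand-in types (the real class lives in icing/index/hit/doc-hit-info.h):

    #include <array>
    #include <cstdint>

    using SectionId = int8_t;
    using SectionIdMask = uint16_t;
    using TermFrequency = uint8_t;

    constexpr SectionId kMaxSectionId = 15;
    constexpr TermFrequency kNoTermFrequency = 0;       // section had no hit
    constexpr TermFrequency kDefaultTermFrequency = 1;  // a real hit, count unspecified

    class DocHitInfoSketch {
     public:
      DocHitInfoSketch() {
        // Start every section at kNoTermFrequency. With the old default of 1,
        // an untouched section was indistinguishable from one real hit.
        frequencies_.fill(kNoTermFrequency);
      }

      // Collapse one hit into the per-document view: mark the section in the
      // mask and record the hit's term frequency for that section.
      void UpdateSection(SectionId section_id, TermFrequency term_frequency) {
        section_ids_mask_ |= (SectionIdMask{1} << section_id);
        frequencies_[section_id] = term_frequency;
      }

      TermFrequency hit_term_frequency(SectionId section_id) const {
        return frequencies_[section_id];
      }

     private:
      SectionIdMask section_ids_mask_ = 0;
      std::array<TermFrequency, kMaxSectionId + 1> frequencies_{};
    };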
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc
index 892263b..d2f9d41 100644
--- a/icing/index/index-processor.cc
+++ b/icing/index/index-processor.cc
@@ -31,34 +31,30 @@
 #include "icing/schema/section-manager.h"
 #include "icing/schema/section.h"
 #include "icing/store/document-id.h"
-#include "icing/tokenization/language-segmenter.h"
 #include "icing/tokenization/token.h"
 #include "icing/tokenization/tokenizer-factory.h"
 #include "icing/tokenization/tokenizer.h"
 #include "icing/transform/normalizer.h"
 #include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
 
 namespace icing {
 namespace lib {
 
 libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>>
-IndexProcessor::Create(const SchemaStore* schema_store,
-                       const LanguageSegmenter* lang_segmenter,
-                       const Normalizer* normalizer, Index* index,
+IndexProcessor::Create(const Normalizer* normalizer, Index* index,
                        const IndexProcessor::Options& options,
                        const Clock* clock) {
-  ICING_RETURN_ERROR_IF_NULL(schema_store);
-  ICING_RETURN_ERROR_IF_NULL(lang_segmenter);
   ICING_RETURN_ERROR_IF_NULL(normalizer);
   ICING_RETURN_ERROR_IF_NULL(index);
   ICING_RETURN_ERROR_IF_NULL(clock);
 
-  return std::unique_ptr<IndexProcessor>(new IndexProcessor(
-      schema_store, lang_segmenter, normalizer, index, options, clock));
+  return std::unique_ptr<IndexProcessor>(
+      new IndexProcessor(normalizer, index, options, clock));
 }
 
 libtextclassifier3::Status IndexProcessor::IndexDocument(
-    const DocumentProto& document, DocumentId document_id,
+    const TokenizedDocument& tokenized_document, DocumentId document_id,
     NativePutDocumentStats* put_document_stats) {
   std::unique_ptr<Timer> index_timer = clock_.GetNewTimer();
 
@@ -68,54 +64,45 @@ libtextclassifier3::Status IndexProcessor::IndexDocument(
         "DocumentId %d must be greater than last added document_id %d",
         document_id, index_->last_added_document_id()));
   }
-  ICING_ASSIGN_OR_RETURN(std::vector<Section> sections,
-                         schema_store_.ExtractSections(document));
 
   uint32_t num_tokens = 0;
   libtextclassifier3::Status overall_status;
-  for (const Section& section : sections) {
+  for (const TokenizedSection& section : tokenized_document.sections()) {
     // TODO(b/152934343): pass real namespace ids in
     Index::Editor editor =
         index_->Edit(document_id, section.metadata.id,
                      section.metadata.term_match_type, /*namespace_id=*/0);
-    for (std::string_view subcontent : section.content) {
-      ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer,
-                             tokenizer_factory::CreateIndexingTokenizer(
-                                 section.metadata.tokenizer, &lang_segmenter_));
-      ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> itr,
-                             tokenizer->Tokenize(subcontent));
-      while (itr->Advance()) {
-        if (++num_tokens > options_.max_tokens_per_document) {
-          // Index all tokens buffered so far.
-          editor.IndexAllBufferedTerms();
-          if (put_document_stats != nullptr) {
-            put_document_stats->mutable_tokenization_stats()
-                ->set_exceeded_max_token_num(true);
-            put_document_stats->mutable_tokenization_stats()
-                ->set_num_tokens_indexed(options_.max_tokens_per_document);
-          }
-          switch (options_.token_limit_behavior) {
-            case Options::TokenLimitBehavior::kReturnError:
-              return absl_ports::ResourceExhaustedError(
-                  "Max number of tokens reached!");
-            case Options::TokenLimitBehavior::kSuppressError:
-              return overall_status;
-          }
+    for (std::string_view token : section.token_sequence) {
+      if (++num_tokens > options_.max_tokens_per_document) {
+        // Index all tokens buffered so far.
+        editor.IndexAllBufferedTerms();
+        if (put_document_stats != nullptr) {
+          put_document_stats->mutable_tokenization_stats()
+              ->set_exceeded_max_token_num(true);
+          put_document_stats->mutable_tokenization_stats()
+              ->set_num_tokens_indexed(options_.max_tokens_per_document);
        }
-        std::string term = normalizer_.NormalizeTerm(itr->GetToken().text);
-        // Add this term to Hit buffer. Even if adding this hit fails, we keep
-        // trying to add more hits because it's possible that future hits could
-        // still be added successfully. For instance if the lexicon is full, we
-        // might fail to add a hit for a new term, but should still be able to
-        // add hits for terms that are already in the index.
-        auto status = editor.BufferTerm(term.c_str());
-        if (overall_status.ok() && !status.ok()) {
-          // If we've succeeded to add everything so far, set overall_status to
-          // represent this new failure. If we've already failed, no need to
-          // update the status - we're already going to return a resource
-          // exhausted error.
-          overall_status = status;
+        switch (options_.token_limit_behavior) {
+          case Options::TokenLimitBehavior::kReturnError:
+            return absl_ports::ResourceExhaustedError(
+                "Max number of tokens reached!");
+          case Options::TokenLimitBehavior::kSuppressError:
+            return overall_status;
        }
      }
+      std::string term = normalizer_.NormalizeTerm(token);
+      // Add this term to Hit buffer. Even if adding this hit fails, we keep
+      // trying to add more hits because it's possible that future hits could
+      // still be added successfully. For instance if the lexicon is full, we
+      // might fail to add a hit for a new term, but should still be able to
+      // add hits for terms that are already in the index.
+      auto status = editor.BufferTerm(term.c_str());
+      if (overall_status.ok() && !status.ok()) {
+        // If we've succeeded to add everything so far, set overall_status to
+        // represent this new failure. If we've already failed, no need to
+        // update the status - we're already going to return a resource
+        // exhausted error.
+        overall_status = status;
+      }
    }
 
     // Add all the seen terms to the index with their term frequency.
     auto status = editor.IndexAllBufferedTerms();
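The rewritten IndexDocument above no longer tokenizes; it walks pre-computed token sequences and only enforces the per-document token budget. A rough, self-contained sketch of that control flow, with stand-in types (the real loop flushes through editor.IndexAllBufferedTerms() and maps the limit to RESOURCE_EXHAUSTED):

    #include <string>
    #include <vector>

    enum class TokenLimitBehavior { kReturnError, kSuppressError };

    struct Options {
      int max_tokens_per_document = 0;
      TokenLimitBehavior token_limit_behavior = TokenLimitBehavior::kReturnError;
    };

    // Returns false only when the limit is hit under kReturnError.
    // buffered_terms stands in for the index editor's hit buffer.
    bool IndexSection(const std::vector<std::string>& token_sequence,
                      const Options& options, int* num_tokens,
                      std::vector<std::string>* buffered_terms) {
      for (const std::string& token : token_sequence) {
        if (++(*num_tokens) > options.max_tokens_per_document) {
          // Everything buffered so far still gets indexed before bailing out
          // (the real code calls editor.IndexAllBufferedTerms() here).
          switch (options.token_limit_behavior) {
            case TokenLimitBehavior::kReturnError:
              return false;  // RESOURCE_EXHAUSTED in the real code
            case TokenLimitBehavior::kSuppressError:
              return true;   // silently drop the remaining tokens
          }
        }
        buffered_terms->push_back(token);  // stands in for editor.BufferTerm()
      }
      return true;
    }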
diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h
index 2eb4ad8..9fc7c46 100644
--- a/icing/index/index-processor.h
+++ b/icing/index/index-processor.h
@@ -21,12 +21,11 @@
 #include "icing/text_classifier/lib3/utils/base/status.h"
 #include "icing/index/index.h"
 #include "icing/proto/document.pb.h"
-#include "icing/schema/schema-store.h"
 #include "icing/schema/section-manager.h"
 #include "icing/store/document-id.h"
-#include "icing/tokenization/language-segmenter.h"
 #include "icing/tokenization/token.h"
 #include "icing/transform/normalizer.h"
+#include "icing/util/tokenized-document.h"
 
 namespace icing {
 namespace lib {
@@ -58,14 +57,13 @@ class IndexProcessor {
   //   An IndexProcessor on success
   //   FAILED_PRECONDITION if any of the pointers is null.
   static libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>> Create(
-      const SchemaStore* schema_store, const LanguageSegmenter* lang_segmenter,
       const Normalizer* normalizer, Index* index, const Options& options,
       const Clock* clock);
 
-  // Add document to the index, associated with document_id. If the number of
-  // tokens in the document exceeds max_tokens_per_document, then only the first
-  // max_tokens_per_document will be added to the index. All tokens of length
-  // exceeding max_token_length will be shortened to max_token_length.
+  // Add tokenized document to the index, associated with document_id. If the
+  // number of tokens in the document exceeds max_tokens_per_document, then only
+  // the first max_tokens_per_document will be added to the index. All tokens of
+  // length exceeding max_token_length will be shortened to max_token_length.
   //
   // Indexing a document *may* trigger an index merge. If a merge fails, then
   // all content in the index will be lost.
@@ -82,25 +80,19 @@ class IndexProcessor {
   //   NOT_FOUND if there is no definition for the document's schema type.
   //   INTERNAL_ERROR if any other errors occur
   libtextclassifier3::Status IndexDocument(
-      const DocumentProto& document, DocumentId document_id,
+      const TokenizedDocument& tokenized_document, DocumentId document_id,
       NativePutDocumentStats* put_document_stats = nullptr);
 
  private:
-  IndexProcessor(const SchemaStore* schema_store,
-                 const LanguageSegmenter* lang_segmenter,
-                 const Normalizer* normalizer, Index* index,
+  IndexProcessor(const Normalizer* normalizer, Index* index,
                  const Options& options, const Clock* clock)
-      : schema_store_(*schema_store),
-        lang_segmenter_(*lang_segmenter),
-        normalizer_(*normalizer),
+      : normalizer_(*normalizer),
        index_(index),
        options_(options),
        clock_(*clock) {}
 
   std::string NormalizeToken(const Token& token);
 
-  const SchemaStore& schema_store_;
-  const LanguageSegmenter& lang_segmenter_;
   const Normalizer& normalizer_;
   Index* const index_;
   const Options options_;
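Taken together, the .cc and .h changes split indexing into two steps: tokenize once, then index. A caller-side sketch assembled only from the signatures visible in this diff (the concrete setup of schema_store, lang_segmenter, normalizer, index, options and clock is omitted; error propagation uses the codebase's ICING_* macros):

    // Step 1: extract and tokenize sections. This is where SchemaStore and
    // LanguageSegmenter are needed now.
    ICING_ASSIGN_OR_RETURN(
        TokenizedDocument tokenized_document,
        TokenizedDocument::Create(schema_store, lang_segmenter, document));

    // Step 2: the IndexProcessor only normalizes terms and writes hits.
    ICING_ASSIGN_OR_RETURN(
        std::unique_ptr<IndexProcessor> index_processor,
        IndexProcessor::Create(normalizer, index, options, clock));
    ICING_RETURN_IF_ERROR(
        index_processor->IndexDocument(tokenized_document, document_id));

One payoff of the split: the same TokenizedDocument can be indexed repeatedly without re-running segmentation, which is exactly what the benchmarks below exploit.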
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 96a390b..afeac4d 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -31,6 +31,7 @@
 #include "icing/transform/normalizer-factory.h"
 #include "icing/transform/normalizer.h"
 #include "icing/util/logging.h"
+#include "icing/util/tokenized-document.h"
 #include "unicode/uloc.h"
 
 // Run on a Linux workstation:
@@ -168,16 +169,13 @@ void CleanUp(const Filesystem& filesystem, const std::string& index_dir) {
 }
 
 std::unique_ptr<IndexProcessor> CreateIndexProcessor(
-    const SchemaStore* schema_store,
-    const LanguageSegmenter* language_segmenter, const Normalizer* normalizer,
-    Index* index, const Clock* clock) {
+    const Normalizer* normalizer, Index* index, const Clock* clock) {
   IndexProcessor::Options processor_options{};
   processor_options.max_tokens_per_document = 1024 * 1024 * 10;
   processor_options.token_limit_behavior =
       IndexProcessor::Options::TokenLimitBehavior::kReturnError;
 
-  return IndexProcessor::Create(schema_store, language_segmenter, normalizer,
-                                index, processor_options, clock)
+  return IndexProcessor::Create(normalizer, index, processor_options, clock)
       .ValueOrDie();
 }
 
@@ -203,15 +201,18 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) {
   Clock clock;
   std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
   std::unique_ptr<IndexProcessor> index_processor =
-      CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
-                           normalizer.get(), index.get(), &clock);
+      CreateIndexProcessor(normalizer.get(), index.get(), &clock);
 
   DocumentProto input_document = CreateDocumentWithOneProperty(state.range(0));
+  TokenizedDocument tokenized_document(std::move(
+      TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+                                input_document)
+          .ValueOrDie()));
 
   DocumentId document_id = 0;
   for (auto _ : state) {
     ICING_ASSERT_OK(
-        index_processor->IndexDocument(input_document, document_id++));
+        index_processor->IndexDocument(tokenized_document, document_id++));
   }
 
   CleanUp(filesystem, index_dir);
@@ -254,16 +255,19 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) {
   Clock clock;
   std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
   std::unique_ptr<IndexProcessor> index_processor =
-      CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
-                           normalizer.get(), index.get(), &clock);
+      CreateIndexProcessor(normalizer.get(), index.get(), &clock);
 
   DocumentProto input_document =
       CreateDocumentWithTenProperties(state.range(0));
+  TokenizedDocument tokenized_document(std::move(
+      TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+                                input_document)
+          .ValueOrDie()));
 
   DocumentId document_id = 0;
   for (auto _ : state) {
     ICING_ASSERT_OK(
-        index_processor->IndexDocument(input_document, document_id++));
+        index_processor->IndexDocument(tokenized_document, document_id++));
   }
 
   CleanUp(filesystem, index_dir);
@@ -306,16 +310,19 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) {
   Clock clock;
   std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
   std::unique_ptr<IndexProcessor> index_processor =
-      CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
-                           normalizer.get(), index.get(), &clock);
+      CreateIndexProcessor(normalizer.get(), index.get(), &clock);
 
   DocumentProto input_document =
       CreateDocumentWithDiacriticLetters(state.range(0));
+  TokenizedDocument tokenized_document(std::move(
+      TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+                                input_document)
+          .ValueOrDie()));
 
   DocumentId document_id = 0;
   for (auto _ : state) {
     ICING_ASSERT_OK(
-        index_processor->IndexDocument(input_document, document_id++));
+        index_processor->IndexDocument(tokenized_document, document_id++));
   }
 
   CleanUp(filesystem, index_dir);
@@ -358,15 +365,18 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) {
   Clock clock;
   std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock);
   std::unique_ptr<IndexProcessor> index_processor =
-      CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
-                           normalizer.get(), index.get(), &clock);
+      CreateIndexProcessor(normalizer.get(), index.get(), &clock);
 
   DocumentProto input_document = CreateDocumentWithHiragana(state.range(0));
+  TokenizedDocument tokenized_document(std::move(
+      TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+                                input_document)
+          .ValueOrDie()));
 
   DocumentId document_id = 0;
   for (auto _ : state) {
     ICING_ASSERT_OK(
-        index_processor->IndexDocument(input_document, document_id++));
+        index_processor->IndexDocument(tokenized_document, document_id++));
   }
 
   CleanUp(filesystem, index_dir);
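Note the recurring benchmark edit: TokenizedDocument::Create() moves out of the timed loop, so the measurement now covers only IndexDocument. A small runnable sketch of that general pattern with the Google Benchmark API (ExpensiveSetup and MeasuredOperation are hypothetical placeholders standing in for tokenization and indexing):

    #include <string>
    #include <benchmark/benchmark.h>

    static std::string ExpensiveSetup() { return std::string(1 << 20, 'x'); }
    static void MeasuredOperation(const std::string& s) {
      benchmark::DoNotOptimize(s.size());
    }

    static void BM_MeasuredOperation(benchmark::State& state) {
      // One-time setup outside the loop is excluded from the measurement,
      // just as the benchmarks above now tokenize before the timed loop.
      std::string input = ExpensiveSetup();
      for (auto _ : state) {
        MeasuredOperation(input);
      }
    }
    BENCHMARK(BM_MeasuredOperation);
    BENCHMARK_MAIN();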
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index bdd9575..e6bb615 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -53,6 +53,7 @@
 #include "icing/tokenization/language-segmenter.h"
 #include "icing/transform/normalizer-factory.h"
 #include "icing/transform/normalizer.h"
+#include "icing/util/tokenized-document.h"
 #include "unicode/uloc.h"
 
 namespace icing {
@@ -140,8 +141,7 @@ class IndexProcessorTest : public Test {
     ICING_ASSERT_OK_AND_ASSIGN(
         index_processor_,
-        IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                               normalizer_.get(), index_.get(),
+        IndexProcessor::Create(normalizer_.get(), index_.get(),
                                processor_options, &fake_clock_));
     mock_icing_filesystem_ = std::make_unique<IcingMockFilesystem>();
   }
@@ -195,7 +195,7 @@ class IndexProcessorTest : public Test {
     type_config->set_schema_type(std::string(kFakeType));
 
     AddStringProperty(std::string(kExactProperty), DataType::STRING,
-                      Cardinality::REQUIRED, TermMatchType::EXACT_ONLY,
+                      Cardinality::OPTIONAL, TermMatchType::EXACT_ONLY,
                      type_config);
 
     AddStringProperty(std::string(kPrefixedProperty), DataType::STRING,
@@ -244,25 +244,11 @@ TEST_F(IndexProcessorTest, CreationWithNullPointerShouldFail) {
   processor_options.token_limit_behavior =
       IndexProcessor::Options::TokenLimitBehavior::kReturnError;
 
-  EXPECT_THAT(
-      IndexProcessor::Create(/*schema_store=*/nullptr, lang_segmenter_.get(),
-                             normalizer_.get(), index_.get(), processor_options,
-                             &fake_clock_),
-      StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-
-  EXPECT_THAT(
-      IndexProcessor::Create(schema_store_.get(), /*lang_segmenter=*/nullptr,
-                             normalizer_.get(), index_.get(), processor_options,
-                             &fake_clock_),
-      StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-
-  EXPECT_THAT(IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                                     /*normalizer=*/nullptr, index_.get(),
+  EXPECT_THAT(IndexProcessor::Create(/*normalizer=*/nullptr, index_.get(),
                                      processor_options, &fake_clock_),
               StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
 
-  EXPECT_THAT(IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                                     normalizer_.get(), /*index=*/nullptr,
+  EXPECT_THAT(IndexProcessor::Create(normalizer_.get(), /*index=*/nullptr,
                                      processor_options, &fake_clock_),
               StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
 }
@@ -276,7 +262,12 @@ TEST_F(IndexProcessorTest, NoTermMatchTypeContent) {
           .AddBytesProperty(std::string(kUnindexedProperty2),
                             "attachment bytes")
           .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
 }
 
@@ -287,7 +278,12 @@ TEST_F(IndexProcessorTest, OneDoc) {
           .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "hello world")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
@@ -313,7 +309,12 @@ TEST_F(IndexProcessorTest, MultipleDocs) {
          .AddStringProperty(std::string(kExactProperty), "hello world")
          .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
   std::string coffeeRepeatedString = "coffee";
@@ -329,7 +330,12 @@ TEST_F(IndexProcessorTest, MultipleDocs) {
world world wide") .Build(); - EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk()); + ICING_ASSERT_OK_AND_ASSIGN( + tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1), + IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, @@ -372,11 +378,18 @@ TEST_F(IndexProcessorTest, DocWithNestedProperty) { .AddDocumentProperty( std::string(kSubProperty), DocumentBuilder() + .SetKey("icing", "nested_type/1") + .SetSchema(std::string(kNestedType)) .AddStringProperty(std::string(kNestedProperty), "rocky raccoon") .Build()) .Build(); - EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk()); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, @@ -396,7 +409,12 @@ TEST_F(IndexProcessorTest, DocWithRepeatedProperty) { .AddStringProperty(std::string(kRepeatedProperty), "rocky", "italian stallion") .Build(); - EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk()); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, @@ -417,8 +435,7 @@ TEST_F(IndexProcessorTest, TooManyTokensReturnError) { ICING_ASSERT_OK_AND_ASSIGN( index_processor_, - IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(), - normalizer_.get(), index_.get(), options, + IndexProcessor::Create(normalizer_.get(), index_.get(), options, &fake_clock_)); DocumentProto document = @@ -428,7 +445,11 @@ TEST_F(IndexProcessorTest, TooManyTokensReturnError) { .AddStringProperty(std::string(kExactProperty), "hello world") .AddStringProperty(std::string(kPrefixedProperty), "good night moon!") .Build(); - EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); @@ -457,8 +478,7 @@ TEST_F(IndexProcessorTest, TooManyTokensSuppressError) { ICING_ASSERT_OK_AND_ASSIGN( index_processor_, - IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(), - normalizer_.get(), index_.get(), options, + IndexProcessor::Create(normalizer_.get(), index_.get(), options, &fake_clock_)); DocumentProto document = @@ -468,7 +488,12 @@ TEST_F(IndexProcessorTest, TooManyTokensSuppressError) { .AddStringProperty(std::string(kExactProperty), "hello world") .AddStringProperty(std::string(kPrefixedProperty), "good night moon!") .Build(); - EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk()); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument 
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
   // "night" should have been indexed.
@@ -498,8 +523,7 @@ TEST_F(IndexProcessorTest, TooLongTokens) {
   ICING_ASSERT_OK_AND_ASSIGN(
       index_processor_,
-      IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                             normalizer.get(), index_.get(), options,
+      IndexProcessor::Create(normalizer.get(), index_.get(), options,
                              &fake_clock_));
 
   DocumentProto document =
@@ -509,7 +533,12 @@ TEST_F(IndexProcessorTest, TooLongTokens) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "hello world")
          .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
   // "good" should have been indexed normally.
@@ -542,7 +571,12 @@ TEST_F(IndexProcessorTest, NonPrefixedContentPrefixQuery) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "best rocky movies")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
   document =
@@ -551,7 +585,12 @@ TEST_F(IndexProcessorTest, NonPrefixedContentPrefixQuery) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPrefixedProperty), "rocky raccoon")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
 
   // Only document_id 1 should surface in a prefix query for "Rock"
@@ -570,7 +609,12 @@ TEST_F(IndexProcessorTest, TokenNormalization) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
   document =
@@ -579,7 +623,12 @@ TEST_F(IndexProcessorTest, TokenNormalization) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "all lower case")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
 
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
@@ -600,7 +649,12 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
 
   // Indexing a document with document_id < last_added_document_id should cause
@@ -611,11 +665,15 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "all lower case")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+  ICING_ASSERT_OK_AND_ASSIGN(
+      tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
 
   // As should indexing a document with document_id == last_added_document_id.
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
 
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
@@ -635,8 +693,7 @@ TEST_F(IndexProcessorTest, NonAsciiIndexing) {
   ICING_ASSERT_OK_AND_ASSIGN(
       index_processor_,
-      IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                             normalizer_.get(), index_.get(),
+      IndexProcessor::Create(normalizer_.get(), index_.get(),
                              processor_options, &fake_clock_));
 
   DocumentProto document =
@@ -646,7 +703,12 @@ TEST_F(IndexProcessorTest, NonAsciiIndexing) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty),
                             "你好,世界!你好:世界。“你好”世界?")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
@@ -666,8 +728,7 @@ TEST_F(IndexProcessorTest,
   ICING_ASSERT_OK_AND_ASSIGN(
       index_processor_,
-      IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                             normalizer_.get(), index_.get(), processor_options,
+      IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options,
                              &fake_clock_));
 
   // This is the maximum token length that an empty lexicon constructed for a
@@ -684,7 +745,11 @@
                            absl_ports::StrCat(enormous_string, " foo"))
          .AddStringProperty(std::string(kPrefixedProperty), "bar baz")
          .Build();
-  EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
              StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
 
   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
 
@@ -715,6 +780,10 @@ TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), kIpsumText)
          .Build();
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
   Index::Options options(index_dir_,
                          /*index_merge_size=*/document.ByteSizeLong() * 100);
   ICING_ASSERT_OK_AND_ASSIGN(
@@ -727,8 +796,7 @@ TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) {
   ICING_ASSERT_OK_AND_ASSIGN(
       index_processor_,
-      IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                             normalizer_.get(), index_.get(), processor_options,
+      IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options,
                              &fake_clock_));
   DocumentId doc_id = 0;
   // Have determined experimentally that indexing 3373 documents with this text
   // will cause the LiteIndex to fill up. Further indexing will fail unless the
   // index processor properly merges the LiteIndex into the MainIndex and
   // empties the LiteIndex.
   constexpr int kNumDocsLiteIndexExhaustion = 3373;
   for (; doc_id < kNumDocsLiteIndexExhaustion; ++doc_id) {
-    EXPECT_THAT(index_processor_->IndexDocument(document, doc_id), IsOk());
+    EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+                IsOk());
     EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
   }
-  EXPECT_THAT(index_processor_->IndexDocument(document, doc_id), IsOk());
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
 }
 
@@ -768,6 +838,10 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) {
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPrefixedProperty), kIpsumText)
          .Build();
+  ICING_ASSERT_OK_AND_ASSIGN(
+      TokenizedDocument tokenized_document,
+      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+                                document));
 
   // 2. Recreate the index with the mock filesystem and a merge size that will
   // only allow one document to be added before requiring a merge.
@@ -784,25 +858,26 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) {
 
   ICING_ASSERT_OK_AND_ASSIGN(
       index_processor_,
-      IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
-                             normalizer_.get(), index_.get(), processor_options,
+      IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options,
                              &fake_clock_));
 
   // 3. Index one document. This should fit in the LiteIndex without requiring a
   // merge.
   DocumentId doc_id = 0;
-  EXPECT_THAT(index_processor_->IndexDocument(document, doc_id), IsOk());
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
 
   // 4. Add one more document to trigger a merge, which should fail and result
   // in a Reset.
   ++doc_id;
-  EXPECT_THAT(index_processor_->IndexDocument(document, doc_id),
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
              StatusIs(libtextclassifier3::StatusCode::DATA_LOSS));
   EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
 
   // 5. Indexing a new document should succeed.
-  EXPECT_THAT(index_processor_->IndexDocument(document, doc_id), IsOk());
+  EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+              IsOk());
   EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
 }
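The two merge tests above pin down the recovery contract: a failed merge surfaces as DATA_LOSS, the index is reset (last_added_document_id() reverts to kInvalidDocumentId), and indexing can continue afterwards. A hedged caller-side sketch of reacting to that contract, assuming libtextclassifier3::Status exposes CanonicalCode() as it does elsewhere in this codebase (re-indexing strategy not shown):

    libtextclassifier3::Status status =
        index_processor->IndexDocument(tokenized_document, document_id);
    if (status.CanonicalCode() == libtextclassifier3::StatusCode::DATA_LOSS) {
      // The merge failed and the index was reset: previously indexed content
      // is gone and would have to be replayed from the document store.
    }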
diff --git a/icing/index/iterator/doc-hit-info-iterator-and.h b/icing/index/iterator/doc-hit-info-iterator-and.h
index 4618fb9..faca785 100644
--- a/icing/index/iterator/doc-hit-info-iterator-and.h
+++ b/icing/index/iterator/doc-hit-info-iterator-and.h
@@ -46,6 +46,16 @@ class DocHitInfoIteratorAnd : public DocHitInfoIterator {
 
   std::string ToString() const override;
 
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo> *matched_terms_stats) const override {
+    if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+      // Current hit isn't valid, return.
+      return;
+    }
+    short_->PopulateMatchedTermsStats(matched_terms_stats);
+    long_->PopulateMatchedTermsStats(matched_terms_stats);
+  }
+
  private:
   std::unique_ptr<DocHitInfoIterator> short_;
   std::unique_ptr<DocHitInfoIterator> long_;
@@ -67,6 +77,17 @@ class DocHitInfoIteratorAndNary : public DocHitInfoIterator {
 
   std::string ToString() const override;
 
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo> *matched_terms_stats) const override {
+    if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+      // Current hit isn't valid, return.
+      return;
+    }
+    for (size_t i = 0; i < iterators_.size(); ++i) {
+      iterators_.at(i)->PopulateMatchedTermsStats(matched_terms_stats);
+    }
+  }
+
  private:
   std::vector<std::unique_ptr<DocHitInfoIterator>> iterators_;
 };
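The new PopulateMatchedTermsStats overrides share one pattern: with no valid current hit the output is left untouched, otherwise every child appends its term's stats (under AND semantics, all children matched the current document by construction). A simplified self-contained sketch of that composite pattern, with stand-in types rather than the real iterator interface:

    #include <memory>
    #include <string>
    #include <vector>

    struct TermMatchInfo {
      std::string term;
      // section mask and per-section frequencies omitted for brevity
    };

    class IteratorSketch {
     public:
      virtual ~IteratorSketch() = default;
      virtual void PopulateMatchedTermsStats(
          std::vector<TermMatchInfo>* stats) const = 0;
    };

    class AndIteratorSketch : public IteratorSketch {
     public:
      void PopulateMatchedTermsStats(
          std::vector<TermMatchInfo>* stats) const override {
        if (!has_valid_hit_) return;  // nothing matched yet: leave stats empty
        for (const auto& child : children_) {
          // AND: every child matched the current document, so every child
          // contributes its term's stats.
          child->PopulateMatchedTermsStats(stats);
        }
      }

     private:
      bool has_valid_hit_ = false;
      std::vector<std::unique_ptr<IteratorSketch>> children_;
    };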
diff --git a/icing/index/iterator/doc-hit-info-iterator-and_test.cc b/icing/index/iterator/doc-hit-info-iterator-and_test.cc
index 35574b7..783e937 100644
--- a/icing/index/iterator/doc-hit-info-iterator-and_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-and_test.cc
@@ -32,8 +32,10 @@ namespace lib {
 namespace {
 
 using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
 using ::testing::Eq;
 using ::testing::IsEmpty;
+using ::testing::SizeIs;
 
 TEST(CreateAndIteratorTest, And) {
   // Basic test that we can create a working And iterator. Further testing of
@@ -196,6 +198,125 @@ TEST(DocHitInfoIteratorAndTest, SectionIdMask) {
   EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
 }
 
+TEST(DocHitInfoIteratorAndTest, PopulateMatchedTermsStats) {
+  {
+    // Arbitrary section ids for the documents in the DocHitInfoIterators.
+    // Created to test correct section_id_mask behavior.
+    SectionIdMask section_id_mask1 = 0b01010101;  // hits in sections 0, 2, 4, 6
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+        1, 0, 2, 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0};
+    SectionIdMask section_id_mask2 = 0b00000110;  // hits in sections 1, 2
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2{
+        0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+    DocHitInfo doc_hit_info1 = DocHitInfo(4);
+    doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+    doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+    doc_hit_info1.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/3);
+    doc_hit_info1.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+
+    DocHitInfo doc_hit_info2 = DocHitInfo(4);
+    doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+    doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+    std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+    std::vector<DocHitInfo> second_vector = {doc_hit_info2};
+
+    auto first_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+    first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+    auto second_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+    second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+
+    DocHitInfoIteratorAnd and_iter(std::move(first_iter),
+                                   std::move(second_iter));
+    std::vector<TermMatchInfo> matched_terms_stats;
+    and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+    ICING_EXPECT_OK(and_iter.Advance());
+    EXPECT_THAT(and_iter.doc_hit_info().document_id(), Eq(4));
+
+    and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    ASSERT_THAT(matched_terms_stats, SizeIs(2));  // 2 terms
+    EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+    EXPECT_EQ(matched_terms_stats.at(1).term, "hello");
+    EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+                ElementsAreArray(term_frequencies1));
+    EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1);
+    EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+                ElementsAreArray(term_frequencies2));
+    EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask2);
+
+    EXPECT_FALSE(and_iter.Advance().ok());
+  }
+  {
+    // Arbitrary section ids for the documents in the DocHitInfoIterators.
+    // Created to test correct section_id_mask behavior.
+    SectionIdMask section_id_mask1 = 0b00000101;  // hits in sections 0, 2
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+        1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+    DocHitInfo doc_hit_info1 = DocHitInfo(4);
+    doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+    doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+
+    std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+    std::vector<DocHitInfo> second_vector = {doc_hit_info1};
+
+    auto first_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+    first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+    auto second_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hi");
+    second_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+    DocHitInfoIteratorAnd and_iter(std::move(first_iter),
+                                   std::move(second_iter));
+    std::vector<TermMatchInfo> matched_terms_stats;
+    and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+    ICING_EXPECT_OK(and_iter.Advance());
+    EXPECT_THAT(and_iter.doc_hit_info().document_id(), Eq(4));
+
+    and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    ASSERT_THAT(matched_terms_stats, SizeIs(1));  // 1 term
+    EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+    EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+                ElementsAreArray(term_frequencies1));
+    EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1);
+
+    EXPECT_FALSE(and_iter.Advance().ok());
+  }
+}
+
+TEST(DocHitInfoIteratorAndTest, PopulateMatchedTermsStats_NoMatchingDocument) {
+  DocHitInfo doc_hit_info1 = DocHitInfo(4);
+  doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+
+  DocHitInfo doc_hit_info2 = DocHitInfo(5);
+  doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+  doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+  std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+  std::vector<DocHitInfo> second_vector = {doc_hit_info2};
+
+  auto first_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+  auto second_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+
+  DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter));
+  std::vector<TermMatchInfo> matched_terms_stats;
+  and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+  EXPECT_THAT(matched_terms_stats, IsEmpty());
+  EXPECT_FALSE(and_iter.Advance().ok());
+}
+
 TEST(DocHitInfoIteratorAndNaryTest, Initialize) {
   std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
   iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>());
@@ -345,6 +466,90 @@ TEST(DocHitInfoIteratorAndNaryTest, SectionIdMask) {
   EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
 }
 
+TEST(DocHitInfoIteratorAndNaryTest, PopulateMatchedTermsStats) {
+  // Arbitrary section ids/term frequencies for the documents in the
+  // DocHitInfoIterators.
+ // For term "hi", document 10 and 8 + SectionIdMask section_id_mask1_hi = 0b01000101; // hits in sections 0, 2, 6 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hi{ + 1, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info1_hi = DocHitInfo(10); + doc_hit_info1_hi.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1); + doc_hit_info1_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2); + doc_hit_info1_hi.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4); + + DocHitInfo doc_hit_info2_hi = DocHitInfo(8); + doc_hit_info2_hi.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2); + doc_hit_info2_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6); + + // For term "hello", document 10 and 9 + SectionIdMask section_id_mask1_hello = 0b00001001; // hits in sections 0, 3 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hello{ + 2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info1_hello = DocHitInfo(10); + doc_hit_info1_hello.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2); + doc_hit_info1_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3); + + DocHitInfo doc_hit_info2_hello = DocHitInfo(9); + doc_hit_info2_hello.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/3); + doc_hit_info2_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/2); + + // For term "ciao", document 10 and 9 + SectionIdMask section_id_mask1_ciao = 0b00000011; // hits in sections 0, 1 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_ciao{ + 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info1_ciao = DocHitInfo(10); + doc_hit_info1_ciao.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2); + doc_hit_info1_ciao.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/3); + + DocHitInfo doc_hit_info2_ciao = DocHitInfo(9); + doc_hit_info2_ciao.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3); + doc_hit_info2_ciao.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/2); + + std::vector<DocHitInfo> first_vector = {doc_hit_info1_hi, doc_hit_info2_hi}; + std::vector<DocHitInfo> second_vector = {doc_hit_info1_hello, + doc_hit_info2_hello}; + std::vector<DocHitInfo> third_vector = {doc_hit_info1_ciao, + doc_hit_info2_ciao}; + + auto first_iter = + std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi"); + auto second_iter = + std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello"); + auto third_iter = + std::make_unique<DocHitInfoIteratorDummy>(third_vector, "ciao"); + + std::vector<std::unique_ptr<DocHitInfoIterator>> iterators; + iterators.push_back(std::move(first_iter)); + iterators.push_back(std::move(second_iter)); + iterators.push_back(std::move(third_iter)); + + DocHitInfoIteratorAndNary and_iter(std::move(iterators)); + std::vector<TermMatchInfo> matched_terms_stats; + and_iter.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); + + ICING_EXPECT_OK(and_iter.Advance()); + EXPECT_THAT(and_iter.doc_hit_info().document_id(), Eq(10)); + + and_iter.PopulateMatchedTermsStats(&matched_terms_stats); + ASSERT_THAT(matched_terms_stats, SizeIs(3)); // 3 terms + EXPECT_EQ(matched_terms_stats.at(0).term, "hi"); + EXPECT_THAT(matched_terms_stats.at(0).term_frequencies, + ElementsAreArray(term_frequencies1_hi)); + EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1_hi); + EXPECT_EQ(matched_terms_stats.at(1).term, "hello"); + EXPECT_THAT(matched_terms_stats.at(1).term_frequencies, 
+              ElementsAreArray(term_frequencies1_hello));
+  EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask1_hello);
+  EXPECT_EQ(matched_terms_stats.at(2).term, "ciao");
+  EXPECT_THAT(matched_terms_stats.at(2).term_frequencies,
+              ElementsAreArray(term_frequencies1_ciao));
+  EXPECT_EQ(matched_terms_stats.at(2).section_ids_mask, section_id_mask1_ciao);
+
+  EXPECT_FALSE(and_iter.Advance().ok());
+}
+
 }  // namespace
 
 }  // namespace lib
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.h b/icing/index/iterator/doc-hit-info-iterator-filter.h
index 9119610..fb60e38 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter.h
+++ b/icing/index/iterator/doc-hit-info-iterator-filter.h
@@ -67,6 +67,11 @@ class DocHitInfoIteratorFilter : public DocHitInfoIterator {
 
   std::string ToString() const override;
 
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo>* matched_terms_stats) const override {
+    delegate_->PopulateMatchedTermsStats(matched_terms_stats);
+  }
+
  private:
   std::unique_ptr<DocHitInfoIterator> delegate_;
   const DocumentStore& document_store_;
diff --git a/icing/index/iterator/doc-hit-info-iterator-or.cc b/icing/index/iterator/doc-hit-info-iterator-or.cc
index 8f00f88..b4234e0 100644
--- a/icing/index/iterator/doc-hit-info-iterator-or.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-or.cc
@@ -108,6 +108,7 @@ libtextclassifier3::Status DocHitInfoIteratorOr::Advance() {
   } else {
     chosen = left_.get();
   }
+  current_ = chosen;
 
   doc_hit_info_ = chosen->doc_hit_info();
   hit_intersect_section_ids_mask_ = chosen->hit_intersect_section_ids_mask();
@@ -139,6 +140,7 @@ DocHitInfoIteratorOrNary::DocHitInfoIteratorOrNary(
     : iterators_(std::move(iterators)) {}
 
 libtextclassifier3::Status DocHitInfoIteratorOrNary::Advance() {
+  current_iterators_.clear();
   if (iterators_.size() < 2) {
     return absl_ports::InvalidArgumentError(
         "Not enough iterators to OR together");
@@ -187,6 +189,7 @@ libtextclassifier3::Status DocHitInfoIteratorOrNary::Advance() {
   hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
   for (const auto& iterator : iterators_) {
     if (iterator->doc_hit_info().document_id() == next_document_id) {
+      current_iterators_.push_back(iterator.get());
       if (doc_hit_info_.document_id() == kInvalidDocumentId) {
         doc_hit_info_ = iterator->doc_hit_info();
         hit_intersect_section_ids_mask_ =
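The Advance() edits above exist because OR differs from AND: only the child iterators currently positioned on the chosen document may report term stats, so each Advance() records that set (current_ in the binary case, current_iterators_ in the n-ary case). A self-contained sketch of the n-ary bookkeeping, with stand-in types:

    #include <vector>

    struct ChildSketch {
      int document_id;
    };

    // Non-owning pointers to this round's matching children, mirroring
    // current_iterators_ in DocHitInfoIteratorOrNary::Advance().
    std::vector<const ChildSketch*> CollectCurrent(
        const std::vector<ChildSketch>& children, int next_document_id) {
      std::vector<const ChildSketch*> current;
      for (const ChildSketch& child : children) {
        if (child.document_id == next_document_id) {
          current.push_back(&child);  // this child matched; it may report stats
        }
      }
      return current;
    }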
diff --git a/icing/index/iterator/doc-hit-info-iterator-or.h b/icing/index/iterator/doc-hit-info-iterator-or.h
index 4128e0f..2f49430 100644
--- a/icing/index/iterator/doc-hit-info-iterator-or.h
+++ b/icing/index/iterator/doc-hit-info-iterator-or.h
@@ -42,9 +42,26 @@ class DocHitInfoIteratorOr : public DocHitInfoIterator {
 
   std::string ToString() const override;
 
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo> *matched_terms_stats) const override {
+    if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+      // Current hit isn't valid, return.
+      return;
+    }
+    current_->PopulateMatchedTermsStats(matched_terms_stats);
+    // If equal, then current_ == left_. Combine with results from right_.
+    if (left_document_id_ == right_document_id_) {
+      right_->PopulateMatchedTermsStats(matched_terms_stats);
+    }
+  }
+
  private:
   std::unique_ptr<DocHitInfoIterator> left_;
   std::unique_ptr<DocHitInfoIterator> right_;
+  // Pointer to the chosen iterator that points to the current doc_hit_info_. If
+  // both left_ and right_ point to the same docid, then current_ == left_.
+  // current_ does not own the iterator it points to.
+  DocHitInfoIterator *current_;
   DocumentId left_document_id_ = kMaxDocumentId;
   DocumentId right_document_id_ = kMaxDocumentId;
 };
@@ -65,8 +82,22 @@ class DocHitInfoIteratorOrNary : public DocHitInfoIterator {
 
   std::string ToString() const override;
 
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo> *matched_terms_stats) const override {
+    if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+      // Current hit isn't valid, return.
+      return;
+    }
+    for (size_t i = 0; i < current_iterators_.size(); i++) {
+      current_iterators_.at(i)->PopulateMatchedTermsStats(matched_terms_stats);
+    }
+  }
+
  private:
   std::vector<std::unique_ptr<DocHitInfoIterator>> iterators_;
+  // Pointers to the iterators that point to the current doc_hit_info_.
+  // current_iterators_ does not own the iterators it points to.
+  std::vector<DocHitInfoIterator *> current_iterators_;
 };
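A side note on the new members: the composite owns its children through unique_ptr, while current_/current_iterators_ hold non-owning raw pointers that are refreshed on every Advance(), so they stay valid for as long as the composite itself does. A minimal sketch of that ownership pattern, with stand-in types:

    #include <memory>
    #include <utility>

    struct Child {
      int doc_id = -1;
    };

    class OrSketch {
     public:
      OrSketch(std::unique_ptr<Child> left, std::unique_ptr<Child> right)
          : left_(std::move(left)), right_(std::move(right)) {}

      void Advance(int next_doc_id) {
        // Re-point current_ each round; it never outlives left_/right_.
        current_ = (left_->doc_id == next_doc_id) ? left_.get() : right_.get();
      }

     private:
      std::unique_ptr<Child> left_, right_;
      Child* current_ = nullptr;  // non-owning, refreshed by Advance()
    };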
diff --git a/icing/index/iterator/doc-hit-info-iterator-or_test.cc b/icing/index/iterator/doc-hit-info-iterator-or_test.cc
index 3faa5ab..3f00a39 100644
--- a/icing/index/iterator/doc-hit-info-iterator-or_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-or_test.cc
@@ -32,7 +32,10 @@ namespace lib {
 namespace {
 
 using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
 using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
 
 TEST(CreateAndIteratorTest, Or) {
   // Basic test that we can create a working Or iterator. Further testing of
@@ -175,6 +178,159 @@ TEST(DocHitInfoIteratorOrTest, SectionIdMask) {
   EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
 }
 
+TEST(DocHitInfoIteratorOrTest, PopulateMatchedTermsStats) {
+  {
+    // Arbitrary section ids for the documents in the DocHitInfoIterators.
+    // Created to test correct section_id_mask behavior.
+    SectionIdMask section_id_mask1 = 0b01010101;  // hits in sections 0, 2, 4, 6
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+        1, 0, 2, 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0};
+    SectionIdMask section_id_mask2 = 0b00000110;  // hits in sections 1, 2
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2{
+        0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+    DocHitInfo doc_hit_info1 = DocHitInfo(4);
+    doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+    doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+    doc_hit_info1.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/3);
+    doc_hit_info1.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+
+    DocHitInfo doc_hit_info2 = DocHitInfo(4);
+    doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+    doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+    std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+    std::vector<DocHitInfo> second_vector = {doc_hit_info2};
+
+    auto first_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+    first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+    auto second_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+    second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+
+    DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+    std::vector<TermMatchInfo> matched_terms_stats;
+    or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+    ICING_EXPECT_OK(or_iter.Advance());
+    EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(4));
+
+    or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    ASSERT_THAT(matched_terms_stats, SizeIs(2));  // 2 terms
+    EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+    EXPECT_EQ(matched_terms_stats.at(1).term, "hello");
+    EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+                ElementsAreArray(term_frequencies1));
+    EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1);
+    EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+                ElementsAreArray(term_frequencies2));
+    EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask2);
+
+    EXPECT_FALSE(or_iter.Advance().ok());
+  }
+  {
+    // Arbitrary section ids for the documents in the DocHitInfoIterators.
+    // Created to test correct section_id_mask behavior.
+    SectionIdMask section_id_mask1 = 0b00000101;  // hits in sections 0, 2
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+        1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+    DocHitInfo doc_hit_info1 = DocHitInfo(4);
+    doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+    doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+
+    std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+    std::vector<DocHitInfo> second_vector = {doc_hit_info1};
+
+    auto first_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+    first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+    auto second_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hi");
+    second_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+    DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+    std::vector<TermMatchInfo> matched_terms_stats;
+    or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+    ICING_EXPECT_OK(or_iter.Advance());
+    EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(4));
+
+    or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    ASSERT_THAT(matched_terms_stats, SizeIs(1));  // 1 term
+    EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+    EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+                ElementsAreArray(term_frequencies1));
+    EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1);
+
+    EXPECT_FALSE(or_iter.Advance().ok());
+  }
+  {
+    // Arbitrary section ids for the documents in the DocHitInfoIterators.
+    // Created to test correct section_id_mask behavior.
+    SectionIdMask section_id_mask1 = 0b01010101;  // hits in sections 0, 2, 4, 6
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1{
+        1, 0, 2, 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0};
+    SectionIdMask section_id_mask2 = 0b00000110;  // hits in sections 1, 2
+    std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2{
+        0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+    DocHitInfo doc_hit_info1 = DocHitInfo(4);
+    doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+    doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+    doc_hit_info1.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/3);
+    doc_hit_info1.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+
+    DocHitInfo doc_hit_info2 = DocHitInfo(5);
+    doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+    doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+    std::vector<DocHitInfo> first_vector = {doc_hit_info1};
+    std::vector<DocHitInfo> second_vector = {doc_hit_info2};
+
+    auto first_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+    first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+
+    auto second_iter =
+        std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+    second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+
+    DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+    std::vector<TermMatchInfo> matched_terms_stats;
+    or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+    ICING_EXPECT_OK(or_iter.Advance());
+    EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(5));
+
+    or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+    ASSERT_THAT(matched_terms_stats, SizeIs(1));  // 1 term
+    EXPECT_EQ(matched_terms_stats.at(0).term, "hello");
EXPECT_THAT(matched_terms_stats.at(0).term_frequencies, + ElementsAreArray(term_frequencies2)); + EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask2); + + ICING_EXPECT_OK(or_iter.Advance()); + EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(4)); + + matched_terms_stats.clear(); + or_iter.PopulateMatchedTermsStats(&matched_terms_stats); + ASSERT_THAT(matched_terms_stats, SizeIs(1)); // 1 term + EXPECT_EQ(matched_terms_stats.at(0).term, "hi"); + EXPECT_THAT(matched_terms_stats.at(0).term_frequencies, + ElementsAreArray(term_frequencies1)); + EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1); + + EXPECT_FALSE(or_iter.Advance().ok()); + } +} + TEST(DocHitInfoIteratorOrNaryTest, Initialize) { std::vector<std::unique_ptr<DocHitInfoIterator>> iterators; iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>()); @@ -316,6 +472,125 @@ TEST(DocHitInfoIteratorOrNaryTest, SectionIdMask) { EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result)); } +TEST(DocHitInfoIteratorOrNaryTest, PopulateMatchedTermsStats) { + // Arbitrary section ids/term frequencies for the documents in the + // DocHitInfoIterators. + // For term "hi", document 10 and 8 + SectionIdMask section_id_mask1_hi = 0b01000101; // hits in sections 0, 2, 6 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hi{ + 1, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info1_hi = DocHitInfo(10); + doc_hit_info1_hi.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1); + doc_hit_info1_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2); + doc_hit_info1_hi.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4); + + SectionIdMask section_id_mask2_hi = 0b00000110; // hits in sections 1, 2 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2_hi{ + 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info2_hi = DocHitInfo(8); + doc_hit_info2_hi.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2); + doc_hit_info2_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6); + + // For term "hello", document 10 and 9 + SectionIdMask section_id_mask1_hello = 0b00001001; // hits in sections 0, 3 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hello{ + 2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info1_hello = DocHitInfo(10); + doc_hit_info1_hello.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2); + doc_hit_info1_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3); + + SectionIdMask section_id_mask2_hello = 0b00001100; // hits in sections 2, 3 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2_hello{ + 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info2_hello = DocHitInfo(9); + doc_hit_info2_hello.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/3); + doc_hit_info2_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/2); + + // For term "ciao", document 9 and 8 + SectionIdMask section_id_mask1_ciao = 0b00000011; // hits in sections 0, 1 + std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_ciao{ + 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + DocHitInfo doc_hit_info1_ciao = DocHitInfo(9); + doc_hit_info1_ciao.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2); + doc_hit_info1_ciao.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/3); + + SectionIdMask section_id_mask2_ciao = 0b00011000; // hits in sections 3, 4 + std::array<Hit::TermFrequency, kMaxSectionId> 
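The cases above all exercise the same protocol: Advance() moves the iterator to the next (lower) document id, and only then does PopulateMatchedTermsStats() report stats for the current document. A minimal caller-side sketch of that loop (illustrative only, not part of this change; it assumes an already-constructed or_iter like the ones in the tests):

    // Drain an OR iterator, inspecting per-term stats for each document.
    std::vector<TermMatchInfo> stats;
    while (or_iter.Advance().ok()) {
      stats.clear();
      or_iter.PopulateMatchedTermsStats(&stats);
      for (const TermMatchInfo& info : stats) {
        // info.term hit the current document in the sections set in
        // info.section_ids_mask; info.term_frequencies[s] holds its count
        // in section s.
      }
    }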
+
 TEST(DocHitInfoIteratorOrNaryTest, Initialize) {
   std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
   iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>());
@@ -316,6 +472,125 @@ TEST(DocHitInfoIteratorOrNaryTest, SectionIdMask) {
   EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
 }
 
+TEST(DocHitInfoIteratorOrNaryTest, PopulateMatchedTermsStats) {
+  // Arbitrary section ids/term frequencies for the documents in the
+  // DocHitInfoIterators.
+  // For term "hi", documents 10 and 8
+  SectionIdMask section_id_mask1_hi = 0b01000101;  // hits in sections 0, 2, 6
+  std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hi{
+      1, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0};
+  DocHitInfo doc_hit_info1_hi = DocHitInfo(10);
+  doc_hit_info1_hi.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+  doc_hit_info1_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+  doc_hit_info1_hi.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+
+  SectionIdMask section_id_mask2_hi = 0b00000110;  // hits in sections 1, 2
+  std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2_hi{
+      0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  DocHitInfo doc_hit_info2_hi = DocHitInfo(8);
+  doc_hit_info2_hi.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+  doc_hit_info2_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+  // For term "hello", documents 10 and 9
+  SectionIdMask section_id_mask1_hello = 0b00001001;  // hits in sections 0, 3
+  std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_hello{
+      2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  DocHitInfo doc_hit_info1_hello = DocHitInfo(10);
+  doc_hit_info1_hello.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2);
+  doc_hit_info1_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3);
+
+  SectionIdMask section_id_mask2_hello = 0b00001100;  // hits in sections 2, 3
+  std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2_hello{
+      0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  DocHitInfo doc_hit_info2_hello = DocHitInfo(9);
+  doc_hit_info2_hello.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/3);
+  doc_hit_info2_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/2);
+
+  // For term "ciao", documents 9 and 8
+  SectionIdMask section_id_mask1_ciao = 0b00000011;  // hits in sections 0, 1
+  std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies1_ciao{
+      2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  DocHitInfo doc_hit_info1_ciao = DocHitInfo(9);
+  doc_hit_info1_ciao.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2);
+  doc_hit_info1_ciao.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/3);
+
+  SectionIdMask section_id_mask2_ciao = 0b00011000;  // hits in sections 3, 4
+  std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies2_ciao{
+      0, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  DocHitInfo doc_hit_info2_ciao = DocHitInfo(8);
+  doc_hit_info2_ciao.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3);
+  doc_hit_info2_ciao.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/2);
+
+  std::vector<DocHitInfo> first_vector = {doc_hit_info1_hi, doc_hit_info2_hi};
+  std::vector<DocHitInfo> second_vector = {doc_hit_info1_hello,
+                                           doc_hit_info2_hello};
+  std::vector<DocHitInfo> third_vector = {doc_hit_info1_ciao,
+                                          doc_hit_info2_ciao};
+
+  auto first_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+  auto second_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+  auto third_iter =
+      std::make_unique<DocHitInfoIteratorDummy>(third_vector, "ciao");
+
+  std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+  iterators.push_back(std::move(first_iter));
+  iterators.push_back(std::move(second_iter));
+  iterators.push_back(std::move(third_iter));
+
+  DocHitInfoIteratorOrNary or_iter(std::move(iterators));
+  std::vector<TermMatchInfo> matched_terms_stats;
+  or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+  EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+  ICING_EXPECT_OK(or_iter.Advance());
+  EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(10));
+
+  or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+  ASSERT_THAT(matched_terms_stats, SizeIs(2));  // 2 terms
+  EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+  EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+              ElementsAreArray(term_frequencies1_hi));
+  EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask1_hi);
+  EXPECT_EQ(matched_terms_stats.at(1).term, "hello");
+  EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+              ElementsAreArray(term_frequencies1_hello));
+  EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask1_hello);
+
+  ICING_EXPECT_OK(or_iter.Advance());
+  EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(9));
+
+  matched_terms_stats.clear();
+  or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+  ASSERT_THAT(matched_terms_stats, SizeIs(2));  // 2 terms
+  EXPECT_EQ(matched_terms_stats.at(0).term, "hello");
+  EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+              ElementsAreArray(term_frequencies2_hello));
+  EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask2_hello);
+  EXPECT_EQ(matched_terms_stats.at(1).term, "ciao");
+  EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+              ElementsAreArray(term_frequencies1_ciao));
+  EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask1_ciao);
+
+  ICING_EXPECT_OK(or_iter.Advance());
+  EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(8));
+
+  matched_terms_stats.clear();
+  or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+  ASSERT_THAT(matched_terms_stats, SizeIs(2));  // 2 terms
+  EXPECT_EQ(matched_terms_stats.at(0).term, "hi");
+  EXPECT_THAT(matched_terms_stats.at(0).term_frequencies,
+              ElementsAreArray(term_frequencies2_hi));
+  EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, section_id_mask2_hi);
+  EXPECT_EQ(matched_terms_stats.at(1).term, "ciao");
+  EXPECT_THAT(matched_terms_stats.at(1).term_frequencies,
+              ElementsAreArray(term_frequencies2_ciao));
+  EXPECT_EQ(matched_terms_stats.at(1).section_ids_mask, section_id_mask2_ciao);
+
+  EXPECT_FALSE(or_iter.Advance().ok());
+}
+
 }  // namespace
 
 }  // namespace lib
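Reading these fixtures: bit i of a SectionIdMask corresponds to section id i, so 0b01000101 marks sections 0, 2, and 6, and each expected term_frequencies array places a term's count at the index of its section. A small self-checking sketch of that correspondence (illustrative only; it assumes kMaxSectionId is 15, as the 15-element arrays in these tests imply):

    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
      constexpr int kMaxSectionId = 15;  // assumption, matching the tests
      uint16_t mask = 0b01000101;        // hits in sections 0, 2, 6
      std::array<uint8_t, kMaxSectionId> tf{1, 0, 2, 0, 0, 0, 4, 0,
                                            0, 0, 0, 0, 0, 0, 0};
      for (int id = 0; id < kMaxSectionId; ++id) {
        bool in_mask = (mask >> id) & 1;
        assert(in_mask == (tf[id] != 0));  // a count is present iff the bit is set
      }
      return 0;
    }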
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h
index ae5a896..ba74384 100644
--- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h
@@ -52,6 +52,15 @@ class DocHitInfoIteratorSectionRestrict : public DocHitInfoIterator {
 
   std::string ToString() const override;
 
+  // NOTE: currently, the section restrict decides which documents to
+  // return, but doesn't impact the relevance score of a document.
+  // TODO(b/173156803): decide whether we want to filter the
+  // matched_terms_stats for the restricted sections.
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo>* matched_terms_stats) const override {
+    delegate_->PopulateMatchedTermsStats(matched_terms_stats);
+  }
+
  private:
   std::unique_ptr<DocHitInfoIterator> delegate_;
   const DocumentStore& document_store_;
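Concretely, the pass-through above means a section-restricted query filters which documents come back while the reported stats still cover every section the delegate saw. A hypothetical illustration (construction of the iterator is elided; names follow the header above):

    // Hypothetical: `restricted` is some DocHitInfoIteratorSectionRestrict
    // limiting hits to a single section (construction elided).
    if (restricted->Advance().ok()) {
      std::vector<TermMatchInfo> stats;
      restricted->PopulateMatchedTermsStats(&stats);
      // Because the call is delegated unfiltered, stats[i].section_ids_mask
      // may still include sections outside the restrict; only document
      // selection honored the filter (see TODO(b/173156803)).
    }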
diff --git a/icing/index/iterator/doc-hit-info-iterator-test-util.h b/icing/index/iterator/doc-hit-info-iterator-test-util.h
index c4d7aa7..913696a 100644
--- a/icing/index/iterator/doc-hit-info-iterator-test-util.h
+++ b/icing/index/iterator/doc-hit-info-iterator-test-util.h
@@ -15,7 +15,6 @@
 #ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TEST_UTIL_H_
 #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TEST_UTIL_H_
 
-#include <cstdint>
 #include <string>
 #include <utility>
 #include <vector>
@@ -40,8 +39,9 @@ namespace lib {
 class DocHitInfoIteratorDummy : public DocHitInfoIterator {
  public:
   DocHitInfoIteratorDummy() = default;
-  explicit DocHitInfoIteratorDummy(std::vector<DocHitInfo> doc_hit_infos)
-      : doc_hit_infos_(std::move(doc_hit_infos)) {}
+  explicit DocHitInfoIteratorDummy(std::vector<DocHitInfo> doc_hit_infos,
+                                   std::string term = "")
+      : doc_hit_infos_(std::move(doc_hit_infos)), term_(std::move(term)) {}
 
   libtextclassifier3::Status Advance() override {
     if (index_ < doc_hit_infos_.size()) {
@@ -54,6 +54,36 @@ class DocHitInfoIteratorDummy : public DocHitInfoIterator {
                                         "No more DocHitInfos in iterator");
   }
 
+  // Imitates behavior of DocHitInfoIteratorTermMain/DocHitInfoIteratorTermLite
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo>* matched_terms_stats) const override {
+    if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+      // Current hit isn't valid; return.
+      return;
+    }
+    SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask();
+    std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = {
+        Hit::kNoTermFrequency};
+
+    while (section_mask) {
+      SectionId section_id = __builtin_ctz(section_mask);
+      section_term_frequencies.at(section_id) =
+          doc_hit_info_.hit_term_frequency(section_id);
+      section_mask &= ~(1u << section_id);
+    }
+    TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(),
+                             section_term_frequencies);
+
+    for (auto& cur_term_stats : *matched_terms_stats) {
+      if (cur_term_stats.term == term_stats.term) {
+        // Same docId and same term: no need to add the term again, and the
+        // term frequency should always be the same.
+        return;
+      }
+    }
+    matched_terms_stats->push_back(term_stats);
+  }
+
   void set_hit_intersect_section_ids_mask(
       SectionIdMask hit_intersect_section_ids_mask) {
     hit_intersect_section_ids_mask_ = hit_intersect_section_ids_mask;
@@ -91,6 +121,7 @@ class DocHitInfoIteratorDummy : public DocHitInfoIterator {
   int32_t num_blocks_inspected_ = 0;
   int32_t num_leaf_advance_calls_ = 0;
   std::vector<DocHitInfo> doc_hit_infos_;
+  std::string term_;
 };
 
 inline std::vector<DocumentId> GetDocumentIds(DocHitInfoIterator* iterator) {
diff --git a/icing/index/iterator/doc-hit-info-iterator.h b/icing/index/iterator/doc-hit-info-iterator.h
index bcc2b6e..c4d9901 100644
--- a/icing/index/iterator/doc-hit-info-iterator.h
+++ b/icing/index/iterator/doc-hit-info-iterator.h
@@ -17,6 +17,7 @@
 
 #include <cstdint>
 #include <string>
+#include <string_view>
 
 #include "icing/text_classifier/lib3/utils/base/status.h"
 #include "icing/text_classifier/lib3/utils/base/statusor.h"
@@ -28,6 +29,26 @@
 namespace icing {
 namespace lib {
 
+// Data structure that maps a single matched query term to its section mask
+// and the list of term frequencies.
+// TODO(b/158603837): add stat on whether the matched terms are prefix matched
+// or not. This information will be used to boost exact match.
+struct TermMatchInfo {
+  std::string_view term;
+  // SectionIdMask associated to the term.
+  SectionIdMask section_ids_mask;
+  // Array with fixed size kMaxSectionId. For every section id (the array
+  // index), it stores the term frequency of the term in that section.
+  std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies;
+
+  explicit TermMatchInfo(
+      std::string_view term, SectionIdMask section_ids_mask,
+      std::array<Hit::TermFrequency, kMaxSectionId> term_frequencies)
+      : term(term),
+        section_ids_mask(section_ids_mask),
+        term_frequencies(std::move(term_frequencies)) {}
+};
+
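A concrete reading of the struct: term is a non-owning string_view into the iterator's term, so a TermMatchInfo must not outlive the iterator that produced it. A minimal construction sketch (values borrowed from the tests earlier in this change):

    // Stats for "hello" hitting section 2 six times (mask bit 2 set).
    std::array<Hit::TermFrequency, kMaxSectionId> tf = {};  // all kNoTermFrequency
    tf.at(2) = 6;
    TermMatchInfo info("hello", /*section_ids_mask=*/0b00000100, tf);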
 // Iterator over DocHitInfos (collapsed Hits) in REVERSE document_id order.
 //
 // NOTE: You must call Advance() before calling hit_info() or
@@ -70,6 +91,14 @@ class DocHitInfoIterator {
   // A string representing the iterator.
   virtual std::string ToString() const = 0;
 
+  // For the last hit docid, retrieves all the matched query terms and other
+  // stats; see TermMatchInfo.
+  // If Advance() wasn't called after construction, if the last Advance()
+  // returned false, or if the concrete HitIterator didn't override this
+  // method, the vector isn't populated.
+  virtual void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo>* matched_terms_stats) const {}
+
  protected:
   DocHitInfo doc_hit_info_;
   SectionIdMask hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h
index bd2de6d..ac5e97f 100644
--- a/icing/index/lite/doc-hit-info-iterator-term-lite.h
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h
@@ -49,6 +49,34 @@ class DocHitInfoIteratorTermLite : public DocHitInfoIterator {
   }
   int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }
 
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo>* matched_terms_stats) const override {
+    if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+      // Current hit isn't valid; return.
+      return;
+    }
+    SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask();
+    std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = {
+        Hit::kNoTermFrequency};
+    while (section_mask) {
+      SectionId section_id = __builtin_ctz(section_mask);
+      section_term_frequencies.at(section_id) =
+          doc_hit_info_.hit_term_frequency(section_id);
+      section_mask &= ~(1u << section_id);
+    }
+    TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(),
+                             std::move(section_term_frequencies));
+
+    for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) {
+      if (cur_term_stats.term == term_stats.term) {
+        // Same docId and same term: no need to add the term again, and the
+        // term frequency should always be the same.
+        return;
+      }
+    }
+    matched_terms_stats->push_back(std::move(term_stats));
+  }
+
  protected:
   // Add DocHitInfos corresponding to term_ to cached_hits_.
   virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
diff --git a/icing/index/main/doc-hit-info-iterator-term-main.h b/icing/index/main/doc-hit-info-iterator-term-main.h
index 1f77226..d626d7a 100644
--- a/icing/index/main/doc-hit-info-iterator-term-main.h
+++ b/icing/index/main/doc-hit-info-iterator-term-main.h
@@ -49,6 +49,34 @@ class DocHitInfoIteratorTermMain : public DocHitInfoIterator {
   }
   int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }
 
+  void PopulateMatchedTermsStats(
+      std::vector<TermMatchInfo>* matched_terms_stats) const override {
+    if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+      // Current hit isn't valid; return.
+      return;
+    }
+    SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask();
+    std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = {
+        Hit::kNoTermFrequency};
+    while (section_mask) {
+      SectionId section_id = __builtin_ctz(section_mask);
+      section_term_frequencies.at(section_id) =
+          doc_hit_info_.hit_term_frequency(section_id);
+      section_mask &= ~(1u << section_id);
+    }
+    TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(),
+                             std::move(section_term_frequencies));
+
+    for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) {
+      if (cur_term_stats.term == term_stats.term) {
+        // Same docId and same term: no need to add the term again, and the
+        // term frequency should always be the same.
+        return;
+      }
+    }
+    matched_terms_stats->push_back(std::move(term_stats));
+  }
+
  protected:
   // Add DocHitInfos corresponding to term_ to cached_doc_hit_infos_.
   virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
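The loop shared by the three PopulateMatchedTermsStats overrides visits each hit section by repeatedly isolating the lowest set bit of the mask with count-trailing-zeros. The same technique in isolation (a runnable sketch; __builtin_ctz is a GCC/Clang builtin):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t section_mask = 0b01010101;  // hits in sections 0, 2, 4, 6
      while (section_mask) {
        int section_id = __builtin_ctz(section_mask);  // index of lowest set bit
        std::printf("hit in section %d\n", section_id);
        section_mask &= ~(1u << section_id);           // clear it and continue
      }
      return 0;
    }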