author     Tim Barron <tjbarron@google.com>    2022-04-12 14:30:14 -0700
committer  Tim Barron <tjbarron@google.com>    2022-04-12 14:36:38 -0700
commit     d5c9ae94052a0f2f1b9ddec9dbbe502bc4f11d54 (patch)
tree       90b929dc92d5874b5c15caca064401196ab4fc65
parent     beff93fe1f5165aeeb871d9711963aa1846299ae (diff)
download   icing-d5c9ae94052a0f2f1b9ddec9dbbe502bc4f11d54.tar.gz
Sync from upstream.
======================================================================
Refactor DocumentStore::Initialize to improve readability of document store recovery.
======================================================================
Remove non-NDK API usages of ICU4C in libicing.
======================================================================
Move IcuDataFileHelper to the testing directory since it is a test-only util.
======================================================================
Support dump function for DocumentStore
======================================================================
Switch to use PRead rather than MMap in the proto log.
======================================================================
Support dump function for main/lite index and lexicon
======================================================================
Fix LiteIndex::AppendHits
======================================================================
Enable and fix DocumentStoreTest.LoadScoreCacheAndInitializeSuccessfully
======================================================================
Fix MainIndex::GetStorageInfo.
======================================================================
Fix icing-search-engine_fuzz_test by making IcuLanguageSegmenterIterator::Advance non-recursive.
======================================================================
Allow returning additional information for deleted documents in DeleteByQuery
======================================================================
Use enum class in Token::Type for better type safety.
======================================================================
Bug: 158089703
Bug: 185845269
Bug: 209071710
Bug: 211785521
Bug: 218413237
Bug: 223549255
Change-Id: Id2786047ab279734bdd2aee883e82607b6a0e403
57 files changed, 1469 insertions(+), 753 deletions(-)
diff --git a/icing/file/file-backed-bitmap.cc b/icing/file/file-backed-bitmap.cc
index f1e568c..eec7668 100644
--- a/icing/file/file-backed-bitmap.cc
+++ b/icing/file/file-backed-bitmap.cc
@@ -50,7 +50,7 @@ FileBackedBitmap::Create(const Filesystem* filesystem,
   auto bitmap = std::unique_ptr<FileBackedBitmap>(
       new FileBackedBitmap(filesystem, file_path, mmap_strategy));
 
-  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
   // that can support error logging.
   libtextclassifier3::Status status = bitmap->Initialize();
   if (!status.ok()) {
@@ -122,7 +122,7 @@ libtextclassifier3::Status FileBackedBitmap::Initialize() {
                << " of size: " << file_size;
   }
 
-  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
   // that can support error logging.
   libtextclassifier3::Status status = mmapper_->Remap(0, file_size);
   if (!status.ok()) {
@@ -198,7 +198,7 @@ int FileBackedBitmap::NumBits() const {
 libtextclassifier3::Status FileBackedBitmap::Set(int bit_index,
                                                  bool bit_value) {
   if (bit_index >= NumBits()) {
-    // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+    // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
     // that can support error logging.
     libtextclassifier3::Status status = GrowTo(bit_index);
     if (!status.ok()) {
@@ -261,7 +261,7 @@ libtextclassifier3::Status FileBackedBitmap::GrowTo(int new_num_bits) {
         file_path_.c_str(), new_file_size));
   }
 
-  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
   // that can support error logging.
   libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size);
   if (!status.ok()) {
@@ -281,7 +281,7 @@ libtextclassifier3::Status FileBackedBitmap::TruncateTo(int new_num_bits) {
   }
   const size_t new_file_size = FileSizeForBits(new_num_bits);
 
-  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
   // that can support error logging.
   libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size);
   if (!status.ok()) {
diff --git a/icing/file/filesystem.h b/icing/file/filesystem.h
index ca8c4a8..dd2c5d1 100644
--- a/icing/file/filesystem.h
+++ b/icing/file/filesystem.h
@@ -233,6 +233,11 @@ class Filesystem {
   // Increments to_increment by size if size is valid, or sets to_increment
   // to kBadFileSize if either size or to_increment is kBadFileSize.
   static void IncrementByOrSetInvalid(int64_t size, int64_t* to_increment);
+
+  // Return -1 if file_size is invalid. Otherwise, return file_size.
+  static int64_t SanitizeFileSize(int64_t file_size) {
+    return (file_size != kBadFileSize) ? file_size : -1;
+  }
 };
 
 // LINT.ThenChange(//depot/google3/icing/file/mock-filesystem.h)
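The new SanitizeFileSize helper folds a branch that several later hunks delete at their call sites. The transformation, condensed from the Optimize()/GetStorageInfo() hunks further down (set_storage_size is a stand-in for the specific proto setters used there):

    // Before: every caller translated the kBadFileSize sentinel by hand.
    int64_t size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
    if (size != Filesystem::kBadFileSize) {
      stats->set_storage_size(size);
    } else {
      // Set -1 as a sentinel value when failures occur.
      stats->set_storage_size(-1);
    }

    // After: the sentinel translation lives in one place.
    stats->set_storage_size(Filesystem::SanitizeFileSize(size));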
diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h
index f676dc5..409ab96 100644
--- a/icing/file/portable-file-backed-proto-log.h
+++ b/icing/file/portable-file-backed-proto-log.h
@@ -124,6 +124,8 @@ class PortableFileBackedProtoLog {
  public:
   static constexpr int32_t kMagic = 0xf4c6f67a;
 
+  // We should go directly from 0 to 2 the next time we have to change the
+  // format.
   static constexpr int32_t kFileFormatVersion = 0;
 
   uint32_t CalculateHeaderChecksum() const {
@@ -282,7 +284,7 @@ class PortableFileBackedProtoLog {
     // before updating our checksum.
     bool recalculated_checksum = false;
 
-    bool has_data_loss() {
+    bool has_data_loss() const {
       return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
     }
   };
@@ -376,8 +378,7 @@ class PortableFileBackedProtoLog {
   //    }
   class Iterator {
    public:
-    Iterator(const Filesystem& filesystem, const std::string& file_path,
-             int64_t initial_offset);
+    Iterator(const Filesystem& filesystem, int fd, int64_t initial_offset);
 
     // Advances to the position of next proto whether it has been erased or
     // not.
     //
@@ -393,11 +394,12 @@ class PortableFileBackedProtoLog {
    private:
     static constexpr int64_t kInvalidOffset = -1;
     // Used to read proto metadata
-    MemoryMappedFile mmapped_file_;
     // Offset of first proto
+    const Filesystem* const filesystem_;
     int64_t initial_offset_;
     int64_t current_offset_;
     int64_t file_size_;
+    int fd_;
   };
 
   // Returns an iterator of current proto log. The caller needs to keep the
@@ -513,7 +515,7 @@ class PortableFileBackedProtoLog {
       const Filesystem* filesystem, const std::string& file_path,
       Crc32 initial_crc, int64_t start, int64_t end);
 
-  // Reads out the metadata of a proto located at file_offset from the file.
+  // Reads out the metadata of a proto located at file_offset from the fd.
   // Metadata will be returned in host byte order endianness.
   //
   // Returns:
@@ -521,7 +523,8 @@ class PortableFileBackedProtoLog {
   //   OUT_OF_RANGE_ERROR if file_offset exceeds file_size
   //   INTERNAL_ERROR if the metadata is invalid or any IO errors happen
   static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata(
-      MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
+      const Filesystem* const filesystem, int fd, int64_t file_offset,
+      int64_t file_size);
 
   // Writes metadata of a proto to the fd. Takes in a host byte order endianness
   // metadata and converts it into a portable metadata before writing.
@@ -937,35 +940,37 @@ template <typename ProtoT>
 libtextclassifier3::StatusOr<ProtoT>
 PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const {
   int64_t file_size = filesystem_->GetFileSize(fd_.get());
-  MemoryMappedFile mmapped_file(*filesystem_, file_path_,
-                                MemoryMappedFile::Strategy::READ_ONLY);
-  if (file_offset >= file_size) {
-    // file_size points to the next byte to write at, so subtract one to get
-    // the inclusive, actual size of file.
-    return absl_ports::OutOfRangeError(
-        IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
-                                      "out of range of the file size, %lld",
-                                      static_cast<long long>(file_offset),
-                                      static_cast<long long>(file_size - 1)));
-  }
-
-  // Read out the metadata
+  if (file_size == Filesystem::kBadFileSize) {
+    return absl_ports::OutOfRangeError("Unable to correctly read size.");
+  }
   ICING_ASSIGN_OR_RETURN(
       int32_t metadata,
-      ReadProtoMetadata(&mmapped_file, file_offset, file_size));
+      ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
 
   // Copy out however many bytes it says the proto is
   int stored_size = GetProtoSize(metadata);
+  file_offset += sizeof(metadata);
 
-  ICING_RETURN_IF_ERROR(
-      mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
+  // Read the compressed proto out.
+  if (file_offset + stored_size > file_size) {
+    return absl_ports::OutOfRangeError(
+        IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
+                                      "out of range of the file size, %lld",
+                                      static_cast<long long>(file_offset),
+                                      static_cast<long long>(file_size - 1)));
+  }
+  auto buf = std::make_unique<char[]>(stored_size);
+  if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
+    return absl_ports::InternalError("");
+  }
 
-  if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
+  if (IsEmptyBuffer(buf.get(), stored_size)) {
     return absl_ports::NotFoundError("The proto data has been erased.");
   }
 
-  google::protobuf::io::ArrayInputStream proto_stream(
-      mmapped_file.mutable_region(), stored_size);
+  google::protobuf::io::ArrayInputStream proto_stream(buf.get(),
+                                                      stored_size);
 
   // Deserialize proto
   ProtoT proto;
@@ -983,33 +988,29 @@ template <typename ProtoT>
 libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
     int64_t file_offset) {
   int64_t file_size = filesystem_->GetFileSize(fd_.get());
-  if (file_offset >= file_size) {
-    // file_size points to the next byte to write at, so subtract one to get
-    // the inclusive, actual size of file.
-    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
-        "Trying to erase data at a location, %lld, "
-        "out of range of the file size, %lld",
-        static_cast<long long>(file_offset),
-        static_cast<long long>(file_size - 1)));
+  if (file_size == Filesystem::kBadFileSize) {
+    return absl_ports::OutOfRangeError("Unable to correctly read size.");
   }
-  MemoryMappedFile mmapped_file(
-      *filesystem_, file_path_,
-      MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
-
-  // Read out the metadata
   ICING_ASSIGN_OR_RETURN(
       int32_t metadata,
-      ReadProtoMetadata(&mmapped_file, file_offset, file_size));
-
-  ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
-                                           GetProtoSize(metadata)));
+      ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
+  // Copy out however many bytes it says the proto is
+  int stored_size = GetProtoSize(metadata);
+  file_offset += sizeof(metadata);
+  if (file_offset + stored_size > file_size) {
+    return absl_ports::OutOfRangeError(
+        IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
+                                      "out of range of the file size, %lld",
+                                      static_cast<long long>(file_offset),
+                                      static_cast<long long>(file_size - 1)));
+  }
+  auto buf = std::make_unique<char[]>(stored_size);
 
   // We need to update the crc checksum if the erased area is before the
   // rewind position.
   int32_t new_crc;
-  int64_t erased_proto_offset = file_offset + sizeof(metadata);
-  if (erased_proto_offset < header_->GetRewindOffset()) {
+  if (file_offset < header_->GetRewindOffset()) {
     // Set to "dirty" before we start writing anything.
     header_->SetDirtyFlag(true);
     header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
@@ -1022,24 +1023,30 @@ libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
     // We need to calculate [original string xor 0s].
     // The xored string is the same as the original string because 0 xor 0 =
     // 0, 1 xor 0 = 1.
-    const std::string_view xored_str(mmapped_file.region(),
-                                     mmapped_file.region_size());
+    // Read the compressed proto out.
+    if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
+      return absl_ports::InternalError("");
+    }
+    const std::string_view xored_str(buf.get(), stored_size);
 
     Crc32 crc(header_->GetLogChecksum());
     ICING_ASSIGN_OR_RETURN(
-        new_crc, crc.UpdateWithXor(
-                     xored_str,
-                     /*full_data_size=*/header_->GetRewindOffset() -
-                         kHeaderReservedBytes,
-                     /*position=*/erased_proto_offset - kHeaderReservedBytes));
+        new_crc,
+        crc.UpdateWithXor(xored_str,
+                          /*full_data_size=*/header_->GetRewindOffset() -
+                              kHeaderReservedBytes,
+                          /*position=*/file_offset - kHeaderReservedBytes));
   }
 
   // Clear the region.
-  memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
+  memset(buf.get(), '\0', stored_size);
+  if (!filesystem_->PWrite(fd_.get(), file_offset, buf.get(), stored_size)) {
+    return absl_ports::InternalError("");
+  }
 
   // If we cleared something in our checksummed area, we should update our
   // checksum and reset our dirty bit.
-  if (erased_proto_offset < header_->GetRewindOffset()) {
+  if (file_offset < header_->GetRewindOffset()) {
     header_->SetDirtyFlag(false);
     header_->SetLogChecksum(new_crc);
     header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
@@ -1077,13 +1084,12 @@ PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
 
 template <typename ProtoT>
 PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator(
-    const Filesystem& filesystem, const std::string& file_path,
-    int64_t initial_offset)
-    : mmapped_file_(filesystem, file_path,
-                    MemoryMappedFile::Strategy::READ_ONLY),
+    const Filesystem& filesystem, int fd, int64_t initial_offset)
+    : filesystem_(&filesystem),
       initial_offset_(initial_offset),
      current_offset_(kInvalidOffset),
-      file_size_(filesystem.GetFileSize(file_path.c_str())) {
+      fd_(fd) {
+  file_size_ = filesystem_->GetFileSize(fd_);
   if (file_size_ == Filesystem::kBadFileSize) {
     // Fails all Advance() calls
     file_size_ = 0;
@@ -1100,7 +1106,7 @@ PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() {
     // Jumps to the next proto position
     ICING_ASSIGN_OR_RETURN(
         int32_t metadata,
-        ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
+        ReadProtoMetadata(filesystem_, fd_, current_offset_, file_size_));
     current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
   }
 
@@ -1122,14 +1128,15 @@ int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
 template <typename ProtoT>
 typename PortableFileBackedProtoLog<ProtoT>::Iterator
 PortableFileBackedProtoLog<ProtoT>::GetIterator() {
-  return Iterator(*filesystem_, file_path_,
+  return Iterator(*filesystem_, fd_.get(),
                   /*initial_offset=*/kHeaderReservedBytes);
 }
 
 template <typename ProtoT>
 libtextclassifier3::StatusOr<int32_t>
 PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
-    MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) {
+    const Filesystem* const filesystem, int fd, int64_t file_offset,
+    int64_t file_size) {
   // Checks file_offset
   if (file_offset >= file_size) {
     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
@@ -1147,9 +1154,9 @@ PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
         static_cast<long long>(file_size)));
   }
 
-  // Reads metadata
-  ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
-  memcpy(&portable_metadata, mmapped_file->region(), metadata_size);
+  if (!filesystem->PRead(fd, &portable_metadata, metadata_size, file_offset)) {
+    return absl_ports::InternalError("");
+  }
 
   // Need to switch it back to host order endianness after reading from disk.
   int32_t host_order_metadata = GNetworkToHostL(portable_metadata);
diff --git a/icing/file/portable-file-backed-proto-log_benchmark.cc b/icing/file/portable-file-backed-proto-log_benchmark.cc
index f83ccd6..80a8011 100644
--- a/icing/file/portable-file-backed-proto-log_benchmark.cc
+++ b/icing/file/portable-file-backed-proto-log_benchmark.cc
@@ -55,7 +55,7 @@ namespace lib {
 namespace {
 
-static void BM_Write(benchmark::State& state) {
+void BM_Write(benchmark::State& state) {
   const Filesystem filesystem;
   int string_length = state.range(0);
   const std::string file_path = IcingStringUtil::StringPrintf(
@@ -108,7 +108,7 @@ BENCHMARK(BM_Write)
 // 16MiB, and we need some extra space for the
 // rest of the document properties
 
-static void BM_Read(benchmark::State& state) {
+void BM_Read(benchmark::State& state) {
   const Filesystem filesystem;
   int string_length = state.range(0);
   const std::string file_path = IcingStringUtil::StringPrintf(
@@ -164,7 +164,7 @@ BENCHMARK(BM_Read)
 // 16MiB, and we need some extra space for the
 // rest of the document properties
 //
-static void BM_Erase(benchmark::State& state) {
+void BM_Erase(benchmark::State& state) {
   const Filesystem filesystem;
   const std::string file_path = IcingStringUtil::StringPrintf(
       "%s%s", GetTestTempDir().c_str(), "/proto.log");
@@ -204,7 +204,7 @@ static void BM_Erase(benchmark::State& state) {
 }
 BENCHMARK(BM_Erase);
 
-static void BM_ComputeChecksum(benchmark::State& state) {
+void BM_ComputeChecksum(benchmark::State& state) {
   const Filesystem filesystem;
   const std::string file_path = GetTestTempDir() + "/proto.log";
   int max_proto_size = (1 << 24) - 1;  // 16 MiB
@@ -246,7 +246,7 @@ static void BM_ComputeChecksum(benchmark::State& state) {
 }
 BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20);
 
-static void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) {
+void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) {
   const Filesystem filesystem;
   const std::string file_path = GetTestTempDir() + "/proto.log";
   int max_proto_size = (1 << 24) - 1;  // 16 MiB
@@ -290,7 +290,7 @@ static void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) {
 }
 BENCHMARK(BM_ComputeChecksumWithCachedChecksum);
 
-static void BM_ComputeChecksumOnlyForTail(benchmark::State& state) {
+void BM_ComputeChecksumOnlyForTail(benchmark::State& state) {
   const Filesystem filesystem;
   const std::string file_path = GetTestTempDir() + "/proto.log";
   int max_proto_size = (1 << 24) - 1;  // 16 MiB
diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc
index b5fee4b..795271a 100644
--- a/icing/file/portable-file-backed-proto-log_test.cc
+++ b/icing/file/portable-file-backed-proto-log_test.cc
@@ -851,11 +851,12 @@ TEST_F(PortableFileBackedProtoLogTest, Iterator) {
 
   {
     // Iterator with bad filesystem
+    ScopedFd sfd(filesystem_.OpenForRead(file_path_.c_str()));
     MockFilesystem mock_filesystem;
-    ON_CALL(mock_filesystem, GetFileSize(A<const char*>()))
+    ON_CALL(mock_filesystem, GetFileSize(A<int>()))
         .WillByDefault(Return(Filesystem::kBadFileSize));
     PortableFileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
-        mock_filesystem, file_path_, /*initial_offset=*/0);
+        mock_filesystem, sfd.get(), /*initial_offset=*/0);
     ASSERT_THAT(bad_iterator.Advance(),
                 StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
   }
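The mmap-to-PRead migration above repeats one pattern throughout: validate the file size, bounds-check the offset, then read into a private heap buffer. A condensed sketch of that pattern, using Filesystem::PRead(fd, buf, size, offset) as the hunks above do (record_size and offset stand in for stored_size and file_offset):

    int64_t file_size = filesystem->GetFileSize(fd);
    if (file_size == Filesystem::kBadFileSize) {
      return absl_ports::OutOfRangeError("Unable to correctly read size.");
    }
    if (offset + record_size > file_size) {
      return absl_ports::OutOfRangeError("Read past the end of the file.");
    }
    auto buf = std::make_unique<char[]>(record_size);
    if (!filesystem->PRead(fd, buf.get(), record_size, offset)) {
      return absl_ports::InternalError("");  // I/O error
    }
    // buf now holds the record; no Remap()/region() lifetime to manage.

Each read is an explicit syscall into a buffer the caller owns, so there is no mapped region whose validity depends on the file staying the same size.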
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index 6acdd31..952ba21 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -18,6 +18,7 @@
 #include <memory>
 #include <string>
 #include <string_view>
+#include <unordered_map>
 #include <utility>
 #include <vector>
@@ -88,6 +89,23 @@ constexpr std::string_view kOptimizeStatusFilename = "optimize_status";
 // fresh state.
 constexpr int kMaxUnsuccessfulInitAttempts = 5;
 
+// A pair that holds namespace and type.
+struct NamespaceTypePair {
+  std::string namespace_;
+  std::string type;
+
+  bool operator==(const NamespaceTypePair& other) const {
+    return namespace_ == other.namespace_ && type == other.type;
+  }
+};
+
+struct NamespaceTypePairHasher {
+  std::size_t operator()(const NamespaceTypePair& pair) const {
+    return std::hash<std::string>()(pair.namespace_) ^
+           std::hash<std::string>()(pair.type);
+  }
+};
+
 libtextclassifier3::Status ValidateResultSpec(
     const ResultSpecProto& result_spec) {
   if (result_spec.num_per_page() < 0) {
@@ -255,6 +273,28 @@ void TransformStatus(const libtextclassifier3::Status& internal_status,
   status_proto->set_message(internal_status.error_message());
 }
 
+libtextclassifier3::Status RetrieveAndAddDocumentInfo(
+    const DocumentStore* document_store, DeleteByQueryResultProto& result_proto,
+    std::unordered_map<NamespaceTypePair,
+                       DeleteByQueryResultProto::DocumentGroupInfo*,
+                       NamespaceTypePairHasher>& info_map,
+    DocumentId document_id) {
+  ICING_ASSIGN_OR_RETURN(DocumentProto document,
+                         document_store->Get(document_id));
+  NamespaceTypePair key = {document.namespace_(), document.schema()};
+  auto iter = info_map.find(key);
+  if (iter == info_map.end()) {
+    auto entry = result_proto.add_deleted_documents();
+    entry->set_namespace_(std::move(document.namespace_()));
+    entry->set_schema(std::move(document.schema()));
+    entry->add_uris(std::move(document.uri()));
+    info_map[key] = entry;
+  } else {
+    iter->second->add_uris(std::move(document.uri()));
+  }
+  return libtextclassifier3::Status::OK;
+}
+
 }  // namespace
 
 IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options,
@@ -443,8 +483,6 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
     // last tried to set the schema.
     ICING_RETURN_IF_ERROR(InitializeDocumentStore(
         /*force_recovery_and_revalidate_documents=*/true, initialize_stats));
-    initialize_stats->set_document_store_recovery_cause(
-        InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
 
     // We're going to need to build the index from scratch. So just delete its
     // files now.
@@ -934,7 +972,7 @@ DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space,
   delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SINGLE);
 
   std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
-  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
   // that can support error logging.
   libtextclassifier3::Status status = document_store_->Delete(name_space, uri);
   if (!status.ok()) {
@@ -968,7 +1006,7 @@ DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace(
   delete_stats->set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE);
 
   std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
-  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
   // that can support error logging.
   DocumentStore::DeleteByGroupResult doc_store_result =
       document_store_->DeleteByNamespace(name_space);
@@ -1002,7 +1040,7 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
   delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE);
 
   std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
-  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
   // that can support error logging.
   DocumentStore::DeleteByGroupResult doc_store_result =
       document_store_->DeleteBySchemaType(schema_type);
@@ -1020,7 +1058,7 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
 }
 
 DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
-    const SearchSpecProto& search_spec) {
+    const SearchSpecProto& search_spec, bool return_deleted_document_info) {
   ICING_VLOG(1) << "Deleting documents for query " << search_spec.query()
                 << " from doc store";
@@ -1074,12 +1112,27 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
   ICING_VLOG(2) << "Deleting the docs that matched the query.";
   int num_deleted = 0;
+  // A map used to group deleted documents.
+  // From the (namespace, type) pair to a list of uris.
+  std::unordered_map<NamespaceTypePair,
+                     DeleteByQueryResultProto::DocumentGroupInfo*,
+                     NamespaceTypePairHasher>
+      deleted_info_map;
 
   component_timer = clock_->GetNewTimer();
   while (query_results.root_iterator->Advance().ok()) {
     ICING_VLOG(3) << "Deleting doc "
                   << query_results.root_iterator->doc_hit_info().document_id();
     ++num_deleted;
+    if (return_deleted_document_info) {
+      status = RetrieveAndAddDocumentInfo(
+          document_store_.get(), result_proto, deleted_info_map,
+          query_results.root_iterator->doc_hit_info().document_id());
+      if (!status.ok()) {
+        TransformStatus(status, result_status);
+        return result_proto;
+      }
+    }
     status = document_store_->Delete(
         query_results.root_iterator->doc_hit_info().document_id());
     if (!status.ok()) {
@@ -1148,12 +1201,8 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
   std::unique_ptr<Timer> optimize_timer = clock_->GetNewTimer();
   OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats();
   int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
-  if (before_size != Filesystem::kBadFileSize) {
-    optimize_stats->set_storage_size_before(before_size);
-  } else {
-    // Set -1 as a sentinel value when failures occur.
-    optimize_stats->set_storage_size_before(-1);
-  }
+  optimize_stats->set_storage_size_before(
+      Filesystem::SanitizeFileSize(before_size));
 
   // Flushes data to disk before doing optimization
   auto status = InternalPersistToDisk(PersistType::FULL);
@@ -1230,12 +1279,8 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
   optimize_status_file.Write(std::move(optimize_status));
 
   int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
-  if (after_size != Filesystem::kBadFileSize) {
-    optimize_stats->set_storage_size_after(after_size);
-  } else {
-    // Set -1 as a sentinel value when failures occur.
-    optimize_stats->set_storage_size_after(-1);
-  }
+  optimize_stats->set_storage_size_after(
+      Filesystem::SanitizeFileSize(after_size));
 
   optimize_stats->set_latency_ms(optimize_timer->GetElapsedMilliseconds());
   TransformStatus(optimization_status, result_status);
@@ -1317,11 +1362,8 @@ StorageInfoResultProto IcingSearchEngine::GetStorageInfo() {
   }
 
   int64_t index_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
-  if (index_size != Filesystem::kBadFileSize) {
-    result.mutable_storage_info()->set_total_storage_size(index_size);
-  } else {
-    result.mutable_storage_info()->set_total_storage_size(-1);
-  }
+  result.mutable_storage_info()->set_total_storage_size(
+      Filesystem::SanitizeFileSize(index_size));
   *result.mutable_storage_info()->mutable_document_storage_info() =
       document_store_->GetStorageInfo();
   *result.mutable_storage_info()->mutable_schema_store_storage_info() =
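RetrieveAndAddDocumentInfo above groups deleted URIs under their (namespace, schema) pair. The shape of the lookup, pulled out of the hunks for readability (all names as defined in this file):

    std::unordered_map<NamespaceTypePair,
                       DeleteByQueryResultProto::DocumentGroupInfo*,
                       NamespaceTypePairHasher>
        deleted_info_map;
    // The first URI seen for a (namespace, schema) pair creates the group in
    // the result proto and caches its pointer; later URIs append to the
    // cached group instead of re-searching the repeated field.

Note that NamespaceTypePairHasher XORs the two string hashes, so ("a", "b") and ("b", "a") collide; that only costs an extra operator== comparison, not correctness.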
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index 0a79714..ff9c7fb 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -280,8 +280,9 @@ class IcingSearchEngine {
   //   NOT_FOUND if the query doesn't match any documents
   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
   //   INTERNAL_ERROR on IO error
-  DeleteByQueryResultProto DeleteByQuery(const SearchSpecProto& search_spec)
-      ICING_LOCKS_EXCLUDED(mutex_);
+  DeleteByQueryResultProto DeleteByQuery(
+      const SearchSpecProto& search_spec,
+      bool return_deleted_document_info = false) ICING_LOCKS_EXCLUDED(mutex_);
 
   // Retrieves, scores, ranks, and returns the results according to the specs.
   // Results can be empty. If there're multiple pages of results,
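The defaulted parameter keeps existing call sites source-compatible; callers opt in to the extra information. A usage sketch, mirroring the new test below:

    SearchSpecProto search_spec;
    search_spec.set_query("message");
    search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);

    DeleteByQueryResultProto result =
        icing.DeleteByQuery(search_spec, /*return_deleted_document_info=*/true);
    for (const auto& group : result.deleted_documents()) {
      // group.namespace_(), group.schema(), and group.uris() identify the
      // deleted documents, grouped by (namespace, schema).
    }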
body1") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace2", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "message body2") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document3 = + DocumentBuilder() + .SetKey("namespace2", "uri3") + .SetSchema("Message") + .AddStringProperty("body", "message body3") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetTimerElapsedMilliseconds(7); + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::move(fake_clock), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); + + GetResultProto expected_get_result_proto; + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_result_proto.mutable_document() = document1; + EXPECT_THAT( + icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); + + *expected_get_result_proto.mutable_document() = document2; + EXPECT_THAT( + icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); + + *expected_get_result_proto.mutable_document() = document3; + EXPECT_THAT( + icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); + + // Delete all docs to test the information is correctly grouped. + SearchSpecProto search_spec; + search_spec.set_query("message"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + DeleteByQueryResultProto result_proto = + icing.DeleteByQuery(search_spec, true); + EXPECT_THAT(result_proto.status(), ProtoIsOk()); + DeleteByQueryStatsProto exp_stats; + exp_stats.set_latency_ms(7); + exp_stats.set_num_documents_deleted(3); + exp_stats.set_query_length(search_spec.query().length()); + exp_stats.set_num_terms(1); + exp_stats.set_num_namespaces_filtered(0); + exp_stats.set_num_schema_types_filtered(0); + exp_stats.set_parse_query_latency_ms(7); + exp_stats.set_document_removal_latency_ms(7); + EXPECT_THAT(result_proto.delete_by_query_stats(), EqualsProto(exp_stats)); + + // Check that DeleteByQuery can return information for deleted documents. 
+ DeleteByQueryResultProto::DocumentGroupInfo info1, info2; + info1.set_namespace_("namespace1"); + info1.set_schema("Message"); + info1.add_uris("uri1"); + info2.set_namespace_("namespace2"); + info2.set_schema("Message"); + info2.add_uris("uri3"); + info2.add_uris("uri2"); + EXPECT_THAT(result_proto.deleted_documents(), + UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2))); + + EXPECT_THAT( + icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()) + .status() + .code(), + Eq(StatusProto::NOT_FOUND)); + EXPECT_THAT( + icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()) + .status() + .code(), + Eq(StatusProto::NOT_FOUND)); + EXPECT_THAT( + icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance()) + .status() + .code(), + Eq(StatusProto::NOT_FOUND)); +} + TEST_F(IcingSearchEngineTest, DeleteByQueryNotFound) { DocumentProto document1 = DocumentBuilder() @@ -8489,7 +8588,7 @@ TEST_F(IcingSearchEngineTest, MigrateToPortableFileBackedProtoLog) { EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), Eq(InitializeStatsProto::NO_DATA_LOSS)); EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), - Eq(InitializeStatsProto::NONE)); + Eq(InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT)); EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), Eq(InitializeStatsProto::NONE)); EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc index 6e072c7..1aad7d0 100644 --- a/icing/index/index-processor_benchmark.cc +++ b/icing/index/index-processor_benchmark.cc @@ -16,7 +16,6 @@ #include "gmock/gmock.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/index-processor.h" #include "icing/index/index.h" #include "icing/legacy/core/icing-string-util.h" @@ -24,6 +23,7 @@ #include "icing/schema/schema-util.h" #include "icing/schema/section-manager.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index d21a06f..bd310de 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -30,7 +30,6 @@ #include "icing/absl_ports/str_join.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index.h" #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -49,6 +48,7 @@ #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/random-string.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" diff --git a/icing/index/index.cc b/icing/index/index.cc index 984dc78..02ba699 100644 --- a/icing/index/index.cc +++ b/icing/index/index.cc @@ -119,7 +119,7 @@ std::vector<TermMetadata> MergeAndRankTermMetadatas( int total_est_hit_count = lite_term_itr->hit_count + main_term_itr->hit_count; PushToTermHeap(TermMetadata(std::move(lite_term_itr->content), - total_est_hit_count), + total_est_hit_count), num_to_return, 
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 6e072c7..1aad7d0 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -16,7 +16,6 @@
 #include "gmock/gmock.h"
 #include "icing/document-builder.h"
 #include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/index/index-processor.h"
 #include "icing/index/index.h"
 #include "icing/legacy/core/icing-string-util.h"
@@ -24,6 +23,7 @@
 #include "icing/schema/schema-util.h"
 #include "icing/schema/section-manager.h"
 #include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
 #include "icing/tokenization/language-segmenter-factory.h"
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index d21a06f..bd310de 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -30,7 +30,6 @@
 #include "icing/absl_ports/str_join.h"
 #include "icing/document-builder.h"
 #include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/index/hit/doc-hit-info.h"
 #include "icing/index/index.h"
 #include "icing/index/iterator/doc-hit-info-iterator.h"
@@ -49,6 +48,7 @@
 #include "icing/store/document-id.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/random-string.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
diff --git a/icing/index/index.cc b/icing/index/index.cc
index 984dc78..02ba699 100644
--- a/icing/index/index.cc
+++ b/icing/index/index.cc
@@ -119,7 +119,7 @@ std::vector<TermMetadata> MergeAndRankTermMetadatas(
       int total_est_hit_count =
           lite_term_itr->hit_count + main_term_itr->hit_count;
       PushToTermHeap(TermMetadata(std::move(lite_term_itr->content),
-                                  total_est_hit_count),
+                                   total_est_hit_count),
                      num_to_return, merged_term_metadata_heap);
       ++lite_term_itr;
       ++main_term_itr;
@@ -259,11 +259,7 @@ Index::FindTermsByPrefix(const std::string& prefix, int num_to_return,
 IndexStorageInfoProto Index::GetStorageInfo() const {
   IndexStorageInfoProto storage_info;
   int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str());
-  if (directory_size != Filesystem::kBadFileSize) {
-    storage_info.set_index_size(directory_size);
-  } else {
-    storage_info.set_index_size(-1);
-  }
+  storage_info.set_index_size(Filesystem::SanitizeFileSize(directory_size));
   storage_info = lite_index_->GetStorageInfo(std::move(storage_info));
   return main_index_->GetStorageInfo(std::move(storage_info));
 }
diff --git a/icing/index/index.h b/icing/index/index.h
index 4aa9ba3..5c53349 100644
--- a/icing/index/index.h
+++ b/icing/index/index.h
@@ -32,6 +32,7 @@
 #include "icing/index/term-id-codec.h"
 #include "icing/index/term-metadata.h"
 #include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/debug.pb.h"
 #include "icing/proto/storage.pb.h"
 #include "icing/proto/term.pb.h"
 #include "icing/schema/section.h"
@@ -143,9 +144,14 @@ class Index {
   //   index.
   // verbosity > 0, more detailed debug information including raw postings
   //   lists.
-  void GetDebugInfo(int verbosity, std::string* out) const {
-    lite_index_->GetDebugInfo(verbosity, out);
-    main_index_->GetDebugInfo(verbosity, out);
+  IndexDebugInfoProto GetDebugInfo(int verbosity) const {
+    IndexDebugInfoProto debug_info;
+    *debug_info.mutable_index_storage_info() = GetStorageInfo();
+    *debug_info.mutable_lite_index_info() =
+        lite_index_->GetDebugInfo(verbosity);
+    *debug_info.mutable_main_index_info() =
+        main_index_->GetDebugInfo(verbosity);
+    return debug_info;
   }
 
   // Returns the byte size of the all the elements held in the index. This
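Index::GetDebugInfo now composes a structured proto from the storage info and the two sub-indexes instead of appending to a string. A usage sketch, with the fields exercised by the reworked test below:

    IndexDebugInfoProto info = index->GetDebugInfo(/*verbosity=*/0);
    // info.index_storage_info()  -- sizes, same data as GetStorageInfo()
    // info.lite_index_info()     -- curr_size, last_added_document_id, ...
    // info.main_index_info()     -- lexicon_info, last_added_document_id
    // With verbosity > 0, main_index_info().flash_index_storage_info() is
    // also populated.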
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index cd64202..8355c01 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -31,6 +31,7 @@
 #include "icing/index/iterator/doc-hit-info-iterator.h"
 #include "icing/legacy/index/icing-filesystem.h"
 #include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/proto/debug.pb.h"
 #include "icing/proto/storage.pb.h"
 #include "icing/proto/term.pb.h"
 #include "icing/schema/section.h"
@@ -1394,12 +1395,14 @@ TEST_F(IndexTest, GetDebugInfo) {
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
   edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
                       /*namespace_id=*/0);
+  index_->set_last_added_document_id(kDocumentId1);
   ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
 
   ICING_ASSERT_OK(index_->Merge());
 
   edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
                       /*namespace_id=*/0);
+  index_->set_last_added_document_id(kDocumentId2);
   ASSERT_THAT(edit.BufferTerm("footer"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
   edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX,
@@ -1407,40 +1410,45 @@ TEST_F(IndexTest, GetDebugInfo) {
   ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
 
-  std::string out0;
-  index_->GetDebugInfo(/*verbosity=*/0, &out0);
-  EXPECT_THAT(out0, Not(IsEmpty()));
+  IndexDebugInfoProto out0 = index_->GetDebugInfo(/*verbosity=*/0);
+  EXPECT_FALSE(out0.main_index_info().has_flash_index_storage_info());
+  EXPECT_THAT(out0.main_index_info().last_added_document_id(),
+              Eq(kDocumentId1));
+  EXPECT_THAT(out0.lite_index_info().curr_size(), Eq(2));
+  EXPECT_THAT(out0.lite_index_info().last_added_document_id(),
+              Eq(kDocumentId2));
 
-  std::string out1;
-  index_->GetDebugInfo(/*verbosity=*/1, &out1);
-  EXPECT_THAT(out1, SizeIs(Gt(out0.size())));
+  IndexDebugInfoProto out1 = index_->GetDebugInfo(/*verbosity=*/1);
+  EXPECT_THAT(out1.main_index_info().flash_index_storage_info(),
+              Not(IsEmpty()));
 
   // Add one more doc to the lite index. Debug strings should change.
   edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY,
                       /*namespace_id=*/0);
+  index_->set_last_added_document_id(kDocumentId3);
   ASSERT_THAT(edit.BufferTerm("far"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
 
-  std::string out2;
-  index_->GetDebugInfo(/*verbosity=*/0, &out2);
-  EXPECT_THAT(out2, Ne(out0));
-
-  std::string out3;
-  index_->GetDebugInfo(/*verbosity=*/1, &out3);
-  EXPECT_THAT(out3, Ne(out1));
+  IndexDebugInfoProto out2 = index_->GetDebugInfo(/*verbosity=*/0);
+  EXPECT_THAT(out2.lite_index_info().curr_size(), Eq(3));
+  EXPECT_THAT(out2.lite_index_info().last_added_document_id(),
+              Eq(kDocumentId3));
 
   // Merge into the main index. Debug strings should change again.
   ICING_ASSERT_OK(index_->Merge());
 
-  std::string out4;
-  index_->GetDebugInfo(/*verbosity=*/0, &out4);
-  EXPECT_THAT(out4, Ne(out0));
-  EXPECT_THAT(out4, Ne(out2));
-
-  std::string out5;
-  index_->GetDebugInfo(/*verbosity=*/1, &out5);
-  EXPECT_THAT(out5, Ne(out1));
-  EXPECT_THAT(out5, Ne(out3));
+  IndexDebugInfoProto out3 = index_->GetDebugInfo(/*verbosity=*/0);
+  EXPECT_TRUE(out3.has_index_storage_info());
+  EXPECT_THAT(out3.main_index_info().lexicon_info(), Not(IsEmpty()));
+  EXPECT_THAT(out3.main_index_info().last_added_document_id(),
+              Eq(kDocumentId3));
+  EXPECT_THAT(out3.lite_index_info().curr_size(), Eq(0));
+  EXPECT_THAT(out3.lite_index_info().hit_buffer_size(), Gt(0));
+  EXPECT_THAT(out3.lite_index_info().last_added_document_id(),
+              Eq(kInvalidDocumentId));
+  EXPECT_THAT(out3.lite_index_info().searchable_end(), Eq(0));
+  EXPECT_THAT(out3.lite_index_info().index_crc(), Gt(0));
+  EXPECT_THAT(out3.lite_index_info().lexicon_info(), Not(IsEmpty()));
 }
 
 TEST_F(IndexTest, BackfillingMultipleTermsSucceeds) {
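The LiteIndex::AppendHits fix below addresses a skip that was not sticky: the old code decided whether to filter a document only on its first hit, but every later hit for that same document still reached hits_out->back().UpdateSection(...), mutating whichever document happened to be last in hits_out. The condensed shape of the fix, taken from the hunks that follow:

    if (document_id != last_document_id) {
      last_document_id = document_id;
      // Remember the decision across all of this document's hits.
      last_document_in_namespace =
          namespace_checker == nullptr ||
          namespace_checker->BelongsToTargetNamespaces(document_id);
      if (!last_document_in_namespace) {
        continue;  // skips only this hit; later hits need the flag below
      }
      ...
    }
    if (hits_out != nullptr && last_document_in_namespace) {
      hits_out->back().UpdateSection(hit.section_id(), hit.term_frequency());
    }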
- out->append("Lexicon stats:\n"); - lexicon_.GetDebugInfo(verbosity, out); +IndexDebugInfoProto::LiteIndexDebugInfoProto LiteIndex::GetDebugInfo( + int verbosity) { + IndexDebugInfoProto::LiteIndexDebugInfoProto res; + res.set_curr_size(header_->cur_size()); + res.set_hit_buffer_size(options_.hit_buffer_size); + res.set_last_added_document_id(header_->last_added_docid()); + res.set_searchable_end(header_->searchable_end()); + res.set_index_crc(ComputeChecksum().Get()); + lexicon_.GetDebugInfo(verbosity, res.mutable_lexicon_info()); + return res; } libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const { @@ -417,12 +421,8 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo( IndexStorageInfoProto storage_info) const { int64_t header_and_hit_buffer_file_size = filesystem_->GetFileSize(hit_buffer_fd_.get()); - if (header_and_hit_buffer_file_size != Filesystem::kBadFileSize) { - storage_info.set_lite_index_hit_buffer_size( - header_and_hit_buffer_file_size); - } else { - storage_info.set_lite_index_hit_buffer_size(-1); - } + storage_info.set_lite_index_hit_buffer_size( + IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size)); int64_t lexicon_disk_usage = lexicon_.GetElementsSize(); if (lexicon_disk_usage != Filesystem::kBadFileSize) { storage_info.set_lite_index_lexicon_size(lexicon_disk_usage); diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h index 890980c..378fc94 100644 --- a/icing/index/lite/lite-index.h +++ b/icing/index/lite/lite-index.h @@ -37,6 +37,7 @@ #include "icing/legacy/index/icing-lite-index-header.h" #include "icing/legacy/index/icing-lite-index-options.h" #include "icing/legacy/index/icing-mmapper.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" @@ -241,7 +242,7 @@ class LiteIndex { // Returns debug information for the index in out. // verbosity <= 0, simplest debug information - size of lexicon, hit buffer // verbosity > 0, more detailed debug information from the lexicon. - void GetDebugInfo(int verbosity, std::string* out) const; + IndexDebugInfoProto::LiteIndexDebugInfoProto GetDebugInfo(int verbosity); // Returns the byte size of all the elements held in the index. This excludes // the size of any internal metadata of the index, e.g. the index's header. diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc new file mode 100644 index 0000000..825f830 --- /dev/null +++ b/icing/index/lite/lite-index_test.cc @@ -0,0 +1,110 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/index/lite/lite-index.h" + +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/index/term-id-codec.h" +#include "icing/legacy/index/icing-mock-filesystem.h" +#include "icing/schema/section.h" +#include "icing/store/namespace-checker.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::Eq; +using ::testing::IsEmpty; +using ::testing::SizeIs; + +class AlwaysFalseNamespaceCheckerImpl : public NamespaceChecker { + public: + bool BelongsToTargetNamespaces(DocumentId document_id) const override { + return false; + } +}; + +class LiteIndexTest : public testing::Test { + protected: + void SetUp() override { + index_dir_ = GetTestTempDir() + "/test_dir"; + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str())); + + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024); + ICING_ASSERT_OK_AND_ASSIGN(lite_index_, + LiteIndex::Create(options, &icing_filesystem_)); + + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + } + + void TearDown() override { + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str())); + } + + std::string index_dir_; + Filesystem filesystem_; + IcingFilesystem icing_filesystem_; + std::unique_ptr<LiteIndex> lite_index_; + std::unique_ptr<TermIdCodec> term_id_codec_; +}; + +constexpr NamespaceId kNamespace0 = 0; + +TEST_F(LiteIndexTest, LiteIndexAppendHits) { + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + Hit doc_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0)); + ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1)); + + std::vector<DocHitInfo> hits1; + lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + /*namespace_checker=*/nullptr, &hits1); + EXPECT_THAT(hits1, SizeIs(1)); + EXPECT_THAT(hits1.back().document_id(), Eq(0)); + // Check that the hits are coming from section 0 and section 1. + EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11)); + + std::vector<DocHitInfo> hits2; + AlwaysFalseNamespaceCheckerImpl always_false_namespace_checker; + lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + &always_false_namespace_checker, &hits2); + // Check that no hits are returned because they get skipped by the namespace + // checker. + EXPECT_THAT(hits2, IsEmpty()); +} + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h index 8d5b50b..6c6fbb8 100644 --- a/icing/index/main/flash-index-storage.h +++ b/icing/index/main/flash-index-storage.h @@ -159,6 +159,7 @@ class FlashIndexStorage { libtextclassifier3::Status Reset(); + // TODO(b/222349894) Convert the string output to a protocol buffer instead. 
diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h
index 8d5b50b..6c6fbb8 100644
--- a/icing/index/main/flash-index-storage.h
+++ b/icing/index/main/flash-index-storage.h
@@ -159,6 +159,7 @@ class FlashIndexStorage {
 
   libtextclassifier3::Status Reset();
 
+  // TODO(b/222349894) Convert the string output to a protocol buffer instead.
   void GetDebugInfo(int verbosity, std::string* out) const;
 
  private:
diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc
index c104dd3..2d6007b 100644
--- a/icing/index/main/main-index.cc
+++ b/icing/index/main/main-index.cc
@@ -133,18 +133,10 @@ libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const {
 
 IndexStorageInfoProto MainIndex::GetStorageInfo(
     IndexStorageInfoProto storage_info) const {
-  int64_t lexicon_elt_size = main_lexicon_->GetElementsSize();
-  if (lexicon_elt_size != IcingFilesystem::kBadFileSize) {
-    storage_info.set_main_index_lexicon_size(lexicon_elt_size);
-  } else {
-    storage_info.set_main_index_lexicon_size(-1);
-  }
-  int64_t index_elt_size = flash_index_storage_->GetElementsSize();
-  if (lexicon_elt_size != IcingFilesystem::kBadFileSize) {
-    storage_info.set_main_index_storage_size(index_elt_size);
-  } else {
-    storage_info.set_main_index_storage_size(-1);
-  }
+  storage_info.set_main_index_lexicon_size(
+      IcingFilesystem::SanitizeFileSize(main_lexicon_->GetElementsSize()));
+  storage_info.set_main_index_storage_size(
+      Filesystem::SanitizeFileSize(flash_index_storage_->GetElementsSize()));
   storage_info.set_main_index_block_size(flash_index_storage_->block_size());
   storage_info.set_num_blocks(flash_index_storage_->num_blocks());
   storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction());
@@ -615,16 +607,22 @@ libtextclassifier3::Status MainIndex::AddPrefixBackfillHits(
   return libtextclassifier3::Status::OK;
 }
 
-void MainIndex::GetDebugInfo(int verbosity, std::string* out) const {
+IndexDebugInfoProto::MainIndexDebugInfoProto MainIndex::GetDebugInfo(
+    int verbosity) const {
+  IndexDebugInfoProto::MainIndexDebugInfoProto res;
+
   // Lexicon.
-  out->append("Main Lexicon stats:\n");
-  main_lexicon_->GetDebugInfo(verbosity, out);
+  main_lexicon_->GetDebugInfo(verbosity, res.mutable_lexicon_info());
+
+  res.set_last_added_document_id(last_added_document_id());
 
   if (verbosity <= 0) {
-    return;
+    return res;
   }
 
-  flash_index_storage_->GetDebugInfo(verbosity, out);
+  flash_index_storage_->GetDebugInfo(verbosity,
+                                     res.mutable_flash_index_storage_info());
+  return res;
 }
 
 }  // namespace lib
diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h
index e4073cb..abb0418 100644
--- a/icing/index/main/main-index.h
+++ b/icing/index/main/main-index.h
@@ -27,6 +27,7 @@
 #include "icing/index/term-metadata.h"
 #include "icing/legacy/index/icing-dynamic-trie.h"
 #include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/debug.pb.h"
 #include "icing/proto/storage.pb.h"
 #include "icing/store/namespace-checker.h"
 #include "icing/store/namespace-id.h"
@@ -185,7 +186,8 @@ class MainIndex {
   // verbosity <= 0, simplest debug information - just the lexicon
   // verbosity > 0, more detailed debug information including raw postings
   //   lists.
-  void GetDebugInfo(int verbosity, std::string* out) const;
+  IndexDebugInfoProto::MainIndexDebugInfoProto GetDebugInfo(
+      int verbosity) const;
 
  private:
   libtextclassifier3::Status Init(const std::string& index_directory,
diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc
index baa043a..77876c4 100644
--- a/icing/legacy/index/icing-dynamic-trie.cc
+++ b/icing/legacy/index/icing-dynamic-trie.cc
@@ -70,6 +70,7 @@
 #include <algorithm>
 #include <cerrno>
 #include <cinttypes>
+#include <cstdint>
 #include <cstring>
 #include <memory>
 #include <utility>
@@ -397,6 +398,8 @@ class IcingDynamicTrie::IcingDynamicTrieStorage {
   // storage.
   IcingScopedFd array_fds_[NUM_ARRAY_TYPES];
   std::vector<IcingArrayStorage> array_storage_;
+
+  // Legacy file system. Switch to use the new Filesystem class instead.
   const IcingFilesystem *filesystem_;
 };
@@ -1364,10 +1367,12 @@ uint32_t IcingDynamicTrie::size() const {
   return storage_->hdr().num_keys();
 }
 
-void IcingDynamicTrie::CollectStatsRecursive(const Node &node,
-                                             Stats *stats) const {
+void IcingDynamicTrie::CollectStatsRecursive(const Node &node, Stats *stats,
+                                             uint32_t depth) const {
   if (node.is_leaf()) {
     stats->num_leaves++;
+    stats->sum_depth += depth;
+    stats->max_depth = max(stats->max_depth, depth);
     const char *suffix = storage_->GetSuffix(node.next_index());
     stats->suffixes_used += strlen(suffix) + 1 + value_size();
     if (!suffix[0]) {
@@ -1379,13 +1384,16 @@ void IcingDynamicTrie::CollectStatsRecursive(const Node &node,
     for (; i < (1U << node.log2_num_children()); i++) {
       const Next &next = *storage_->GetNext(node.next_index(), i);
       if (next.node_index() == kInvalidNodeIndex) break;
-      CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats);
+      CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats,
+                            depth + 1);
     }
 
     // At least one valid node in each next array
     if (i == 0) {
       ICING_LOG(FATAL) << "No valid node in 'next' array";
     }
+    stats->sum_children += i;
+    stats->max_children = max(stats->max_children, i);
 
     stats->child_counts[i - 1]++;
 
     stats->wasted[node.log2_num_children()] +=
@@ -1467,9 +1475,12 @@ std::string IcingDynamicTrie::Stats::DumpStats(int verbosity) const {
       "Wasted total: %u\n"
       "Num intermediates %u num leaves %u "
       "suffixes used %u null %u\n"
+      "avg and max children for intermediates: %.3f, %u\n"
+      "avg and max depth for leaves: %.3f, %u\n"
       "Total next frag: %.3f%%\n",
       total_wasted, num_intermediates, num_leaves, suffixes_used,
-      null_suffixes,
+      null_suffixes, 1. * sum_children / num_intermediates, max_children,
+      1. * sum_depth / num_leaves, max_depth,
       100. * math_util::SafeDivide((total_free + total_wasted), num_nexts));
   }
   IcingStringUtil::SStringAppendF(
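The two averages DumpStats now prints come straight from the counters CollectStatsRecursive accumulates, with the `1. *` promotion keeping the division in floating point to match the %.3f format specifiers. A worked example: a root with three children that are all leaves at depth 1:

    // num_intermediates = 1, sum_children = 3, max_children = 3
    // num_leaves = 3, sum_depth = 3, max_depth = 1
    double avg_children = 1. * stats.sum_children / stats.num_intermediates;  // 3.000
    double avg_depth = 1. * stats.sum_depth / stats.num_leaves;               // 1.000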
diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h
index 8821799..013b926 100644
--- a/icing/legacy/index/icing-dynamic-trie.h
+++ b/icing/legacy/index/icing-dynamic-trie.h
@@ -152,8 +152,13 @@ class IcingDynamicTrie : public IIcingStorage {
     uint32_t max_nodes;
 
     // Count of intermediate nodes.
     uint32_t num_intermediates;
+    // Total and maximum number of children of intermediate nodes.
+    uint32_t sum_children, max_children;
+
     // Count of leaf nodes.
     uint32_t num_leaves;
+    // Total and maximum depth of leaf nodes.
+    uint32_t sum_depth, max_depth;
 
     // Next stats
@@ -186,6 +191,7 @@ class IcingDynamicTrie : public IIcingStorage {
     uint32_t dirty_pages_nexts;
     uint32_t dirty_pages_suffixes;
 
+    // TODO(b/222349894) Convert the string output to a protocol buffer instead.
     std::string DumpStats(int verbosity) const;
   };
 
@@ -601,7 +607,8 @@ class IcingDynamicTrie : public IIcingStorage {
   static const uint32_t kInvalidSuffixIndex;
 
   // Stats helpers.
-  void CollectStatsRecursive(const Node &node, Stats *stats) const;
+  void CollectStatsRecursive(const Node &node, Stats *stats,
+                             uint32_t depth = 0) const;
 
   // Helpers for Find and Insert.
   const Next *GetNextByChar(const Node *node, uint8_t key_char) const;
diff --git a/icing/legacy/index/icing-filesystem.h b/icing/legacy/index/icing-filesystem.h
index f645632..ce75a82 100644
--- a/icing/legacy/index/icing-filesystem.h
+++ b/icing/legacy/index/icing-filesystem.h
@@ -224,6 +224,11 @@ class IcingFilesystem {
   // Increments to_increment by size if size is valid, or sets to_increment
   // to kBadFileSize if either size or to_increment is kBadFileSize.
   static void IncrementByOrSetInvalid(uint64_t size, uint64_t *to_increment);
+
+  // Return -1 if file_size is invalid. Otherwise, return file_size.
+  static int64_t SanitizeFileSize(int64_t file_size) {
+    return (file_size != kBadFileSize) ? file_size : -1;
+  }
 };
 
 }  // namespace lib
diff --git a/icing/legacy/index/icing-flash-bitmap.h b/icing/legacy/index/icing-flash-bitmap.h
index e3ba0e2..6bb9591 100644
--- a/icing/legacy/index/icing-flash-bitmap.h
+++ b/icing/legacy/index/icing-flash-bitmap.h
@@ -138,6 +138,7 @@ class IcingFlashBitmap {
   // Upgrade for version 18.
   bool UpgradeTo18();
 
+  // Legacy file system. Switch to use the new Filesystem class instead.
   const IcingFilesystem *const filesystem_;
   std::string filename_;
   OpenType open_type_;
diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc
index bdd40aa..e48fe78 100644
--- a/icing/query/query-processor_benchmark.cc
+++ b/icing/query/query-processor_benchmark.cc
@@ -16,7 +16,6 @@
 #include "gmock/gmock.h"
 #include "third_party/absl/flags/flag.h"
 #include "icing/document-builder.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/index/index.h"
 #include "icing/proto/term.pb.h"
 #include "icing/query/query-processor.h"
@@ -24,6 +23,7 @@
 #include "icing/schema/section.h"
 #include "icing/store/document-id.h"
 #include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
 #include "icing/tokenization/language-segmenter-factory.h"
diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc
index ef12b10..950f739 100644
--- a/icing/query/query-processor_test.cc
+++ b/icing/query/query-processor_test.cc
@@ -23,7 +23,6 @@
 #include "gtest/gtest.h"
 #include "icing/document-builder.h"
 #include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/index/hit/doc-hit-info.h"
 #include "icing/index/index.h"
 #include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
@@ -40,6 +39,7 @@
 #include "icing/store/document-store.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/jni-test-helpers.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
diff --git a/icing/query/suggestion-processor_test.cc b/icing/query/suggestion-processor_test.cc
index 2f22a47..ba4c90a 100644
--- a/icing/query/suggestion-processor_test.cc
+++ b/icing/query/suggestion-processor_test.cc
@@ -15,11 +15,11 @@
 #include "icing/query/suggestion-processor.h"
 
 #include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/store/document-store.h"
 #include "icing/testing/always-true-namespace-checker-impl.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/jni-test-helpers.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
"icing/testing/tmp-directory.h" diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc index 2695433..0d812e4 100644 --- a/icing/result/result-retriever_test.cc +++ b/icing/result/result-retriever_test.cc @@ -22,7 +22,6 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -36,6 +35,7 @@ #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -362,8 +362,8 @@ TEST_F(ResultRetrieverTest, NotIgnoreErrors) { TEST_F(ResultRetrieverTest, IOErrorShouldReturnInternalError) { MockFilesystem mock_filesystem; - ON_CALL(mock_filesystem, OpenForRead(_)).WillByDefault(Return(false)); - + ON_CALL(mock_filesystem, PRead(A<int>(), A<void*>(), A<size_t>(), A<off_t>())) + .WillByDefault(Return(false)); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&mock_filesystem, test_dir_, &fake_clock_, diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc index 6af7017..bd1524e 100644 --- a/icing/result/snippet-retriever.cc +++ b/icing/result/snippet-retriever.cc @@ -78,26 +78,25 @@ inline std::string AddIndexToPath(int values_size, int index, // Returns a string of the normalized text of the input Token. Normalization // is applied based on the Token's type. -std::string NormalizeToken(const Normalizer& normalizer, - const Token& token) { +std::string NormalizeToken(const Normalizer& normalizer, const Token& token) { switch (token.type) { - case Token::REGULAR: + case Token::Type::REGULAR: return normalizer.NormalizeTerm(token.text); - case Token::VERBATIM: + case Token::Type::VERBATIM: return std::string(token.text); - case Token::QUERY_EXCLUSION: + case Token::Type::QUERY_EXCLUSION: [[fallthrough]]; - case Token::QUERY_LEFT_PARENTHESES: + case Token::Type::QUERY_LEFT_PARENTHESES: [[fallthrough]]; - case Token::QUERY_RIGHT_PARENTHESES: + case Token::Type::QUERY_RIGHT_PARENTHESES: [[fallthrough]]; - case Token::QUERY_OR: + case Token::Type::QUERY_OR: [[fallthrough]]; - case Token::QUERY_PROPERTY: + case Token::Type::QUERY_PROPERTY: [[fallthrough]]; - case Token::INVALID: + case Token::Type::INVALID: ICING_LOG(WARNING) << "Unable to normalize token of type: " - << token.type; + << static_cast<int>(token.type); return std::string(token.text); } } @@ -107,7 +106,7 @@ std::string NormalizeToken(const Normalizer& normalizer, CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token, const std::string& match_query_term) { switch (token.type) { - case Token::VERBATIM: { + case Token::Type::VERBATIM: { // VERBATIM tokens are not normalized. This means the non-normalized // matched query term must be either equal to or a prefix of the token's // text. 
Therefore, the match must end at the end of the matched query @@ -117,22 +116,22 @@ CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token, verbatim_match_end.AdvanceToUtf8(match_query_term.length()); return verbatim_match_end; } - case Token::QUERY_EXCLUSION: + case Token::Type::QUERY_EXCLUSION: [[fallthrough]]; - case Token::QUERY_LEFT_PARENTHESES: + case Token::Type::QUERY_LEFT_PARENTHESES: [[fallthrough]]; - case Token::QUERY_RIGHT_PARENTHESES: + case Token::Type::QUERY_RIGHT_PARENTHESES: [[fallthrough]]; - case Token::QUERY_OR: + case Token::Type::QUERY_OR: [[fallthrough]]; - case Token::QUERY_PROPERTY: + case Token::Type::QUERY_PROPERTY: [[fallthrough]]; - case Token::INVALID: + case Token::Type::INVALID: ICING_LOG(WARNING) - << "Unexpected Token type " << token.type + << "Unexpected Token type " << static_cast<int>(token.type) << " found when finding match end of query term and token."; [[fallthrough]]; - case Token::REGULAR: + case Token::Type::REGULAR: return normalizer.FindNormalizedMatchEndPosition(token.text, match_query_term); } diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc index 365eba2..0de2295 100644 --- a/icing/result/snippet-retriever_test.cc +++ b/icing/result/snippet-retriever_test.cc @@ -22,7 +22,6 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -37,6 +36,7 @@ #include "icing/store/key-mapper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc index 67528ab..b5c976f 100644 --- a/icing/schema/schema-store.cc +++ b/icing/schema/schema-store.cc @@ -268,7 +268,7 @@ libtextclassifier3::Status SchemaStore::UpdateHeader(const Crc32& checksum) { libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). schema_type_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. 
libtextclassifier3::Status status = KeyMapper<SchemaTypeId>::Delete( filesystem_, MakeSchemaTypeMapperFilename(base_dir_)); @@ -464,11 +464,8 @@ libtextclassifier3::Status SchemaStore::PersistToDisk() { SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const { SchemaStoreStorageInfoProto storage_info; int64_t directory_size = filesystem_.GetDiskUsage(base_dir_.c_str()); - if (directory_size != Filesystem::kBadFileSize) { - storage_info.set_schema_store_size(directory_size); - } else { - storage_info.set_schema_store_size(-1); - } + storage_info.set_schema_store_size( + Filesystem::SanitizeFileSize(directory_size)); ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info); storage_info.set_num_schema_types(schema->types_size()); int total_sections = 0; diff --git a/icing/store/document-log-creator.cc b/icing/store/document-log-creator.cc index 5e0426e..5e23a8e 100644 --- a/icing/store/document-log-creator.cc +++ b/icing/store/document-log-creator.cc @@ -72,19 +72,20 @@ DocumentLogCreator::Create(const Filesystem* filesystem, bool v1_exists = filesystem->FileExists(MakeDocumentLogFilenameV1(base_dir).c_str()); - bool regen_derived_files = false; + bool new_file = false; + int preexisting_file_version = kCurrentVersion; if (v0_exists && !v1_exists) { ICING_RETURN_IF_ERROR(MigrateFromV0ToV1(filesystem, base_dir)); // Need to regenerate derived files since documents may be written to a // different file offset in the log. - regen_derived_files = true; + preexisting_file_version = 0; } else if (!v1_exists) { // First time initializing a v1 log. There are no existing derived files at // this point, so we should generate some. "regenerate" here also means // "generate for the first time", i.e. we shouldn't expect there to be any // existing derived files. - regen_derived_files = true; + new_file = true; } ICING_ASSIGN_OR_RETURN( @@ -96,7 +97,7 @@ DocumentLogCreator::Create(const Filesystem* filesystem, /*compress_in=*/true))); CreateResult create_result = {std::move(log_create_result), - regen_derived_files}; + preexisting_file_version, new_file}; return create_result; } diff --git a/icing/store/document-log-creator.h b/icing/store/document-log-creator.h index 51cf497..be8feed 100644 --- a/icing/store/document-log-creator.h +++ b/icing/store/document-log-creator.h @@ -30,14 +30,20 @@ namespace lib { // be necessary. class DocumentLogCreator { public: + // Version 0 refers to FileBackedProtoLog + // Version 1 refers to PortableFileBackedProtoLog with kFileFormatVersion = 0 + static constexpr int32_t kCurrentVersion = 1; struct CreateResult { // The create result passed up from the PortableFileBackedProtoLog::Create. // Contains the document log. PortableFileBackedProtoLog<DocumentWrapper>::CreateResult log_create_result; - // Whether the caller needs to also regenerate/generate any derived files - // based off of the initialized document log. - bool regen_derived_files; + // The version number of the pre-existing document log file. + // If there is no document log file, it will be set to kCurrentVersion. + int preexisting_file_version; + + // Whether the created file is new. + bool new_file; }; // Creates the document log in the base_dir. 
Will create one if it doesn't diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index 226a96b..a2ae5f9 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -164,6 +164,32 @@ int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms, return expiration_timestamp_ms; } +InitializeStatsProto::RecoveryCause GetRecoveryCause( + const DocumentLogCreator::CreateResult& create_result, + bool force_recovery_and_revalidate_documents) { + if (force_recovery_and_revalidate_documents) { + return InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC; + } else if (create_result.log_create_result.has_data_loss()) { + return InitializeStatsProto::DATA_LOSS; + } else if (create_result.preexisting_file_version != + DocumentLogCreator::kCurrentVersion) { + return InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT; + } + return InitializeStatsProto::NONE; +} + +InitializeStatsProto::DocumentStoreDataStatus GetDataStatus( + DataLoss data_loss) { + switch (data_loss) { + case DataLoss::PARTIAL: + return InitializeStatsProto::PARTIAL_LOSS; + case DataLoss::COMPLETE: + return InitializeStatsProto::COMPLETE_LOSS; + case DataLoss::NONE: + return InitializeStatsProto::NO_DATA_LOSS; + } +} + } // namespace DocumentStore::DocumentStore(const Filesystem* filesystem, @@ -236,44 +262,34 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( std::move(create_result_or).ValueOrDie(); document_log_ = std::move(create_result.log_create_result.proto_log); - - if (create_result.regen_derived_files || - force_recovery_and_revalidate_documents || - create_result.log_create_result.has_data_loss()) { + InitializeStatsProto::RecoveryCause recovery_cause = + GetRecoveryCause(create_result, force_recovery_and_revalidate_documents); + + if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) { + ICING_LOG(WARNING) << "Starting Document Store Recovery with cause=" + << recovery_cause << ", and create result { new_file=" + << create_result.new_file << ", preexisting_file_version=" + << create_result.preexisting_file_version << ", data_loss=" + << create_result.log_create_result.data_loss << "} and kCurrentVersion=" + << DocumentLogCreator::kCurrentVersion; // We can't rely on any existing derived files. Recreate them from scratch. // Currently happens if: // 1) This is a new log and we don't have derived files yet // 2) Client wanted us to force a regeneration. // 3) Log has some data loss, can't rely on existing derived data. - if (create_result.log_create_result.has_data_loss() && - initialize_stats != nullptr) { - ICING_LOG(WARNING) - << "Data loss in document log, regenerating derived files."; - initialize_stats->set_document_store_recovery_cause( - InitializeStatsProto::DATA_LOSS); - - if (create_result.log_create_result.data_loss == DataLoss::PARTIAL) { - // Ground truth is partially lost. - initialize_stats->set_document_store_data_status( - InitializeStatsProto::PARTIAL_LOSS); - } else { - // Ground truth is completely lost.
- initialize_stats->set_document_store_data_status( - InitializeStatsProto::COMPLETE_LOSS); - } - } - std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer(); libtextclassifier3::Status status = RegenerateDerivedFiles(force_recovery_and_revalidate_documents); if (initialize_stats != nullptr && - (force_recovery_and_revalidate_documents || - create_result.log_create_result.has_data_loss())) { + recovery_cause != InitializeStatsProto::NONE) { // Only consider it a recovery if the client forced a recovery or there // was data loss. Otherwise, this could just be the first time we're // initializing and generating derived files. initialize_stats->set_document_store_recovery_latency_ms( document_recovery_timer->GetElapsedMilliseconds()); + initialize_stats->set_document_store_recovery_cause(recovery_cause); + initialize_stats->set_document_store_data_status( + GetDataStatus(create_result.log_create_result.data_loss)); } if (!status.ok()) { ICING_LOG(ERROR) @@ -282,13 +298,13 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( } } else { if (!InitializeExistingDerivedFiles().ok()) { - ICING_VLOG(1) + ICING_LOG(WARNING) << "Couldn't find derived files or failed to initialize them, " "regenerating derived files for DocumentStore."; std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer(); libtextclassifier3::Status status = RegenerateDerivedFiles( - /*force_recovery_and_revalidate_documents*/ false); - if (initialize_stats != nullptr && num_documents() > 0) { + /*force_recovery_and_revalidate_documents=*/false); + if (initialize_stats != nullptr) { initialize_stats->set_document_store_recovery_cause( InitializeStatsProto::IO_ERROR); initialize_stats->set_document_store_recovery_latency_ms( @@ -530,7 +546,7 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles( libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). document_key_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = KeyMapper<DocumentId>::Delete(*filesystem_, base_dir_); @@ -540,7 +556,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { return status; } - // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN + // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. auto document_key_mapper_or = KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize); @@ -556,7 +572,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). document_id_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. 
libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete( *filesystem_, MakeDocumentIdMapperFilename(base_dir_)); @@ -565,7 +581,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() { << "Failed to delete old document_id mapper"; return status; } - // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN + // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. auto document_id_mapper_or = FileBackedVector<int64_t>::Create( *filesystem_, MakeDocumentIdMapperFilename(base_dir_), @@ -618,7 +634,7 @@ libtextclassifier3::Status DocumentStore::ResetFilterCache() { libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). namespace_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = KeyMapper<NamespaceId>::Delete( *filesystem_, MakeNamespaceMapperFilename(base_dir_)); @@ -638,7 +654,7 @@ libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() { libtextclassifier3::Status DocumentStore::ResetCorpusMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). corpus_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = KeyMapper<CorpusId>::Delete( *filesystem_, MakeCorpusMapperFilename(base_dir_)); @@ -1749,5 +1765,63 @@ libtextclassifier3::Status DocumentStore::SetUsageScores( return usage_store_->SetUsageScores(document_id, usage_scores); } +libtextclassifier3::StatusOr< + google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>> +DocumentStore::CollectCorpusInfo() const { + google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo> + corpus_info; + libtextclassifier3::StatusOr<const SchemaProto*> schema_proto_or = + schema_store_->GetSchema(); + if (!schema_proto_or.ok()) { + return corpus_info; + } + // Maps from CorpusId to the corresponding protocol buffer in the result. 
+ std::unordered_map<CorpusId, DocumentDebugInfoProto::CorpusInfo*> info_map; + std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace = + namespace_mapper_->GetValuesToKeys(); + const SchemaProto* schema_proto = schema_proto_or.ValueOrDie(); + for (DocumentId document_id = 0; document_id < filter_cache_->num_elements(); + ++document_id) { + if (!InternalDoesDocumentExist(document_id)) { + continue; + } + ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data, + filter_cache_->Get(document_id)); + ICING_ASSIGN_OR_RETURN(const DocumentAssociatedScoreData* score_data, + score_cache_->Get(document_id)); + const std::string& name_space = + namespace_id_to_namespace[filter_data->namespace_id()]; + const std::string& schema = + schema_proto->types()[filter_data->schema_type_id()].schema_type(); + auto iter = info_map.find(score_data->corpus_id()); + if (iter == info_map.end()) { + DocumentDebugInfoProto::CorpusInfo* entry = corpus_info.Add(); + entry->set_namespace_(name_space); + entry->set_schema(schema); + iter = info_map.insert({score_data->corpus_id(), entry}).first; + } + iter->second->set_total_documents(iter->second->total_documents() + 1); + iter->second->set_total_token(iter->second->total_token() + + score_data->length_in_tokens()); + } + return corpus_info; +} + +libtextclassifier3::StatusOr<DocumentDebugInfoProto> +DocumentStore::GetDebugInfo(int verbosity) const { + DocumentDebugInfoProto debug_info; + *debug_info.mutable_document_storage_info() = GetStorageInfo(); + ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum()); + debug_info.set_crc(crc.Get()); + if (verbosity > 0) { + ICING_ASSIGN_OR_RETURN(google::protobuf::RepeatedPtrField< + DocumentDebugInfoProto::CorpusInfo> + corpus_info, + CollectCorpusInfo()); + *debug_info.mutable_corpus_info() = std::move(corpus_info); + } + return debug_info; +} + } // namespace lib } // namespace icing diff --git a/icing/store/document-store.h b/icing/store/document-store.h index c85c989..e6d2e5c 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -27,6 +27,7 @@ #include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" #include "icing/file/portable-file-backed-proto-log.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/document.pb.h" #include "icing/proto/document_wrapper.pb.h" #include "icing/proto/logging.pb.h" @@ -422,6 +423,17 @@ class DocumentStore { // INTERNAL_ERROR on compute error libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const; + // Get debug information for the document store. + // verbosity <= 0, simplest debug information + // verbosity > 0, also return the total number of documents and tokens in each + // (namespace, schema type) pair. + // + // Returns: + // DocumentDebugInfoProto on success + // INTERNAL_ERROR on IO errors, crc compute error + libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo( + int verbosity) const; + private: // Use DocumentStore::Create() to instantiate. DocumentStore(const Filesystem* filesystem, std::string_view base_dir, @@ -696,6 +708,13 @@ class DocumentStore { // the document_id_mapper somehow became larger than the filter cache. DocumentStorageInfoProto CalculateDocumentStatusCounts( DocumentStorageInfoProto storage_info) const; + + // Returns: + // - on success, a RepeatedPtrField for CorpusInfo collected. + // - OUT_OF_RANGE, this should never happen. 
+ libtextclassifier3::StatusOr<google::protobuf::RepeatedPtrField< + DocumentDebugInfoProto::CorpusInfo>> + CollectCorpusInfo() const; }; } // namespace lib diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index 78d2f9c..96d11bf 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -29,7 +29,6 @@ #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" #include "icing/file/mock-filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -45,6 +44,7 @@ #include "icing/store/namespace-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -3170,15 +3170,6 @@ TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE)); } -// TODO(b/185845269) Re-enable this test by copying over a full valid set of -// document store files. Right now this test only includes the score_cache and -// the document store header. -// -// This causes a problem now because this cl changes behavior to not consider an -// InitializeExistingDerivedFiles failure to be a recovery if there is nothing -// to recover because the doocument store is empty. -#define DISABLE_BACKWARDS_COMPAT_TEST -#ifndef DISABLE_BACKWARDS_COMPAT_TEST TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { // The directory testdata/score_cache_without_length_in_tokens/document_store // contains only the scoring_cache and the document_store_header (holding the @@ -3194,29 +3185,26 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { // Get src files std::string document_store_without_length_in_tokens; - if (IsAndroidPlatform() || IsIosPlatform()) { + if (IsAndroidArm() || IsIosPlatform()) { document_store_without_length_in_tokens = GetTestFilePath( "icing/testdata/score_cache_without_length_in_tokens/" "document_store_android_ios_compatible"); + } else if (IsAndroidX86()) { + document_store_without_length_in_tokens = GetTestFilePath( + "icing/testdata/score_cache_without_length_in_tokens/" + "document_store_android_x86"); } else { document_store_without_length_in_tokens = GetTestFilePath( "icing/testdata/score_cache_without_length_in_tokens/" "document_store"); } - std::vector<std::string> document_store_files; Filesystem filesystem; - filesystem.ListDirectory(document_store_without_length_in_tokens.c_str(), - &document_store_files); - - ICING_LOG(INFO) << "Copying files " << document_store_without_length_in_tokens - << ' ' << document_store_files.size(); - for (size_t i = 0; i != document_store_files.size(); i++) { - std::string src = absl_ports::StrCat( - document_store_without_length_in_tokens, "/", document_store_files[i]); - std::string dst = - absl_ports::StrCat(document_store_dir_, "/", document_store_files[i]); - ASSERT_THAT(filesystem_.CopyFile(src.c_str(), dst.c_str()), true); - } + ICING_LOG(INFO) << "Copying files " + << document_store_without_length_in_tokens; + ASSERT_THAT( + filesystem.CopyDirectory(document_store_without_length_in_tokens.c_str(), + document_store_dir_.c_str(), /*recursive=*/true), + true); InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( @@ -3227,12 +3215,11 @@ 
TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - // The store_cache trigger regeneration because its element size is - // inconsistent: expected 20 (current new size), actual 12 (as per the v0 - // score_cache). - EXPECT_TRUE(initialize_stats.has_document_store_recovery_cause()); + // The document log is using the legacy v0 format, so a migration is + // needed, which will also trigger regeneration. + EXPECT_EQ(initialize_stats.document_store_recovery_cause(), + InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT); } -#endif // DISABLE_BACKWARDS_COMPAT_TEST TEST_F(DocumentStoreTest, DocumentStoreStorageInfo) { ICING_ASSERT_OK_AND_ASSIGN( @@ -3422,18 +3409,22 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { { // Create the document store the second time and force recovery + InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create( - &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(), - /*force_recovery_and_revalidate_documents=*/true)); + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store.get(), + /*force_recovery_and_revalidate_documents=*/true, + &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); // Ensure that the type id of the email document has been correctly updated. ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data, doc_store->GetDocumentFilterData(docid)); - ASSERT_THAT(filter_data.schema_type_id(), Eq(1)); + EXPECT_THAT(filter_data.schema_type_id(), Eq(1)); + EXPECT_THAT(initialize_stats.document_store_recovery_cause(), + Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC)); } } @@ -3840,7 +3831,8 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { // Check that we didn't lose anything. A migration also doesn't technically // count as a recovery. EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE)); - EXPECT_FALSE(initialize_stats.has_document_store_recovery_cause()); + EXPECT_EQ(initialize_stats.document_store_recovery_cause(), + InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT); // Document 1 and 3 were put normally, and document 2 was deleted in our // testdata files.
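Taken together, GetRecoveryCause and the stats plumbing above let callers tell a legacy-format migration apart from genuine data loss. A hedged sketch of reading the stats after creation, in the style of the tests above; the handling in each case is illustrative, not prescribed by this change:

InitializeStatsProto initialize_stats;
ICING_ASSERT_OK_AND_ASSIGN(
    DocumentStore::CreateResult create_result,
    DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
                          schema_store.get(),
                          /*force_recovery_and_revalidate_documents=*/false,
                          &initialize_stats));
switch (initialize_stats.document_store_recovery_cause()) {
  case InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT:
    // A v0 log was migrated to the portable format; derived files were
    // regenerated, but no documents were lost.
    break;
  case InitializeStatsProto::DATA_LOSS:
    // document_store_data_status() distinguishes PARTIAL_LOSS from
    // COMPLETE_LOSS.
    break;
  default:
    break;
}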
@@ -3862,6 +3854,164 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { IsOkAndHolds(EqualsProto(document3))); } +TEST_F(DocumentStoreTest, GetDebugInfo) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + std::string schema_store_dir = schema_store_dir_ + "_custom"; + filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); + filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_)); + + ICING_ASSERT_OK(schema_store->SetSchema(schema)); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "aa bb cc") + .AddStringProperty("body", "dd ee") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK(document_store->Put(document1, 5)); + + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace2", "email/2") + .SetSchema("email") + .AddStringProperty("subject", "aa bb") + .AddStringProperty("body", "cc") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK(document_store->Put(document2, 3)); + + DocumentProto document3 = DocumentBuilder() + .SetKey("namespace2", "email/3") + .SetSchema("email") + .AddStringProperty("subject", "aa") + .AddStringProperty("body", "") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK(document_store->Put(document3, 1)); + + DocumentProto document4 = DocumentBuilder() + .SetKey("namespace1", "person/1") + .SetSchema("person") + .AddStringProperty("name", "test test") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK(document_store->Put(document4, 2)); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out1, + document_store->GetDebugInfo(/*verbosity=*/1)); + EXPECT_THAT(out1.crc(), Gt(0)); + EXPECT_THAT(out1.document_storage_info().num_alive_documents(), Eq(4)); + EXPECT_THAT(out1.document_storage_info().num_deleted_documents(), Eq(0)); + EXPECT_THAT(out1.document_storage_info().num_expired_documents(), Eq(0)); + + DocumentDebugInfoProto::CorpusInfo info1, info2, info3; + info1.set_namespace_("namespace1"); + info1.set_schema("email"); + info1.set_total_documents(1); // document1 + info1.set_total_token(5); + + info2.set_namespace_("namespace2"); + info2.set_schema("email"); + info2.set_total_documents(2); // document2 and document3 + info2.set_total_token(4); // 3 + 1 + + info3.set_namespace_("namespace1"); + info3.set_schema("person"); + info3.set_total_documents(1); // document4 + info3.set_total_token(2); + + EXPECT_THAT(out1.corpus_info(), + UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2), + EqualsProto(info3))); + + // Delete 
document3. + ICING_ASSERT_OK(document_store->Delete("namespace2", "email/3")); + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out2, + document_store->GetDebugInfo(/*verbosity=*/1)); + EXPECT_THAT(out2.crc(), Gt(0)); + EXPECT_THAT(out2.crc(), Not(Eq(out1.crc()))); + EXPECT_THAT(out2.document_storage_info().num_alive_documents(), Eq(3)); + EXPECT_THAT(out2.document_storage_info().num_deleted_documents(), Eq(1)); + EXPECT_THAT(out2.document_storage_info().num_expired_documents(), Eq(0)); + info2.set_total_documents(1); // document2 + info2.set_total_token(3); + EXPECT_THAT(out2.corpus_info(), + UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2), + EqualsProto(info3))); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out3, + document_store->GetDebugInfo(/*verbosity=*/0)); + EXPECT_THAT(out3.corpus_info(), IsEmpty()); +} + +TEST_F(DocumentStoreTest, GetDebugInfoWithoutSchema) { + std::string schema_store_dir = schema_store_dir_ + "_custom"; + filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); + filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_)); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out, + document_store->GetDebugInfo(/*verbosity=*/1)); + EXPECT_THAT(out.crc(), Gt(0)); + EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0)); + EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0)); + EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0)); + EXPECT_THAT(out.corpus_info(), IsEmpty()); +} + +TEST_F(DocumentStoreTest, GetDebugInfoForEmptyDocumentStore) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out, + document_store->GetDebugInfo(/*verbosity=*/1)); + EXPECT_THAT(out.crc(), Gt(0)); + EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0)); + EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0)); + EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0)); + EXPECT_THAT(out.corpus_info(), IsEmpty()); +} + } // namespace } // namespace lib diff --git a/icing/helpers/icu/icu-data-file-helper.cc b/icing/testing/icu-data-file-helper.cc index 6607c40..aaeb738 100644 --- a/icing/helpers/icu/icu-data-file-helper.cc +++ b/icing/testing/icu-data-file-helper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/helpers/icu/icu-data-file-helper.h" +#include "icing/testing/icu-data-file-helper.h" #include <sys/mman.h> diff --git a/icing/helpers/icu/icu-data-file-helper.h b/icing/testing/icu-data-file-helper.h index 90f5bc7..d0276e7 100644 --- a/icing/helpers/icu/icu-data-file-helper.h +++ b/icing/testing/icu-data-file-helper.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER -#define ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER +#ifndef ICING_TESTING_ICU_DATA_FILE_HELPER +#define ICING_TESTING_ICU_DATA_FILE_HELPER #include "icing/text_classifier/lib3/utils/base/status.h" @@ -40,4 +40,4 @@ libtextclassifier3::Status SetUpICUDataFile( } // namespace lib } // namespace icing -#endif // ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER +#endif // ICING_TESTING_ICU_DATA_FILE_HELPER diff --git a/icing/testing/random-string.h b/icing/testing/random-string.h index 3165bf6..fd8d87b 100644 --- a/icing/testing/random-string.h +++ b/icing/testing/random-string.h @@ -15,6 +15,7 @@ #ifndef ICING_TESTING_RANDOM_STRING_H_ #define ICING_TESTING_RANDOM_STRING_H_ +#include <algorithm> #include <random> #include <string> diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc index 598ede7..8e0f789 100644 --- a/icing/tokenization/icu/icu-language-segmenter.cc +++ b/icing/tokenization/icu/icu-language-segmenter.cc @@ -59,34 +59,35 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { ~IcuLanguageSegmenterIterator() { ubrk_close(break_iterator_); - utext_close(&u_text_); + utext_close(u_text_); } // Advances to the next term. Returns false if it has reached the end. bool Advance() override { - // Prerequisite check - if (term_end_index_exclusive_ == UBRK_DONE) { - return false; - } + while (true) { + // Prerequisite check + if (term_end_index_exclusive_ == UBRK_DONE) { + return false; + } - if (term_end_index_exclusive_ == 0) { - // First Advance() call - term_start_index_ = ubrk_first(break_iterator_); - } else { - term_start_index_ = term_end_index_exclusive_; - } - term_end_index_exclusive_ = ubrk_next(break_iterator_); + if (term_end_index_exclusive_ == 0) { + // First Advance() call + term_start_index_ = ubrk_first(break_iterator_); + } else { + term_start_index_ = term_end_index_exclusive_; + } + term_end_index_exclusive_ = ubrk_next(break_iterator_); - // Reached the end - if (term_end_index_exclusive_ == UBRK_DONE) { - MarkAsDone(); - return false; - } + // Reached the end + if (term_end_index_exclusive_ == UBRK_DONE) { + MarkAsDone(); + return false; + } - if (!IsValidSegment()) { - return Advance(); + if (IsValidSegment()) { + return true; + } } - return true; } // Returns the current term. It can be called only when Advance() returns @@ -253,7 +254,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { : break_iterator_(nullptr), text_(text), locale_(locale), - u_text_(UTEXT_INITIALIZER), + u_text_(nullptr), offset_iterator_(text), term_start_index_(0), term_end_index_exclusive_(0) {} @@ -261,10 +262,13 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Returns true on success bool Initialize() { UErrorCode status = U_ZERO_ERROR; - utext_openUTF8(&u_text_, text_.data(), text_.length(), &status); + u_text_ = utext_openUTF8(nullptr, text_.data(), text_.length(), &status); + if (u_text_ == nullptr) { + return false; + } break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr, /*textLength=*/0, &status); - ubrk_setUText(break_iterator_, &u_text_, &status); + ubrk_setUText(break_iterator_, u_text_, &status); return !U_FAILURE(status); } @@ -322,8 +326,8 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { std::string_view locale_; // A thin wrapper around the input UTF8 text, needed by break_iterator_. - // utext_close() must be called after using. 
- UText u_text_; + // Allocated by calling utext_openUTF8() and freed by calling utext_close(). + UText* u_text_; // Offset iterator. This iterator is not guaranteed to point to any particular // character, but is guaranteed to point to a valid UTF character sequence. diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc index 3090087..fe0b96e 100644 --- a/icing/tokenization/icu/icu-language-segmenter_test.cc +++ b/icing/tokenization/icu/icu-language-segmenter_test.cc @@ -21,8 +21,8 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/icu-i18n-test-utils.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc index d293581..3aff45c 100644 --- a/icing/tokenization/language-segmenter-iterator_test.cc +++ b/icing/tokenization/language-segmenter-iterator_test.cc @@ -15,9 +15,9 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc index bd86169..6f7d4df 100644 --- a/icing/tokenization/language-segmenter_benchmark.cc +++ b/icing/tokenization/language-segmenter_benchmark.cc @@ -14,8 +14,8 @@ #include "testing/base/public/benchmark.h" #include "gmock/gmock.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc index df98bee..7a1949f 100644 --- a/icing/tokenization/plain-tokenizer.cc +++ b/icing/tokenization/plain-tokenizer.cc @@ -66,9 +66,9 @@ class PlainTokenIterator : public Tokenizer::Iterator { Token GetToken() const override { if (current_term_.empty()) { - return Token(Token::INVALID); + return Token(Token::Type::INVALID); } - return Token(Token::REGULAR, current_term_); + return Token(Token::Type::REGULAR, current_term_); } libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart() diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc index f76ba9f..c48b51e 100644 --- a/icing/tokenization/plain-tokenizer_test.cc +++ b/icing/tokenization/plain-tokenizer_test.cc @@ -18,9 +18,9 @@ #include "gmock/gmock.h" #include "icing/absl_ports/str_cat.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/icu-i18n-test-utils.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" @@ -68,26 +68,27 @@ TEST_F(PlainTokenizerTest, Simple) {
EXPECT_THAT(plain_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(plain_tokenizer->TokenizeAll("Hello World"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("Hello World"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); EXPECT_THAT( plain_tokenizer->TokenizeAll( "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " "Duis efficitur iaculis auctor."), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Lorem"), - EqualsToken(Token::REGULAR, "ipsum"), - EqualsToken(Token::REGULAR, "dolor"), - EqualsToken(Token::REGULAR, "sit"), - EqualsToken(Token::REGULAR, "amet"), - EqualsToken(Token::REGULAR, "consectetur"), - EqualsToken(Token::REGULAR, "adipiscing"), - EqualsToken(Token::REGULAR, "elit"), - EqualsToken(Token::REGULAR, "Duis"), - EqualsToken(Token::REGULAR, "efficitur"), - EqualsToken(Token::REGULAR, "iaculis"), - EqualsToken(Token::REGULAR, "auctor")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Lorem"), + EqualsToken(Token::Type::REGULAR, "ipsum"), + EqualsToken(Token::Type::REGULAR, "dolor"), + EqualsToken(Token::Type::REGULAR, "sit"), + EqualsToken(Token::Type::REGULAR, "amet"), + EqualsToken(Token::Type::REGULAR, "consectetur"), + EqualsToken(Token::Type::REGULAR, "adipiscing"), + EqualsToken(Token::Type::REGULAR, "elit"), + EqualsToken(Token::Type::REGULAR, "Duis"), + EqualsToken(Token::Type::REGULAR, "efficitur"), + EqualsToken(Token::Type::REGULAR, "iaculis"), + EqualsToken(Token::Type::REGULAR, "auctor")))); } TEST_F(PlainTokenizerTest, Whitespace) { @@ -107,16 +108,18 @@ TEST_F(PlainTokenizerTest, Whitespace) { // 0x0009 is horizontal tab, considered as a whitespace std::string text_with_horizontal_tab = absl_ports::StrCat("Hello", UCharToString(0x0009), "World"); - EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_horizontal_tab), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll(text_with_horizontal_tab), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); // 0x000B is vertical tab, considered as a whitespace std::string text_with_vertical_tab = absl_ports::StrCat("Hello", UCharToString(0x000B), "World"); - EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_vertical_tab), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll(text_with_vertical_tab), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); } TEST_F(PlainTokenizerTest, Punctuation) { @@ -131,38 +134,39 @@ TEST_F(PlainTokenizerTest, Punctuation) { language_segmenter.get())); // Half-width punctuation marks are filtered out. - EXPECT_THAT(plain_tokenizer->TokenizeAll( - "Hello, World! Hello: World. \"Hello\" World?"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World"), - EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World"), - EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll( + "Hello, World! Hello: World. 
\"Hello\" World?"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World"), + EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World"), + EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); // Full-width punctuation marks are filtered out. std::vector<std::string_view> exp_tokens; if (IsCfStringTokenization()) { EXPECT_THAT( plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你"), - EqualsToken(Token::REGULAR, "好"), - EqualsToken(Token::REGULAR, "世界"), - EqualsToken(Token::REGULAR, "你"), - EqualsToken(Token::REGULAR, "好"), - EqualsToken(Token::REGULAR, "世界"), - EqualsToken(Token::REGULAR, "你"), - EqualsToken(Token::REGULAR, "好"), - EqualsToken(Token::REGULAR, "世界")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你"), + EqualsToken(Token::Type::REGULAR, "好"), + EqualsToken(Token::Type::REGULAR, "世界"), + EqualsToken(Token::Type::REGULAR, "你"), + EqualsToken(Token::Type::REGULAR, "好"), + EqualsToken(Token::Type::REGULAR, "世界"), + EqualsToken(Token::Type::REGULAR, "你"), + EqualsToken(Token::Type::REGULAR, "好"), + EqualsToken(Token::Type::REGULAR, "世界")))); } else { EXPECT_THAT( plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你好"), - EqualsToken(Token::REGULAR, "世界"), - EqualsToken(Token::REGULAR, "你好"), - EqualsToken(Token::REGULAR, "世界"), - EqualsToken(Token::REGULAR, "你好"), - EqualsToken(Token::REGULAR, "世界")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你好"), + EqualsToken(Token::Type::REGULAR, "世界"), + EqualsToken(Token::Type::REGULAR, "你好"), + EqualsToken(Token::Type::REGULAR, "世界"), + EqualsToken(Token::Type::REGULAR, "你好"), + EqualsToken(Token::Type::REGULAR, "世界")))); } } @@ -180,14 +184,16 @@ TEST_F(PlainTokenizerTest, SpecialCharacters) { // Right now we don't have special logic for these characters, just output // them as tokens. 
- EXPECT_THAT(plain_tokenizer->TokenizeAll("1+1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "1"), - EqualsToken(Token::REGULAR, "+"), - EqualsToken(Token::REGULAR, "1")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("1+1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "1"), + EqualsToken(Token::Type::REGULAR, "+"), + EqualsToken(Token::Type::REGULAR, "1")))); - EXPECT_THAT(plain_tokenizer->TokenizeAll("$50"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "$"), - EqualsToken(Token::REGULAR, "50")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("$50"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "$"), + EqualsToken(Token::Type::REGULAR, "50")))); } TEST_F(PlainTokenizerTest, CJKT) { @@ -203,12 +209,13 @@ TEST_F(PlainTokenizerTest, CJKT) { tokenizer_factory::CreateIndexingTokenizer( StringIndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); - EXPECT_THAT(plain_tokenizer->TokenizeAll("我每天走路去上班。"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "我"), - EqualsToken(Token::REGULAR, "每天"), - EqualsToken(Token::REGULAR, "走路"), - EqualsToken(Token::REGULAR, "去"), - EqualsToken(Token::REGULAR, "上班")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("我每天走路去上班。"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "我"), + EqualsToken(Token::Type::REGULAR, "每天"), + EqualsToken(Token::Type::REGULAR, "走路"), + EqualsToken(Token::Type::REGULAR, "去"), + EqualsToken(Token::Type::REGULAR, "上班")))); // Japanese options = language_segmenter_factory::SegmenterOptions(ULOC_JAPANESE, jni_cache_.get()); @@ -220,41 +227,44 @@ TEST_F(PlainTokenizerTest, CJKT) { StringIndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); if (IsCfStringTokenization()) { - EXPECT_THAT(plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "私"), - EqualsToken(Token::REGULAR, "は"), - EqualsToken(Token::REGULAR, "毎日"), - EqualsToken(Token::REGULAR, "仕事"), - EqualsToken(Token::REGULAR, "に"), - EqualsToken(Token::REGULAR, "歩い"), - EqualsToken(Token::REGULAR, "て"), - EqualsToken(Token::REGULAR, "い"), - EqualsToken(Token::REGULAR, "ます")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"), + EqualsToken(Token::Type::REGULAR, "は"), + EqualsToken(Token::Type::REGULAR, "毎日"), + EqualsToken(Token::Type::REGULAR, "仕事"), + EqualsToken(Token::Type::REGULAR, "に"), + EqualsToken(Token::Type::REGULAR, "歩い"), + EqualsToken(Token::Type::REGULAR, "て"), + EqualsToken(Token::Type::REGULAR, "い"), + EqualsToken(Token::Type::REGULAR, "ます")))); } else { - EXPECT_THAT(plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "私"), - EqualsToken(Token::REGULAR, "は"), - EqualsToken(Token::REGULAR, "毎日"), - EqualsToken(Token::REGULAR, "仕事"), - EqualsToken(Token::REGULAR, "に"), - EqualsToken(Token::REGULAR, "歩"), - EqualsToken(Token::REGULAR, "い"), - EqualsToken(Token::REGULAR, "てい"), - EqualsToken(Token::REGULAR, "ます")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"), + EqualsToken(Token::Type::REGULAR, "は"), + EqualsToken(Token::Type::REGULAR, "毎日"), + EqualsToken(Token::Type::REGULAR, "仕事"), + EqualsToken(Token::Type::REGULAR, "に"), + EqualsToken(Token::Type::REGULAR, "歩"), + EqualsToken(Token::Type::REGULAR, "い"), + EqualsToken(Token::Type::REGULAR, "てい"), + EqualsToken(Token::Type::REGULAR, "ます")))); } // 
Khmer - EXPECT_THAT(plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ញុំ"), - EqualsToken(Token::REGULAR, "ដើរទៅ"), - EqualsToken(Token::REGULAR, "ធ្វើការ"), - EqualsToken(Token::REGULAR, "រាល់ថ្ងៃ")))); - // Korean EXPECT_THAT( - plain_tokenizer->TokenizeAll("나는 매일 출근합니다."), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "나는"), - EqualsToken(Token::REGULAR, "매일"), - EqualsToken(Token::REGULAR, "출근합니다")))); + plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ញុំ"), + EqualsToken(Token::Type::REGULAR, "ដើរទៅ"), + EqualsToken(Token::Type::REGULAR, "ធ្វើការ"), + EqualsToken(Token::Type::REGULAR, "រាល់ថ្ងៃ")))); + // Korean + EXPECT_THAT(plain_tokenizer->TokenizeAll("나는 매일 출근합니다."), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::REGULAR, "나는"), + EqualsToken(Token::Type::REGULAR, "매일"), + EqualsToken(Token::Type::REGULAR, "출근합니다")))); // Thai // DIFFERENCE!! Disagreement over how to segment "ทุกวัน" (iOS groups). @@ -264,19 +274,20 @@ TEST_F(PlainTokenizerTest, CJKT) { std::vector<Token> tokens, plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน")); - EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"), - EqualsToken(Token::REGULAR, "เดิน"), - EqualsToken(Token::REGULAR, "ไป"), - EqualsToken(Token::REGULAR, "ทำงาน"), - EqualsToken(Token::REGULAR, "ทุกวัน"))); + EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"), + EqualsToken(Token::Type::REGULAR, "เดิน"), + EqualsToken(Token::Type::REGULAR, "ไป"), + EqualsToken(Token::Type::REGULAR, "ทำงาน"), + EqualsToken(Token::Type::REGULAR, "ทุกวัน"))); } else { - EXPECT_THAT(plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"), - EqualsToken(Token::REGULAR, "เดิน"), - EqualsToken(Token::REGULAR, "ไป"), - EqualsToken(Token::REGULAR, "ทำงาน"), - EqualsToken(Token::REGULAR, "ทุก"), - EqualsToken(Token::REGULAR, "วัน")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"), + EqualsToken(Token::Type::REGULAR, "เดิน"), + EqualsToken(Token::Type::REGULAR, "ไป"), + EqualsToken(Token::Type::REGULAR, "ทำงาน"), + EqualsToken(Token::Type::REGULAR, "ทุก"), + EqualsToken(Token::Type::REGULAR, "วัน")))); } } @@ -295,7 +306,7 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfterSimple) { auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); EXPECT_TRUE(iterator->ResetToTokenStartingAfter(0)); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "b")); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "b")); EXPECT_FALSE(iterator->ResetToTokenStartingAfter(2)); } @@ -315,7 +326,7 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBeforeSimple) { auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); EXPECT_TRUE(iterator->ResetToTokenEndingBefore(2)); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "f")); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "f")); EXPECT_FALSE(iterator->ResetToTokenEndingBefore(0)); } @@ -332,11 +343,12 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) { language_segmenter.get())); constexpr std::string_view kText = " foo . bar baz.. 
bat "; - EXPECT_THAT(plain_tokenizer->TokenizeAll(kText), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"), - EqualsToken(Token::REGULAR, "bar"), - EqualsToken(Token::REGULAR, "baz"), - EqualsToken(Token::REGULAR, "bat")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll(kText), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"), + EqualsToken(Token::Type::REGULAR, "bar"), + EqualsToken(Token::Type::REGULAR, "baz"), + EqualsToken(Token::Type::REGULAR, "bat")))); std::vector<std::string> expected_text = { "foo", // 0: " foo . bar" "bar", // 1: "foo . bar " @@ -359,12 +371,12 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) { auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); EXPECT_TRUE(iterator->Advance()); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo")); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "foo")); for (int i = 0; i < kText.length(); ++i) { if (i < expected_text.size()) { EXPECT_TRUE(iterator->ResetToTokenStartingAfter(i)); EXPECT_THAT(iterator->GetToken(), - EqualsToken(Token::REGULAR, expected_text[i])); + EqualsToken(Token::Type::REGULAR, expected_text[i])); } else { EXPECT_FALSE(iterator->ResetToTokenStartingAfter(i)); } @@ -383,11 +395,12 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) { language_segmenter.get())); constexpr std::string_view kText = " foo . bar baz.. bat "; - EXPECT_THAT(plain_tokenizer->TokenizeAll(kText), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"), - EqualsToken(Token::REGULAR, "bar"), - EqualsToken(Token::REGULAR, "baz"), - EqualsToken(Token::REGULAR, "bat")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll(kText), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"), + EqualsToken(Token::Type::REGULAR, "bar"), + EqualsToken(Token::Type::REGULAR, "baz"), + EqualsToken(Token::Type::REGULAR, "bat")))); std::vector<std::string> expected_text = { "bat", // 20: "baz.. bat " "baz", // 19: " baz.. bat" @@ -410,13 +423,14 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) { auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); EXPECT_TRUE(iterator->Advance()); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo")); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "foo")); for (int i = kText.length() - 1; i >= 0; --i) { int expected_index = kText.length() - 1 - i; if (expected_index < expected_text.size()) { EXPECT_TRUE(iterator->ResetToTokenEndingBefore(i)); - EXPECT_THAT(iterator->GetToken(), - EqualsToken(Token::REGULAR, expected_text[expected_index])); + EXPECT_THAT( + iterator->GetToken(), + EqualsToken(Token::Type::REGULAR, expected_text[expected_index])); } else { EXPECT_FALSE(iterator->ResetToTokenEndingBefore(i)); } diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc index 2d461ee..8a27103 100644 --- a/icing/tokenization/raw-query-tokenizer.cc +++ b/icing/tokenization/raw-query-tokenizer.cc @@ -422,7 +422,7 @@ std::pair<TermType, std::string_view> GetTerm(std::string_view text, // and [(cat OR)]. This helps assert extra rule 3: "OR" is ignored if there's no // valid token on its right. 
void RemoveLastTokenIfOrOperator(std::vector<Token>* tokens) { - if (!tokens->empty() && tokens->back().type == Token::QUERY_OR) { + if (!tokens->empty() && tokens->back().type == Token::Type::QUERY_OR) { tokens->pop_back(); } } @@ -436,11 +436,11 @@ libtextclassifier3::Status OutputOrOperatorToken(std::vector<Token>* tokens) { } Token::Type last_token_type = tokens->back().type; switch (last_token_type) { - case Token::REGULAR: - case Token::QUERY_RIGHT_PARENTHESES: - tokens->emplace_back(Token::QUERY_OR); + case Token::Type::REGULAR: + case Token::Type::QUERY_RIGHT_PARENTHESES: + tokens->emplace_back(Token::Type::QUERY_OR); break; - case Token::QUERY_OR: + case Token::Type::QUERY_OR: // Ignores "OR" because there's already an "OR", e.g. "term1 OR OR term2" break; default: @@ -481,21 +481,21 @@ libtextclassifier3::Status OutputToken(State new_state, GetErrorMessage(ERROR_NON_ASCII_AS_PROPERTY_NAME)); } } - tokens->emplace_back(Token::QUERY_PROPERTY, current_term); + tokens->emplace_back(Token::Type::QUERY_PROPERTY, current_term); } else { - tokens->emplace_back(Token::REGULAR, current_term); + tokens->emplace_back(Token::Type::REGULAR, current_term); } break; case LEFT_PARENTHESES: - tokens->emplace_back(Token::QUERY_LEFT_PARENTHESES); + tokens->emplace_back(Token::Type::QUERY_LEFT_PARENTHESES); break; case RIGHT_PARENTHESES: // Ignores "OR" if it's followed by right parentheses. RemoveLastTokenIfOrOperator(tokens); - tokens->emplace_back(Token::QUERY_RIGHT_PARENTHESES); + tokens->emplace_back(Token::Type::QUERY_RIGHT_PARENTHESES); break; case EXCLUSION_OPERATOR: - tokens->emplace_back(Token::QUERY_EXCLUSION); + tokens->emplace_back(Token::Type::QUERY_EXCLUSION); break; case OR_OPERATOR: return OutputOrOperatorToken(tokens); @@ -648,7 +648,7 @@ class RawQueryTokenIterator : public Tokenizer::Iterator { Token GetToken() const override { if (current_ < 0 || current_ >= tokens_.size()) { - return Token(Token::INVALID); + return Token(Token::Type::INVALID); } return tokens_.at(current_); } diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc index 500efa0..c6d981d 100644 --- a/icing/tokenization/raw-query-tokenizer_test.cc +++ b/icing/tokenization/raw-query-tokenizer_test.cc @@ -16,9 +16,9 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/tokenizer-factory.h" @@ -59,13 +59,15 @@ TEST_F(RawQueryTokenizerTest, Simple) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("Hello World!"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("Hello World!"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("hElLo WORLD"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "hElLo"), - EqualsToken(Token::REGULAR, "WORLD")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("hElLo WORLD"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "hElLo"), + EqualsToken(Token::Type::REGULAR, "WORLD")))); } 
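A note on the Token::Type migration running through these hunks: because Token::Type is now an enum class, enumerators must be fully qualified and no longer convert to int implicitly. A minimal sketch of the resulting call-site pattern (illustrative only, not code from this change):

// Construct a token with the scoped enum, as plain-tokenizer.cc now does.
Token token(Token::Type::REGULAR, "hello");
if (token.type != Token::Type::REGULAR) {
  // Streaming the type to a log now needs an explicit cast, since enum
  // class values have no implicit conversion to int.
  ICING_LOG(WARNING) << "Unexpected token type: "
                     << static_cast<int>(token.type);
}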
 
 TEST_F(RawQueryTokenizerTest, Parentheses) {
@@ -80,82 +82,82 @@ TEST_F(RawQueryTokenizerTest, Parentheses) {
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("()"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( )"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 term2)"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::REGULAR, "term2"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::REGULAR, "term2"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::REGULAR, "term2"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term3"),
-                  EqualsToken(Token::REGULAR, "term4"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::REGULAR, "term2"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term3"),
+                  EqualsToken(Token::Type::REGULAR, "term4"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1(term2)"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term2"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
-  EXPECT_THAT(
-      raw_query_tokenizer->TokenizeAll("(term1)term2"),
-      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                               EqualsToken(Token::REGULAR, "term1"),
-                               EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                               EqualsToken(Token::REGULAR, "term2"))));
+  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)term2"),
+              IsOkAndHolds(ElementsAre(
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)(term2)"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term2"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
-
-  EXPECT_THAT(
-      raw_query_tokenizer->TokenizeAll("(term1)-term2"),
-      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                               EqualsToken(Token::REGULAR, "term1"),
-                               EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                               EqualsToken(Token::QUERY_EXCLUSION, ""),
-                               EqualsToken(Token::REGULAR, "term2"))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+
+  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)-term2"),
+              IsOkAndHolds(ElementsAre(
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"))));
 
-  EXPECT_THAT(
-      raw_query_tokenizer->TokenizeAll("(term1)OR term2"),
-      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                               EqualsToken(Token::REGULAR, "term1"),
-                               EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                               EqualsToken(Token::QUERY_OR, ""),
-                               EqualsToken(Token::REGULAR, "term2"))));
+  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR term2"),
+              IsOkAndHolds(ElementsAre(
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_OR, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR(term2)"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_OR, ""),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term2"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_OR, ""),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1):term2"),
               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
@@ -180,44 +182,49 @@ TEST_F(RawQueryTokenizerTest, Exclustion) {
       tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
                                               language_segmenter.get()));
 
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
-                                       EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("-term1"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+                               EqualsToken(Token::Type::REGULAR, "term1"))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(-term1)"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_EXCLUSION, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   // Exclusion operator is ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("- term1"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("- term1"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
 
   // Exclusion operator is ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1- term2"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
-                                       EqualsToken(Token::REGULAR, "term2"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("term1- term2"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+                               EqualsToken(Token::Type::REGULAR, "term2"))));
 
   // Exclusion operator is ignored
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 -)"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   // First exclusion operator is ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("--term1"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
-                                       EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("--term1"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+                               EqualsToken(Token::Type::REGULAR, "term1"))));
 
   // First "-" is exclusion operator, second is not and will be discarded.
   // In other words, exclusion only applies to the term right after it.
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1-term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("-term1-term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::REGULAR, "term2")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-(term1)"), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, @@ -249,73 +256,75 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1:term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(property1:term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // Colon is ignored - EXPECT_THAT(raw_query_tokenizer->TokenizeAll(":term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll(":term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); // Colon is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(:term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // Colon is ignored - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1:"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("term1:"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); // property name can be a path EXPECT_THAT(raw_query_tokenizer->TokenizeAll("email.title:hello"), - IsOkAndHolds( - ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "email.title"), - EqualsToken(Token::REGULAR, "hello")))); + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "email.title"), + EqualsToken(Token::Type::REGULAR, "hello")))); // The first colon ":" triggers property restriction, the second colon is used // as a word connector per ICU's rule // (https://unicode.org/reports/tr29/#Word_Boundaries). 
- EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property:foo:bar"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property"), - EqualsToken(Token::REGULAR, "foo:bar")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property:foo:bar"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property"), + EqualsToken(Token::Type::REGULAR, "foo:bar")))); // Property restriction only applies to the term right after it. // Note: "term1:term2" is not a term but 2 terms because word connectors // don't apply to numbers and alphabets. - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1:term1:term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1:term2"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::REGULAR, "term2")))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1:今天:天气"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "今天"), - EqualsToken(Token::REGULAR, "天气")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:今天:天气"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "今天"), + EqualsToken(Token::Type::REGULAR, "天气")))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1:term1-"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1-"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1")))); // Multiple continuous colons will still be recognized as a property // restriction operator - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1::term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1::term1"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1")))); EXPECT_THAT( raw_query_tokenizer->TokenizeAll("property1:(term1)"), @@ -345,105 +354,109 @@ TEST_F(RawQueryTokenizerTest, OR) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("term1 OR term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::REGULAR, "term2")))); // Two continuous "OR"s are treated as one - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR OR term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::REGULAR, "term2")))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("(term1) OR term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - 
-                               EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                               EqualsToken(Token::QUERY_OR, ""),
-                               EqualsToken(Token::REGULAR, "term2"))));
+      raw_query_tokenizer->TokenizeAll("term1 OR OR term2"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+                               EqualsToken(Token::Type::QUERY_OR, ""),
+                               EqualsToken(Token::Type::REGULAR, "term2"))));
+
+  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1) OR term2"),
+              IsOkAndHolds(ElementsAre(
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_OR, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR (term2)"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_OR, ""),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term2"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_OR, ""),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1) OR (term2))"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_OR, ""),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term2"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_OR, ""),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   // Only "OR" (all in uppercase) is the operator
   EXPECT_THAT(
       raw_query_tokenizer->TokenizeAll("term1 or term2 Or term3 oR term4"),
-      IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
-                               EqualsToken(Token::REGULAR, "or"),
-                               EqualsToken(Token::REGULAR, "term2"),
-                               EqualsToken(Token::REGULAR, "Or"),
-                               EqualsToken(Token::REGULAR, "term3"),
-                               EqualsToken(Token::REGULAR, "oR"),
-                               EqualsToken(Token::REGULAR, "term4"))));
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+                               EqualsToken(Token::Type::REGULAR, "or"),
+                               EqualsToken(Token::Type::REGULAR, "term2"),
+                               EqualsToken(Token::Type::REGULAR, "Or"),
+                               EqualsToken(Token::Type::REGULAR, "term3"),
+                               EqualsToken(Token::Type::REGULAR, "oR"),
+                               EqualsToken(Token::Type::REGULAR, "term4"))));
 
   // "OR" is ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("OR term1"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("OR term1"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
 
   // "OR" is ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("term1 OR"),
IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(OR term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR )"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR )"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR(term2)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term2"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term2"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT( raw_query_tokenizer->TokenizeAll("term1 OR-term2"), @@ -472,31 +485,31 @@ TEST_F(RawQueryTokenizerTest, CJKT) { if (IsCfStringTokenization()) { EXPECT_THAT( raw_query_tokenizer->TokenizeAll("-今天天气很好"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "今天"), - EqualsToken(Token::REGULAR, "天气"), - EqualsToken(Token::REGULAR, "很"), - EqualsToken(Token::REGULAR, "好")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "今天"), + EqualsToken(Token::Type::REGULAR, "天气"), + EqualsToken(Token::Type::REGULAR, "很"), + EqualsToken(Token::Type::REGULAR, "好")))); } else { EXPECT_THAT( raw_query_tokenizer->TokenizeAll("-今天天气很好"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "今天"), - EqualsToken(Token::REGULAR, "天气"), - EqualsToken(Token::REGULAR, "很好")))); + 
+        IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+                                 EqualsToken(Token::Type::REGULAR, "今天"),
+                                 EqualsToken(Token::Type::REGULAR, "天气"),
+                                 EqualsToken(Token::Type::REGULAR, "很好"))));
   }
 
   if (IsCfStringTokenization()) {
     EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"),
-                IsOkAndHolds(
-                    ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
-                                EqualsToken(Token::REGULAR, "你"),
-                                EqualsToken(Token::REGULAR, "好"))));
+                IsOkAndHolds(ElementsAre(
+                    EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+                    EqualsToken(Token::Type::REGULAR, "你"),
+                    EqualsToken(Token::Type::REGULAR, "好"))));
   } else {
     EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"),
-                IsOkAndHolds(
-                    ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
-                                EqualsToken(Token::REGULAR, "你好"))));
+                IsOkAndHolds(ElementsAre(
+                    EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+                    EqualsToken(Token::Type::REGULAR, "你好"))));
   }
 
   EXPECT_THAT(
@@ -504,10 +517,11 @@ TEST_F(RawQueryTokenizerTest, CJKT) {
       StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
                HasSubstr("Characters in property name must all be ASCII")));
 
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("cat OR ねこ"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "cat"),
-                                       EqualsToken(Token::QUERY_OR, ""),
-                                       EqualsToken(Token::REGULAR, "ねこ"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("cat OR ねこ"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "cat"),
+                               EqualsToken(Token::Type::QUERY_OR, ""),
+                               EqualsToken(Token::Type::REGULAR, "ねこ"))));
 
   EXPECT_THAT(
       raw_query_tokenizer->TokenizeAll("cat ORねこ"),
@@ -543,40 +557,45 @@ TEST_F(RawQueryTokenizerTest, OtherChars) {
                                               language_segmenter.get()));
 
   // Comma is ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll(",term1, ,"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll(",term1, ,"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(,term1),"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   // Exclusion operator and comma are ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-,term1"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("-,term1"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
 
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1,"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
-                                       EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("-term1,"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+                               EqualsToken(Token::Type::REGULAR, "term1"))));
 
   // Colon and comma are ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:,term1"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "property1"),
-                                       EqualsToken(Token::REGULAR, "term1"))));
-  EXPECT_THAT(
-      raw_query_tokenizer->TokenizeAll("property1:term1,term2"),
-      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
-                               EqualsToken(Token::REGULAR, "term1"),
-                               EqualsToken(Token::REGULAR, "term2"))));
+  EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("property1:,term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "property1"), + EqualsToken(Token::Type::REGULAR, "term1")))); + + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1,term2"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::REGULAR, "term2")))); // This is a special case for OR, unknown chars are treated the same as // whitespaces before and after OR. - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1,OR,term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("term1,OR,term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::REGULAR, "term2")))); } TEST_F(RawQueryTokenizerTest, Mix) { @@ -593,37 +612,38 @@ TEST_F(RawQueryTokenizerTest, Mix) { EXPECT_THAT(raw_query_tokenizer->TokenizeAll( "こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::REGULAR, "こんにちは"), - EqualsToken(Token::REGULAR, "good"), - EqualsToken(Token::REGULAR, "afternoon"), - EqualsToken(Token::QUERY_PROPERTY, "title"), - EqualsToken(Token::REGULAR, "今天"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "ใน"), - EqualsToken(Token::REGULAR, "วันนี้"), - EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "B12"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::REGULAR, "こんにちは"), + EqualsToken(Token::Type::REGULAR, "good"), + EqualsToken(Token::Type::REGULAR, "afternoon"), + EqualsToken(Token::Type::QUERY_PROPERTY, "title"), + EqualsToken(Token::Type::REGULAR, "今天"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "ใน"), + EqualsToken(Token::Type::REGULAR, "วันนี้"), + EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "B12"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); } else { ICING_ASSERT_OK_AND_ASSIGN( std::vector<Token> tokens, raw_query_tokenizer->TokenizeAll( "こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)")); - EXPECT_THAT(tokens, - ElementsAre(EqualsToken(Token::REGULAR, "こんにちは"), - EqualsToken(Token::REGULAR, "good"), - EqualsToken(Token::REGULAR, "afternoon"), - EqualsToken(Token::QUERY_PROPERTY, "title"), - EqualsToken(Token::REGULAR, "今天"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "ใน"), - EqualsToken(Token::REGULAR, "วัน"), - EqualsToken(Token::REGULAR, "นี้"), - EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "B12"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))); + EXPECT_THAT( + tokens, + ElementsAre(EqualsToken(Token::Type::REGULAR, "こんにちは"), + EqualsToken(Token::Type::REGULAR, "good"), + EqualsToken(Token::Type::REGULAR, "afternoon"), + EqualsToken(Token::Type::QUERY_PROPERTY, "title"), + EqualsToken(Token::Type::REGULAR, "今天"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "ใน"), + EqualsToken(Token::Type::REGULAR, "วัน"), + EqualsToken(Token::Type::REGULAR, "นี้"), + 
+                    EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+                    EqualsToken(Token::Type::REGULAR, "B12"),
+                    EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
  }
 }
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index b936f2b..cb474c6 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -43,45 +43,46 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
 
   // Advances to the next term. Returns false if it has reached the end.
   bool Advance() override {
-    // Prerequisite check
-    if (IsDone()) {
-      return false;
-    }
+    while (true) {
+      // Prerequisite check
+      if (IsDone()) {
+        return false;
+      }
 
-    if (term_end_exclusive_.utf16_index() == 0) {
-      int first = break_iterator_->First();
-      if (!term_start_.MoveToUtf16(first)) {
-        // First is guaranteed to succeed and return a position within bonds. So
-        // the only possible failure could be an invalid sequence. Mark as DONE
-        // and return.
+      if (term_end_exclusive_.utf16_index() == 0) {
+        int first = break_iterator_->First();
+        if (!term_start_.MoveToUtf16(first)) {
+          // First is guaranteed to succeed and return a position within
+          // bounds. So the only possible failure could be an invalid sequence.
+          // Mark as DONE and return.
+          MarkAsDone();
+          return false;
+        }
+      } else {
+        term_start_ = term_end_exclusive_;
+      }
+
+      int next_utf16_index_exclusive = break_iterator_->Next();
+      // Reached the end
+      if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) {
+        MarkAsDone();
+        return false;
+      }
+      if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
+        // next_utf16_index_exclusive is guaranteed to be within bounds thanks
+        // to the check for kDone above. So the only possible failure could be
+        // an invalid sequence. Mark as DONE and return.
         MarkAsDone();
         return false;
       }
-    } else {
-      term_start_ = term_end_exclusive_;
-    }
-
-    int next_utf16_index_exclusive = break_iterator_->Next();
-    // Reached the end
-    if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) {
-      MarkAsDone();
-      return false;
-    }
-    if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
-      // next_utf16_index_exclusive is guaranteed to be within bonds thanks to
-      // the check for kDone above. So the only possible failure could be an
-      // invalid sequence. Mark as DONE and return.
-      MarkAsDone();
-      return false;
-    }
-    // Check if the current term is valid. We consider any term valid if its
-    // first character is valid. If it's not valid, then we need to advance to
-    // the next term.
-    if (IsValidTerm()) {
-      return true;
+      // Check if the current term is valid. We consider any term valid if its
+      // first character is valid. If it's not valid, then we need to advance
+      // to the next term.
+      if (IsValidTerm()) {
+        return true;
+      }
     }
-    return Advance();
   }
 
   // Returns the current term. It can be called only when Advance() returns
diff --git a/icing/tokenization/token.h b/icing/tokenization/token.h
index b97ee0b..0c268be 100644
--- a/icing/tokenization/token.h
+++ b/icing/tokenization/token.h
@@ -21,7 +21,7 @@ namespace icing {
 namespace lib {
 
 struct Token {
-  enum Type {
+  enum class Type {
     // Common types
     REGULAR,  // A token without special meanings, the value of it will be
               // indexed or searched directly
diff --git a/icing/tokenization/verbatim-tokenizer.cc b/icing/tokenization/verbatim-tokenizer.cc
index 173486a..dc54696 100644
--- a/icing/tokenization/verbatim-tokenizer.cc
+++ b/icing/tokenization/verbatim-tokenizer.cc
@@ -37,10 +37,10 @@ class VerbatimTokenIterator : public Tokenizer::Iterator {
 
   Token GetToken() const override {
     if (term_.empty() || !has_advanced_to_end_) {
-      return Token(Token::INVALID);
+      return Token(Token::Type::INVALID);
     }
 
-    return Token(Token::VERBATIM, term_);
+    return Token(Token::Type::VERBATIM, term_);
   }
 
   libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
diff --git a/icing/tokenization/verbatim-tokenizer_test.cc b/icing/tokenization/verbatim-tokenizer_test.cc
index b6bc5fa..e38c7aa 100644
--- a/icing/tokenization/verbatim-tokenizer_test.cc
+++ b/icing/tokenization/verbatim-tokenizer_test.cc
@@ -15,9 +15,9 @@
 #include <string_view>
 
 #include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/portable/platform.h"
 #include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/jni-test-helpers.h"
 #include "icing/testing/test-data.h"
 #include "icing/tokenization/language-segmenter-factory.h"
@@ -71,7 +71,7 @@ TEST_F(VerbatimTokenizerTest, Simple) {
 
   EXPECT_THAT(
       verbatim_tokenizer->TokenizeAll("foo bar"),
-      IsOkAndHolds(ElementsAre(EqualsToken(Token::VERBATIM, "foo bar"))));
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::VERBATIM, "foo bar"))));
 }
 
 TEST_F(VerbatimTokenizerTest, Punctuation) {
@@ -80,9 +80,9 @@ TEST_F(VerbatimTokenizerTest, Punctuation) {
       StringIndexingConfig::TokenizerType::VERBATIM,
       language_segmenter_.get()));
 
-  EXPECT_THAT(
-      verbatim_tokenizer->TokenizeAll("Hello, world!"),
-      IsOkAndHolds(ElementsAre(EqualsToken(Token::VERBATIM, "Hello, world!"))));
+  EXPECT_THAT(verbatim_tokenizer->TokenizeAll("Hello, world!"),
+              IsOkAndHolds(ElementsAre(
+                  EqualsToken(Token::Type::VERBATIM, "Hello, world!"))));
 }
 
 TEST_F(VerbatimTokenizerTest, InvalidTokenBeforeAdvancing) {
@@ -95,7 +95,8 @@ TEST_F(VerbatimTokenizerTest, InvalidTokenBeforeAdvancing) {
   auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
 
   // We should get an invalid token if we get the token before advancing.
-  EXPECT_THAT(token_iterator->GetToken(), EqualsToken(Token::INVALID, ""));
+  EXPECT_THAT(token_iterator->GetToken(),
+              EqualsToken(Token::Type::INVALID, ""));
 }
 
 TEST_F(VerbatimTokenizerTest, ResetToTokenEndingBefore) {
@@ -111,13 +112,13 @@ TEST_F(VerbatimTokenizerTest, ResetToTokenEndingBefore) {
   // is larger than the final index (12) of the verbatim token.
   EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13));
   EXPECT_THAT(token_iterator->GetToken(),
-              EqualsToken(Token::VERBATIM, "Hello, world!"));
+              EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
 
   // Ensure our cached character iterator properly maintains the end of the
   // verbatim token.
   EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13));
   EXPECT_THAT(token_iterator->GetToken(),
-              EqualsToken(Token::VERBATIM, "Hello, world!"));
+              EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
 
   // We should not be able to reset with an offset before or within
   // the verbatim token's utf-32 length.
@@ -137,7 +138,7 @@ TEST_F(VerbatimTokenizerTest, ResetToTokenStartingAfter) {
   // Get token without resetting
   EXPECT_TRUE(token_iterator->Advance());
   EXPECT_THAT(token_iterator->GetToken(),
-              EqualsToken(Token::VERBATIM, "Hello, world!"));
+              EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
 
   // We expect a sole verbatim token, so it's not possible to reset after the
   // start of the token.
@@ -147,7 +148,7 @@ TEST_F(VerbatimTokenizerTest, ResetToTokenStartingAfter) {
   // negative.
   EXPECT_TRUE(token_iterator->ResetToTokenStartingAfter(-1));
   EXPECT_THAT(token_iterator->GetToken(),
-              EqualsToken(Token::VERBATIM, "Hello, world!"));
+              EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
 }
 
 TEST_F(VerbatimTokenizerTest, ResetToStart) {
@@ -162,12 +163,12 @@ TEST_F(VerbatimTokenizerTest, ResetToStart) {
   // Get token without resetting
   EXPECT_TRUE(token_iterator->Advance());
   EXPECT_THAT(token_iterator->GetToken(),
-              EqualsToken(Token::VERBATIM, "Hello, world!"));
+              EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
 
   // Retrieve token again after resetting to start
   EXPECT_TRUE(token_iterator->ResetToStart());
   EXPECT_THAT(token_iterator->GetToken(),
-              EqualsToken(Token::VERBATIM, "Hello, world!"));
+              EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
 }
 
 TEST_F(VerbatimTokenizerTest, CalculateTokenStart) {
diff --git a/icing/transform/icu/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc
index 8d09be2..fdd4c70 100644
--- a/icing/transform/icu/icu-normalizer_benchmark.cc
+++ b/icing/transform/icu/icu-normalizer_benchmark.cc
@@ -14,8 +14,8 @@
 
 #include "testing/base/public/benchmark.h"
 #include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/test-data.h"
 #include "icing/transform/normalizer-factory.h"
 #include "icing/transform/normalizer.h"
diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc
index a46fcc7..143da17 100644
--- a/icing/transform/icu/icu-normalizer_test.cc
+++ b/icing/transform/icu/icu-normalizer_test.cc
@@ -16,8 +16,8 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/icu-i18n-test-utils.h"
 #include "icing/testing/test-data.h"
 #include "icing/transform/normalizer-factory.h"
diff --git a/proto/icing/proto/debug.proto b/proto/icing/proto/debug.proto
new file mode 100644
index 0000000..3f07539
--- /dev/null
+++ b/proto/icing/proto/debug.proto
@@ -0,0 +1,112 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package icing.lib;
+
+import "icing/proto/status.proto";
+import "icing/proto/storage.proto";
+
+option java_package = "com.google.android.icing.proto";
+option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
+// Next tag: 4
+message IndexDebugInfoProto {
+  // Storage information of the index.
+  optional IndexStorageInfoProto index_storage_info = 1;
+
+  message MainIndexDebugInfoProto {
+    // Information about the main lexicon.
+    // TODO(b/222349894) Convert the string output to a protocol buffer instead.
+    optional string lexicon_info = 1;
+
+    // Last added document id.
+    optional uint32 last_added_document_id = 2;
+
+    // If verbosity > 0, return information about the posting list storage.
+    // TODO(b/222349894) Convert the string output to a protocol buffer instead.
+    optional string flash_index_storage_info = 3;
+  }
+  optional MainIndexDebugInfoProto main_index_info = 2;
+
+  message LiteIndexDebugInfoProto {
+    // Current number of hits.
+    optional uint32 curr_size = 1;
+
+    // The maximum possible number of hits.
+    optional uint32 hit_buffer_size = 2;
+
+    // Last added document id.
+    optional uint32 last_added_document_id = 3;
+
+    // The first position in the hit buffer that is not sorted yet,
+    // or curr_size if all hits are sorted.
+    optional uint32 searchable_end = 4;
+
+    // The most recent checksum of the lite index, by calling
+    // LiteIndex::ComputeChecksum().
+    optional uint32 index_crc = 5;
+
+    // Information about the lite lexicon.
+    // TODO(b/222349894) Convert the string output to a protocol buffer instead.
+    optional string lexicon_info = 6;
+  }
+  optional LiteIndexDebugInfoProto lite_index_info = 3;
+}
+
+// Next tag: 4
+message DocumentDebugInfoProto {
+  // Storage information of the document store.
+  optional DocumentStorageInfoProto document_storage_info = 1;
+
+  // The most recent checksum of the document store, by calling
+  // DocumentStore::ComputeChecksum().
+  optional uint32 crc = 2;
+
+  message CorpusInfo {
+    optional string namespace = 1;
+    optional string schema = 2;
+    optional uint32 total_documents = 3;
+    optional uint32 total_token = 4;
+  }
+
+  // If verbosity > 0, return the total number of documents and tokens in each
+  // (namespace, schema type) pair.
+  // Note that deleted and expired documents are skipped in the output.
+  repeated CorpusInfo corpus_info = 3;
+}
+
+// Next tag: 3
+message DebugInfoProto {
+  // Debug information of the index.
+  optional IndexDebugInfoProto index_info = 1;
+
+  // Debug information of the document store.
+  optional DocumentDebugInfoProto document_info = 2;
+}
+
+// Next tag: 3
+message DebugInfoResultProto {
+  // Status code can be one of:
+  //   OK
+  //   FAILED_PRECONDITION
+  //
+  // See status.proto for more details.
+  optional StatusProto status = 1;
+
+  // Debug information for Icing.
+  optional DebugInfoProto debug_info = 2;
+}
diff --git a/proto/icing/proto/document.proto b/proto/icing/proto/document.proto
index 2e8321b..1a501e7 100644
--- a/proto/icing/proto/document.proto
+++ b/proto/icing/proto/document.proto
@@ -209,7 +209,7 @@ message DeleteBySchemaTypeResultProto {
 }
 
 // Result of a call to IcingSearchEngine.DeleteByQuery
-// Next tag: 4
+// Next tag: 5
 message DeleteByQueryResultProto {
   // Status code can be one of:
   //   OK
@@ -226,5 +226,18 @@ message DeleteByQueryResultProto {
   // Stats for delete execution performance.
   optional DeleteByQueryStatsProto delete_by_query_stats = 3;
 
+  // Used by DeleteByQueryResultProto to return information about deleted
+  // documents.
+  message DocumentGroupInfo {
+    optional string namespace = 1;
+    optional string schema = 2;
+    repeated string uris = 3;
+  }
+
+  // Additional return message that shows the uris of the deleted documents, if
+  // users set return_deleted_document_info to true.
+  // The result is grouped by the corresponding namespace and type.
+  repeated DocumentGroupInfo deleted_documents = 4;
+
   reserved 2;
 }
diff --git a/proto/icing/proto/logging.proto b/proto/icing/proto/logging.proto
index 2f1f271..0a7c4a6 100644
--- a/proto/icing/proto/logging.proto
+++ b/proto/icing/proto/logging.proto
@@ -46,6 +46,9 @@ message InitializeStatsProto {
 
     // Random I/O errors.
     IO_ERROR = 4;
+
+    // The document log is using legacy format.
+    LEGACY_DOCUMENT_LOG_FORMAT = 5;
   }
 
   // Possible recovery causes for document store:
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 26469ab..83c7099 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=417867838)
+set(synced_AOSP_CL_number=435140515)
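
[Editor's note] For the new deleted_documents field added to DeleteByQueryResultProto above, here is a hedged sketch of how a caller might consume it, assuming the result came from a DeleteByQuery call issued with return_deleted_document_info set to true (per the field comment). The function name and output format are illustrative only, not part of Icing:

    #include <iostream>
    #include <string>

    #include "icing/proto/document.pb.h"

    // Prints every deleted document, grouped by (namespace, schema type).
    void PrintDeletedDocuments(
        const icing::lib::DeleteByQueryResultProto& result) {
      for (const auto& group : result.deleted_documents()) {
        for (const std::string& uri : group.uris()) {
          // "namespace" is a C++ keyword, so protoc generates the accessor
          // with a trailing underscore: namespace_().
          std::cout << group.namespace_() << "#" << group.schema() << "/"
                    << uri << std::endl;
        }
      }
    }

Because the uris are grouped per (namespace, schema) pair, a caller that only needs counts can sum group.uris_size() without touching the strings.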