Merge remote-tracking branch 'aosp/upstream-master' into androidx-main

* aosp/upstream-master:
  Sync from upstream.

Descriptions:
================
Remove no-longer-used write paths for file-backed-proto-log.
================
Modify segmentation rules to consider any segment that begins with a non-Ascii alphanumeric character as valid
=================
Implement CalculateNormalizedMatchLength for IcuNormalizer.
================
Add additional benchmark cases that were useful in developing submatching and CalculateNormalizedMatchLength for IcuNormalizer
=================
Switch NormalizationMap from static const std::unordered_map<char16_t, char16_t>& to static const std::unordered_map<char16_t, char16_t> *const.
==================

Bug: 147509515
Bug: 149610413
Bug: 195720764
Bug: 196257995
Change-Id: I8e8d7a7fcceb8eaae1fdcb45a92ea4399d47f343
author: My Name <dsaadati@google.com> 2021-09-09 11:11:03 -0700
committer: Dan Saadati <dsaadati@google.com> 2021-09-09 11:27:04 -0700
commit: 34fc8c85b9f690ffd0a095a4bbcac9aaacfa387b (patch)
tree: 0b9bbc9ca107551327b396e7c355b65c4baa43ef
parent: 14ee9a8eb8f3ed47f68117208626045878c943ac (diff)
parent: 39f59853b980d94a55e9b0f76185b0d3fff88455 (diff)
download: icing-34fc8c85b9f690ffd0a095a4bbcac9aaacfa387b.tar.gz
27 files changed, 665 insertions, 1320 deletions
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h
index b2b37e8..cf16b4f 100644
--- a/icing/file/file-backed-proto-log.h
+++ b/icing/file/file-backed-proto-log.h
@@ -14,16 +14,14 @@
 
 // File-backed log of protos with append-only writes and position based reads.
 //
-// There should only be one instance of a FileBackedProtoLog of the same file at
-// a time; using multiple instances at the same time may lead to undefined
-// behavior.
+// The implementation in this file is deprecated and replaced by
+// portable-file-backed-proto-log.h.
 //
-// The entire checksum is computed on initialization to verify the contents are
-// valid. On failure, the log will be truncated to the last verified state when
-// PersistToDisk() was called. If the log cannot successfully restore the last
-// state due to disk corruption or some other inconsistency, then the entire log
-// will be lost.
+// This deprecated implementation has been made read-only for the purposes of
+// migration; writing and erasing this format of log is no longer supported and
+// the methods to accomplish this have been removed.
 //
+// The details of this format follow below:
 // Each proto written to the file will have a metadata written just before it.
 // The metadata consists of
 //   {
@@ -31,37 +29,16 @@
 //     3 bytes of the proto size
 //     n bytes of the proto itself
 //   }
-//
-// Example usage:
-//   ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
-//       FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path_,
-//                                                  options));
-//   auto proto_log = create_result.proto_log;
-//
-//   Document document;
-//   document.set_namespace("com.google.android.example");
-//   document.set_uri("www.google.com");
-//
-//   int64_t document_offset = proto_log->WriteProto(document));
-//   Document same_document = proto_log->ReadProto(document_offset));
-//   proto_log->PersistToDisk();
-//
 // TODO(b/136514769): Add versioning to the header and a UpgradeToVersion
 // migration method.
-
 #ifndef ICING_FILE_FILE_BACKED_PROTO_LOG_H_
 #define ICING_FILE_FILE_BACKED_PROTO_LOG_H_
 
-#include <cstddef>
 #include <cstdint>
-#include <cstring>
 #include <memory>
 #include <string>
 #include <string_view>
-#include <utility>
-#include <vector>
 
-#include "icing/text_classifier/lib3/utils/base/status.h"
 #include "icing/text_classifier/lib3/utils/base/statusor.h"
 #include <google/protobuf/io/gzip_stream.h>
 #include <google/protobuf/io/zero_copy_stream_impl_lite.h>
@@ -112,10 +89,6 @@ class FileBackedProtoLog {
 
   // Header stored at the beginning of the file before the rest of the log
   // contents. Stores metadata on the log.
-  //
-  // TODO(b/139375388): Migrate the Header struct to a proto. This makes
-  // migrations easier since we don't need to worry about different size padding
-  // (which would affect the checksum) and different endians.
   struct Header {
     static constexpr int32_t kMagic = 0xf4c6f67a;
 
@@ -195,20 +168,6 @@ class FileBackedProtoLog {
   FileBackedProtoLog(const FileBackedProtoLog&) = delete;
   FileBackedProtoLog& operator=(const FileBackedProtoLog&) = delete;
 
-  // This will update the checksum of the log as well.
-  ~FileBackedProtoLog();
-
-  // Writes the serialized proto to the underlying file. Writes are applied
-  // directly to the underlying file. Users do not need to sync the file after
-  // writing.
-  //
-  // Returns:
-  //   Offset of the newly appended proto in file on success
-  //   INVALID_ARGUMENT if proto is too large, as decided by
-  //     Options.max_proto_size
-  //   INTERNAL_ERROR on IO error
-  libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
-
   // Reads out a proto located at file_offset from the file.
   //
   // Returns:
@@ -218,31 +177,6 @@ class FileBackedProtoLog {
   //   INTERNAL_ERROR on IO error
   libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
 
-  // Erases the data of a proto located at file_offset from the file.
-  //
-  // Returns:
-  //   OK on success
-  //   OUT_OF_RANGE_ERROR if file_offset exceeds file size
-  //   INTERNAL_ERROR on IO error
-  libtextclassifier3::Status EraseProto(int64_t file_offset);
-
-  // Calculates and returns the disk usage in bytes. Rounds up to the nearest
-  // block size.
-  //
-  // Returns:
-  //   Disk usage on success
-  //   INTERNAL_ERROR on IO error
-  libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
-
-  // Returns the file size of all the elements held in the log. File size is in
-  // bytes. This excludes the size of any internal metadata of the log, e.g. the
-  // log's header.
-  //
-  // Returns:
-  //   File size on success
-  //   INTERNAL_ERROR on IO error
-  libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
-
   // An iterator helping to find offsets of all the protos in file.
   // Example usage:
   //
@@ -281,72 +215,6 @@ class FileBackedProtoLog {
   // behaviors could happen.
   Iterator GetIterator();
 
-  // Persists all changes since initialization or the last call to
-  // PersistToDisk(). Any changes that aren't persisted may be lost if the
-  // system fails to close safely.
-  //
-  // Example use case:
-  //
-  //   Document document;
-  //   document.set_namespace("com.google.android.example");
-  //   document.set_uri("www.google.com");
-  //
-  //   {
-  //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
-  //         FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
-  //                                                    options));
-  //     auto proto_log = std::move(create_result.proto_log);
-  //
-  //     int64_t document_offset = proto_log->WriteProto(document));
-  //
-  //     // We lose the document here since it wasn't persisted.
-  //     // *SYSTEM CRASH*
-  //   }
-  //
-  //   {
-  //     // Can still successfully create after a crash since the log can
-  //     // rewind/truncate to recover into a previously good state
-  //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
-  //         FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
-  //                                                    options));
-  //     auto proto_log = std::move(create_result.proto_log);
-  //
-  //     // Lost the proto since we didn't PersistToDisk before the crash
-  //     proto_log->ReadProto(document_offset)); // INVALID_ARGUMENT error
-  //
-  //     int64_t document_offset = proto_log->WriteProto(document));
-  //
-  //     // Persisted this time, so we should be ok.
-  //     ICING_ASSERT_OK(proto_log->PersistToDisk());
-  //   }
-  //
-  //   {
-  //     ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
-  //         FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
-  //                                                    options));
-  //     auto proto_log = std::move(create_result.proto_log);
-  //
-  //     // SUCCESS
-  //     Document same_document = proto_log->ReadProto(document_offset));
-  //   }
-  //
-  // NOTE: Since all protos are already written to the file directly, this
-  // just updates the checksum and rewind position. Without these updates,
-  // future initializations will truncate the file and discard unpersisted
-  // changes.
-  //
-  // Returns:
-  //   OK on success
-  //   INTERNAL_ERROR on IO error
-  libtextclassifier3::Status PersistToDisk();
-
-  // Calculates the checksum of the log contents. Excludes the header content.
-  //
-  // Returns:
-  //   Crc of the log content
-  //   INTERNAL_ERROR on IO error
-  libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
-
  private:
   // Object can only be instantiated via the ::Create factory.
   FileBackedProtoLog(const Filesystem* filesystem, const std::string& file_path,
@@ -451,15 +319,6 @@ FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem,
 }
 
 template <typename ProtoT>
-FileBackedProtoLog<ProtoT>::~FileBackedProtoLog() {
-  if (!PersistToDisk().ok()) {
-    ICING_LOG(WARNING)
-        << "Error persisting to disk during destruction of FileBackedProtoLog: "
-        << file_path_;
-  }
-}
-
-template <typename ProtoT>
 libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
 FileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
                                    const std::string& file_path,
@@ -688,79 +547,6 @@ libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum(
 }
 
 template <typename ProtoT>
-libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::WriteProto(
-    const ProtoT& proto) {
-  int64_t proto_size = proto.ByteSizeLong();
-  int32_t metadata;
-  int metadata_size = sizeof(metadata);
-  int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
-
-  if (proto_size > header_->max_proto_size) {
-    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
-        "proto_size, %lld, was too large to write. Max is %d",
-        static_cast<long long>(proto_size), header_->max_proto_size));
-  }
-
-  // At this point, we've guaranteed that proto_size is under kMaxProtoSize
-  // (see
-  // ::Create), so we can safely store it in an int.
-  int final_size = 0;
-
-  std::string proto_str;
-  google::protobuf::io::StringOutputStream proto_stream(&proto_str);
-
-  if (header_->compress) {
-    google::protobuf::io::GzipOutputStream::Options options;
-    options.format = google::protobuf::io::GzipOutputStream::ZLIB;
-    options.compression_level = kDeflateCompressionLevel;
-
-    google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream,
-                                                                  options);
-
-    bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
-                   compressing_stream.Close();
-
-    if (!success) {
-      return absl_ports::InternalError("Error compressing proto.");
-    }
-
-    final_size = proto_str.size();
-
-    // In case the compressed proto is larger than the original proto, we also
-    // can't write it.
-    if (final_size > header_->max_proto_size) {
-      return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
-          "Compressed proto size, %d, was greater than "
-          "max_proto_size, %d",
-          final_size, header_->max_proto_size));
-    }
-  } else {
-    // Serialize the proto directly into the write buffer at an offset of the
-    // metadata.
-    proto.SerializeToZeroCopyStream(&proto_stream);
-    final_size = proto_str.size();
-  }
-
-  // 1st byte for magic, next 3 bytes for proto size.
-  metadata = (kProtoMagic << 24) | final_size;
-
-  // Actually write metadata, has to be done after we know the possibly
-  // compressed proto size
-  if (!filesystem_->Write(fd_.get(), &metadata, metadata_size)) {
-    return absl_ports::InternalError(
-        absl_ports::StrCat("Failed to write proto metadata to: ", file_path_));
-  }
-
-  // Write the serialized proto
-  if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
-    return absl_ports::InternalError(
-        absl_ports::StrCat("Failed to write proto to: ", file_path_));
-  }
-
-  return current_position;
-}
-
-template <typename ProtoT>
 libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
     int64_t file_offset) const {
   int64_t file_size = filesystem_->GetFileSize(fd_.get());
@@ -806,83 +592,6 @@ libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
 }
 
 template <typename ProtoT>
-libtextclassifier3::Status FileBackedProtoLog<ProtoT>::EraseProto(
-    int64_t file_offset) {
-  int64_t file_size = filesystem_->GetFileSize(fd_.get());
-  if (file_offset >= file_size) {
-    // file_size points to the next byte to write at, so subtract one to get
-    // the inclusive, actual size of file.
-    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
-        "Trying to erase data at a location, %lld, "
-        "out of range of the file size, %lld",
-        static_cast<long long>(file_offset),
-        static_cast<long long>(file_size - 1)));
-  }
-
-  MemoryMappedFile mmapped_file(
-      *filesystem_, file_path_,
-      MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
-
-  // Read out the metadata
-  ICING_ASSIGN_OR_RETURN(
-      int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
-
-  ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
-                                           GetProtoSize(metadata)));
-
-  // We need to update the crc checksum if the erased area is before the
-  // rewind position.
-  if (file_offset + sizeof(metadata) < header_->rewind_offset) {
-    // We need to calculate [original string xor 0s].
-    // The xored string is the same as the original string because 0 xor 0 =
-    // 0, 1 xor 0 = 1.
-    const std::string_view xored_str(mmapped_file.region(),
-                                     mmapped_file.region_size());
-
-    Crc32 crc(header_->log_checksum);
-    ICING_ASSIGN_OR_RETURN(
-        uint32_t new_crc,
-        crc.UpdateWithXor(
-            xored_str,
-            /*full_data_size=*/header_->rewind_offset - sizeof(Header),
-            /*position=*/file_offset + sizeof(metadata) - sizeof(Header)));
-
-    header_->log_checksum = new_crc;
-    header_->header_checksum = header_->CalculateHeaderChecksum();
-
-    if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
-                             sizeof(Header))) {
-      return absl_ports::InternalError(
-          absl_ports::StrCat("Failed to update header to: ", file_path_));
-    }
-  }
-
-  memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
-  return libtextclassifier3::Status::OK;
-}
-
-template <typename ProtoT>
-libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage()
-    const {
-  int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
-  if (size == Filesystem::kBadFileSize) {
-    return absl_ports::InternalError("Failed to get disk usage of proto log");
-  }
-  return size;
-}
-
-template <typename ProtoT>
-libtextclassifier3::StatusOr<int64_t>
-FileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
-  int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
-  if (total_file_size == Filesystem::kBadFileSize) {
-    return absl_ports::InternalError(
-        "Failed to get file size of elments in the proto log");
-  }
-  return total_file_size - sizeof(Header);
-}
-
-template <typename ProtoT>
 FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem,
                                                const std::string& file_path,
                                                int64_t initial_offset)
@@ -964,51 +673,6 @@ libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata(
   return metadata;
 }
 
-template <typename ProtoT>
-libtextclassifier3::Status FileBackedProtoLog<ProtoT>::PersistToDisk() {
-  int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
-  if (file_size == header_->rewind_offset) {
-    // No new protos appended, don't need to update the checksum.
-    return libtextclassifier3::Status::OK;
-  }
-
-  int64_t new_content_size = file_size - header_->rewind_offset;
-  Crc32 crc;
-  if (new_content_size < 0) {
-    // File shrunk, recalculate the entire checksum.
-    ICING_ASSIGN_OR_RETURN(
-        crc, ComputeChecksum(filesystem_, file_path_, Crc32(), sizeof(Header),
-                             file_size));
-  } else {
-    // Append new changes to the existing checksum.
-    ICING_ASSIGN_OR_RETURN(
-        crc,
-        ComputeChecksum(filesystem_, file_path_, Crc32(header_->log_checksum),
-                        header_->rewind_offset, file_size));
-  }
-
-  header_->log_checksum = crc.Get();
-  header_->rewind_offset = file_size;
-  header_->header_checksum = header_->CalculateHeaderChecksum();
-
-  if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
-                           sizeof(Header)) ||
-      !filesystem_->DataSync(fd_.get())) {
-    return absl_ports::InternalError(
-        absl_ports::StrCat("Failed to update header to: ", file_path_));
-  }
-
-  return libtextclassifier3::Status::OK;
-}
-
-template <typename ProtoT>
-libtextclassifier3::StatusOr<Crc32>
-FileBackedProtoLog<ProtoT>::ComputeChecksum() {
-  return FileBackedProtoLog<ProtoT>::ComputeChecksum(
-      filesystem_, file_path_, Crc32(), /*start=*/sizeof(Header),
-      /*end=*/filesystem_->GetFileSize(file_path_.c_str()));
-}
-
 }  // namespace lib
 }  // namespace icing
 
diff --git a/icing/file/file-backed-proto-log_benchmark.cc b/icing/file/file-backed-proto-log_benchmark.cc
deleted file mode 100644
index c09fd5a..0000000
--- a/icing/file/file-backed-proto-log_benchmark.cc
+++ /dev/null
@@ -1,251 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <cstdint>
-#include <random>
-
-#include "testing/base/public/benchmark.h"
-#include "gmock/gmock.h"
-#include "icing/document-builder.h"
-#include "icing/file/file-backed-proto-log.h"
-#include "icing/file/filesystem.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/proto/document.pb.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/random-string.h"
-#include "icing/testing/tmp-directory.h"
-
-// go/microbenchmarks
-//
-// To build and run on a local machine:
-//   $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
-//   icing/file:file-backed-proto-log_benchmark
-//
-//   $ blaze-bin/icing/file/file-backed-proto-log_benchmark
-//   --benchmarks=all
-//
-//
-// To build and run on an Android device (must be connected and rooted):
-//   $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
-//   --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
-//   icing/file:file-backed-proto-log_benchmark
-//
-//   $ adb root
-//
-//   $ adb push
-//   blaze-bin/icing/file/file-backed-proto-log_benchmark
-//   /data/local/tmp/
-//
-//   $ adb shell /data/local/tmp/file-backed-proto-log-benchmark
-//   --benchmarks=all
-
-namespace icing {
-namespace lib {
-
-namespace {
-
-static void BM_Write(benchmark::State& state) {
-  const Filesystem filesystem;
-  int string_length = state.range(0);
-  const std::string file_path = IcingStringUtil::StringPrintf(
-      "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log");
-  int max_proto_size = (1 << 24) - 1;  // 16 MiB
-  bool compress = true;
-
-  // Make sure it doesn't already exist.
-  filesystem.DeleteFile(file_path.c_str());
-
-  auto proto_log =
-      FileBackedProtoLog<DocumentProto>::Create(
-          &filesystem, file_path,
-          FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
-          .ValueOrDie()
-          .proto_log;
-
-  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
-  std::default_random_engine random;
-  const std::string rand_str =
-      RandomString(kAlNumAlphabet, string_length, &random);
-
-  auto document_properties = document.add_properties();
-  document_properties->set_name("string property");
-  document_properties->add_string_values(rand_str);
-
-  for (auto _ : state) {
-    testing::DoNotOptimize(proto_log->WriteProto(document));
-  }
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
-                          string_length);
-
-  // Cleanup after ourselves
-  filesystem.DeleteFile(file_path.c_str());
-}
-BENCHMARK(BM_Write)
-    ->Arg(1)
-    ->Arg(32)
-    ->Arg(512)
-    ->Arg(1024)
-    ->Arg(4 * 1024)
-    ->Arg(8 * 1024)
-    ->Arg(16 * 1024)
-    ->Arg(32 * 1024)
-    ->Arg(256 * 1024)
-    ->Arg(2 * 1024 * 1024)
-    ->Arg(8 * 1024 * 1024)
-    ->Arg(15 * 1024 * 1024);  // We do 15MiB here since our max proto size is
-                              // 16MiB, and we need some extra space for the
-                              // rest of the document properties
-
-static void BM_Read(benchmark::State& state) {
-  const Filesystem filesystem;
-  int string_length = state.range(0);
-  const std::string file_path = IcingStringUtil::StringPrintf(
-      "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log");
-  int max_proto_size = (1 << 24) - 1;  // 16 MiB
-  bool compress = true;
-
-  // Make sure it doesn't already exist.
-  filesystem.DeleteFile(file_path.c_str());
-
-  auto proto_log =
-      FileBackedProtoLog<DocumentProto>::Create(
-          &filesystem, file_path,
-          FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
-          .ValueOrDie()
-          .proto_log;
-
-  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
-  std::default_random_engine random;
-  const std::string rand_str =
-      RandomString(kAlNumAlphabet, string_length, &random);
-
-  auto document_properties = document.add_properties();
-  document_properties->set_name("string property");
-  document_properties->add_string_values(rand_str);
-
-  ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset,
-                             proto_log->WriteProto(document));
-
-  for (auto _ : state) {
-    testing::DoNotOptimize(proto_log->ReadProto(write_offset));
-  }
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
-                          string_length);
-
-  // Cleanup after ourselves
-  filesystem.DeleteFile(file_path.c_str());
-}
-BENCHMARK(BM_Read)
-    ->Arg(1)
-    ->Arg(32)
-    ->Arg(512)
-    ->Arg(1024)
-    ->Arg(4 * 1024)
-    ->Arg(8 * 1024)
-    ->Arg(16 * 1024)
-    ->Arg(32 * 1024)
-    ->Arg(256 * 1024)
-    ->Arg(2 * 1024 * 1024)
-    ->Arg(8 * 1024 * 1024)
-    ->Arg(15 * 1024 * 1024);  // We do 15MiB here since our max proto size is
-                              // 16MiB, and we need some extra space for the
-                              // rest of the document properties
-
-static void BM_Erase(benchmark::State& state) {
-  const Filesystem filesystem;
-  const std::string file_path = IcingStringUtil::StringPrintf(
-      "%s%s", GetTestTempDir().c_str(), "/proto.log");
-  int max_proto_size = (1 << 24) - 1;  // 16 MiB
-  bool compress = true;
-
-  // Make sure it doesn't already exist.
-  filesystem.DeleteFile(file_path.c_str());
-
-  auto proto_log =
-      FileBackedProtoLog<DocumentProto>::Create(
-          &filesystem, file_path,
-          FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
-          .ValueOrDie()
-          .proto_log;
-
-  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
-  std::default_random_engine random;
-  const std::string rand_str = RandomString(kAlNumAlphabet, /*len=*/1, &random);
-
-  auto document_properties = document.add_properties();
-  document_properties->set_name("string property");
-  document_properties->add_string_values(rand_str);
-
-  for (auto _ : state) {
-    state.PauseTiming();
-    ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset,
-                               proto_log->WriteProto(document));
-    state.ResumeTiming();
-
-    testing::DoNotOptimize(proto_log->EraseProto(write_offset));
-  }
-
-  // Cleanup after ourselves
-  filesystem.DeleteFile(file_path.c_str());
-}
-BENCHMARK(BM_Erase);
-
-static void BM_ComputeChecksum(benchmark::State& state) {
-  const Filesystem filesystem;
-  const std::string file_path = GetTestTempDir() + "/proto.log";
-  int max_proto_size = (1 << 24) - 1;  // 16 MiB
-  bool compress = true;
-
-  // Make sure it doesn't already exist.
-  filesystem.DeleteFile(file_path.c_str());
-
-  auto proto_log =
-      FileBackedProtoLog<DocumentProto>::Create(
-          &filesystem, file_path,
-          FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
-          .ValueOrDie()
-          .proto_log;
-
-  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
-  // Make each document 1KiB
-  int string_length = 1024;
-  std::default_random_engine random;
-  const std::string rand_str =
-      RandomString(kAlNumAlphabet, string_length, &random);
-
-  auto document_properties = document.add_properties();
-  document_properties->set_name("string property");
-  document_properties->add_string_values(rand_str);
-
-  int num_docs = state.range(0);
-  for (int i = 0; i < num_docs; ++i) {
-    ICING_ASSERT_OK(proto_log->WriteProto(document));
-  }
-
-  for (auto _ : state) {
-    testing::DoNotOptimize(proto_log->ComputeChecksum());
-  }
-
-  // Cleanup after ourselves
-  filesystem.DeleteFile(file_path.c_str());
-}
-BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20);
-
-}  // namespace
-}  // namespace lib
-}  // namespace icing
diff --git a/icing/file/file-backed-proto-log_test.cc b/icing/file/file-backed-proto-log_test.cc
index d429277..eccb0c7 100644
--- a/icing/file/file-backed-proto-log_test.cc
+++ b/icing/file/file-backed-proto-log_test.cc
@@ -19,10 +19,7 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "icing/document-builder.h"
 #include "icing/file/filesystem.h"
-#include "icing/file/mock-filesystem.h"
-#include "icing/portable/equals-proto.h"
 #include "icing/proto/document.pb.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/tmp-directory.h"
@@ -32,14 +29,7 @@ namespace lib {
 
 namespace {
 
-using ::icing::lib::portable_equals_proto::EqualsProto;
-using ::testing::A;
-using ::testing::Eq;
-using ::testing::Gt;
-using ::testing::Not;
 using ::testing::NotNull;
-using ::testing::Pair;
-using ::testing::Return;
 
 class FileBackedProtoLogTest : public ::testing::Test {
  protected:
@@ -87,193 +77,6 @@ TEST_F(FileBackedProtoLogTest, Initialize) {
               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
 }
 
-TEST_F(FileBackedProtoLogTest, WriteProtoTooLarge) {
-  int max_proto_size = 1;
-  ICING_ASSERT_OK_AND_ASSIGN(
-      FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-      FileBackedProtoLog<DocumentProto>::Create(
-          &filesystem_, file_path_,
-          FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                     max_proto_size)));
-  auto proto_log = std::move(create_result.proto_log);
-  ASSERT_FALSE(create_result.has_data_loss());
-
-  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
-  // Proto is too large for the max_proto_size_in
-  ASSERT_THAT(proto_log->WriteProto(document),
-              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
-TEST_F(FileBackedProtoLogTest, ReadProtoWrongKProtoMagic) {
-  ICING_ASSERT_OK_AND_ASSIGN(
-      FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-      FileBackedProtoLog<DocumentProto>::Create(
-          &filesystem_, file_path_,
-          FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                     max_proto_size_)));
-  auto proto_log = std::move(create_result.proto_log);
-  ASSERT_FALSE(create_result.has_data_loss());
-
-  // Write a proto
-  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
-  ICING_ASSERT_OK_AND_ASSIGN(int64_t file_offset,
-                             proto_log->WriteProto(document));
-
-  // The 4 bytes of metadata that just doesn't have the same kProtoMagic
-  // specified in file-backed-proto-log.h
-  uint32_t wrong_magic = 0x7E000000;
-
-  // Sanity check that we opened the file correctly
-  int fd = filesystem_.OpenForWrite(file_path_.c_str());
-  ASSERT_GT(fd, 0);
-
-  // Write the wrong kProtoMagic in, kProtoMagics are stored at the beginning of
-  // a proto entry.
-  filesystem_.PWrite(fd, file_offset, &wrong_magic, sizeof(wrong_magic));
-
-  ASSERT_THAT(proto_log->ReadProto(file_offset),
-              StatusIs(libtextclassifier3::StatusCode::INTERNAL));
-}
-
-TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) {
-  int last_offset;
-  {
-    ICING_ASSERT_OK_AND_ASSIGN(
-        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-        FileBackedProtoLog<DocumentProto>::Create(
-            &filesystem_, file_path_,
-            FileBackedProtoLog<DocumentProto>::Options(
-                /*compress_in=*/false, max_proto_size_)));
-    auto proto_log = std::move(create_result.proto_log);
-    ASSERT_FALSE(create_result.has_data_loss());
-
-    // Write the first proto
-    DocumentProto document1 =
-        DocumentBuilder().SetKey("namespace1", "uri1").Build();
-
-    ICING_ASSERT_OK_AND_ASSIGN(int written_position,
-                               proto_log->WriteProto(document1));
-
-    int document1_offset = written_position;
-
-    // Check that what we read is what we wrote
-    ASSERT_THAT(proto_log->ReadProto(written_position),
-                IsOkAndHolds(EqualsProto(document1)));
-
-    // Write a second proto that's close to the max size. Leave some room for
-    // the rest of the proto properties.
-    std::string long_str(max_proto_size_ - 1024, 'a');
-    DocumentProto document2 = DocumentBuilder()
-                                  .SetKey("namespace2", "uri2")
-                                  .AddStringProperty("long_str", long_str)
-                                  .Build();
-
-    ICING_ASSERT_OK_AND_ASSIGN(written_position,
-                               proto_log->WriteProto(document2));
-
-    int document2_offset = written_position;
-    last_offset = written_position;
-    ASSERT_GT(document2_offset, document1_offset);
-
-    // Check the second proto
-    ASSERT_THAT(proto_log->ReadProto(written_position),
-                IsOkAndHolds(EqualsProto(document2)));
-
-    ICING_ASSERT_OK(proto_log->PersistToDisk());
-  }
-
-  {
-    // Make a new proto_log with the same file_path, and make sure we
-    // can still write to the same underlying file.
-    ICING_ASSERT_OK_AND_ASSIGN(
-        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-        FileBackedProtoLog<DocumentProto>::Create(
-            &filesystem_, file_path_,
-            FileBackedProtoLog<DocumentProto>::Options(
-                /*compress_in=*/false, max_proto_size_)));
-    auto recreated_proto_log = std::move(create_result.proto_log);
-    ASSERT_FALSE(create_result.has_data_loss());
-
-    // Write a third proto
-    DocumentProto document3 =
-        DocumentBuilder().SetKey("namespace3", "uri3").Build();
-
-    ASSERT_THAT(recreated_proto_log->WriteProto(document3),
-                IsOkAndHolds(Gt(last_offset)));
-  }
-}
-
-TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) {
-  int last_offset;
-
-  {
-    ICING_ASSERT_OK_AND_ASSIGN(
-        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-        FileBackedProtoLog<DocumentProto>::Create(
-            &filesystem_, file_path_,
-            FileBackedProtoLog<DocumentProto>::Options(
-                /*compress_in=*/true, max_proto_size_)));
-    auto proto_log = std::move(create_result.proto_log);
-    ASSERT_FALSE(create_result.has_data_loss());
-
-    // Write the first proto
-    DocumentProto document1 =
-        DocumentBuilder().SetKey("namespace1", "uri1").Build();
-
-    ICING_ASSERT_OK_AND_ASSIGN(int written_position,
-                               proto_log->WriteProto(document1));
-
-    int document1_offset = written_position;
-
-    // Check that what we read is what we wrote
-    ASSERT_THAT(proto_log->ReadProto(written_position),
-                IsOkAndHolds(EqualsProto(document1)));
-
-    // Write a second proto that's close to the max size. Leave some room for
-    // the rest of the proto properties.
-    std::string long_str(max_proto_size_ - 1024, 'a');
-    DocumentProto document2 = DocumentBuilder()
-                                  .SetKey("namespace2", "uri2")
-                                  .AddStringProperty("long_str", long_str)
-                                  .Build();
-
-    ICING_ASSERT_OK_AND_ASSIGN(written_position,
-                               proto_log->WriteProto(document2));
-
-    int document2_offset = written_position;
-    last_offset = written_position;
-    ASSERT_GT(document2_offset, document1_offset);
-
-    // Check the second proto
-    ASSERT_THAT(proto_log->ReadProto(written_position),
-                IsOkAndHolds(EqualsProto(document2)));
-
-    ICING_ASSERT_OK(proto_log->PersistToDisk());
-  }
-
-  {
-    // Make a new proto_log with the same file_path, and make sure we
-    // can still write to the same underlying file.
-    ICING_ASSERT_OK_AND_ASSIGN(
-        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-        FileBackedProtoLog<DocumentProto>::Create(
-            &filesystem_, file_path_,
-            FileBackedProtoLog<DocumentProto>::Options(
-                /*compress_in=*/true, max_proto_size_)));
-    auto recreated_proto_log = std::move(create_result.proto_log);
-    ASSERT_FALSE(create_result.has_data_loss());
-
-    // Write a third proto
-    DocumentProto document3 =
-        DocumentBuilder().SetKey("namespace3", "uri3").Build();
-
-    ASSERT_THAT(recreated_proto_log->WriteProto(document3),
-                IsOkAndHolds(Gt(last_offset)));
-  }
-}
-
 TEST_F(FileBackedProtoLogTest, CorruptHeader) {
   {
     ICING_ASSERT_OK_AND_ASSIGN(
@@ -303,382 +106,6 @@ TEST_F(FileBackedProtoLogTest, CorruptHeader) {
   }
 }
 
-TEST_F(FileBackedProtoLogTest, CorruptContent) {
-  {
-    ICING_ASSERT_OK_AND_ASSIGN(
-        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-        FileBackedProtoLog<DocumentProto>::Create(
-            &filesystem_, file_path_,
-            FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                       max_proto_size_)));
-    auto proto_log = std::move(create_result.proto_log);
-    EXPECT_FALSE(create_result.has_data_loss());
-
-    DocumentProto document =
-        DocumentBuilder().SetKey("namespace1", "uri1").Build();
-
-    // Write and persist an document.
-    ICING_ASSERT_OK_AND_ASSIGN(int document_offset,
-                               proto_log->WriteProto(document));
-    ICING_ASSERT_OK(proto_log->PersistToDisk());
-
-    // "Corrupt" the content written in the log.
-    document.set_uri("invalid");
-    std::string serialized_document = document.SerializeAsString();
-    filesystem_.PWrite(file_path_.c_str(), document_offset,
-                       serialized_document.data(), serialized_document.size());
-  }
-
-  {
-    // We can recover, but we have data loss.
-    ICING_ASSERT_OK_AND_ASSIGN(
-        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-        FileBackedProtoLog<DocumentProto>::Create(
-            &filesystem_, file_path_,
-            FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                       max_proto_size_)));
-    auto proto_log = std::move(create_result.proto_log);
-    ASSERT_TRUE(create_result.has_data_loss());
-    ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE));
-
-    // Lost everything in the log since the rewind position doesn't help if
-    // there's been data corruption within the persisted region
-    ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()),
-              sizeof(FileBackedProtoLog<DocumentProto>::Header));
-  }
-}
-
-TEST_F(FileBackedProtoLogTest, PersistToDisk) {
-  DocumentProto document1 =
-      DocumentBuilder().SetKey("namespace1", "uri1").Build();
-  DocumentProto document2 =
-      DocumentBuilder().SetKey("namespace2", "uri2").Build();
-  int document1_offset, document2_offset;
-  int log_size;
-
-  {
-    ICING_ASSERT_OK_AND_ASSIGN(
-        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-        FileBackedProtoLog<DocumentProto>::Create(
-            &filesystem_, file_path_,
-            FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                       max_proto_size_)));
-    auto proto_log = std::move(create_result.proto_log);
-    ASSERT_FALSE(create_result.has_data_loss());
-
-    // Write and persist the first proto
-    ICING_ASSERT_OK_AND_ASSIGN(document1_offset,
-                               proto_log->WriteProto(document1));
-    ICING_ASSERT_OK(proto_log->PersistToDisk());
-
-    // Write, but don't explicitly persist the second proto
-    ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
-                               proto_log->WriteProto(document2));
-
-    // Check that what we read is what we wrote
-    ASSERT_THAT(proto_log->ReadProto(document1_offset),
-                IsOkAndHolds(EqualsProto(document1)));
-    ASSERT_THAT(proto_log->ReadProto(document2_offset),
-                IsOkAndHolds(EqualsProto(document2)));
-
-    log_size = filesystem_.GetFileSize(file_path_.c_str());
-    ASSERT_GT(log_size, 0);
-  }
-
-  {
-    // The header rewind position and checksum aren't updated in this "system
-    // crash" scenario.
-
-    std::string bad_proto =
-        "some incomplete proto that we didn't finish writing before the system "
-        "crashed";
-    filesystem_.PWrite(file_path_.c_str(), log_size, bad_proto.data(),
-                       bad_proto.size());
-
-    // Double check that we actually wrote something to the underlying file
-    ASSERT_GT(filesystem_.GetFileSize(file_path_.c_str()), log_size);
-  }
-
-  {
-    // We can recover, but we have data loss
-    ICING_ASSERT_OK_AND_ASSIGN(
-        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-        FileBackedProtoLog<DocumentProto>::Create(
-            &filesystem_, file_path_,
-            FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                       max_proto_size_)));
-    auto proto_log = std::move(create_result.proto_log);
-    ASSERT_TRUE(create_result.has_data_loss());
-    ASSERT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL));
-
-    // Check that everything was persisted across instances
-    ASSERT_THAT(proto_log->ReadProto(document1_offset),
-                IsOkAndHolds(EqualsProto(document1)));
-    ASSERT_THAT(proto_log->ReadProto(document2_offset),
-                IsOkAndHolds(EqualsProto(document2)));
-
-    // We correctly rewound to the last good state.
-    ASSERT_EQ(log_size, filesystem_.GetFileSize(file_path_.c_str()));
-  }
-}
-
-TEST_F(FileBackedProtoLogTest, Iterator) {
-  DocumentProto document1 =
-      DocumentBuilder().SetKey("namespace", "uri1").Build();
-  DocumentProto document2 =
-      DocumentBuilder().SetKey("namespace", "uri2").Build();
-
-  ICING_ASSERT_OK_AND_ASSIGN(
-      FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-      FileBackedProtoLog<DocumentProto>::Create(
-          &filesystem_, file_path_,
-          FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                     max_proto_size_)));
-  auto proto_log = std::move(create_result.proto_log);
-  ASSERT_FALSE(create_result.has_data_loss());
-
-  {
-    // Empty iterator
-    auto iterator = proto_log->GetIterator();
-    ASSERT_THAT(iterator.Advance(),
-                StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
-  }
-
-  {
-    // Iterates through some documents
-    ICING_ASSERT_OK(proto_log->WriteProto(document1));
-    ICING_ASSERT_OK(proto_log->WriteProto(document2));
-    auto iterator = proto_log->GetIterator();
-    // 1st proto
-    ICING_ASSERT_OK(iterator.Advance());
-    ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
-                IsOkAndHolds(EqualsProto(document1)));
-    // 2nd proto
-    ICING_ASSERT_OK(iterator.Advance());
-    ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
-                IsOkAndHolds(EqualsProto(document2)));
-    // Tries to advance
-    ASSERT_THAT(iterator.Advance(),
-                StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
-  }
-
-  {
-    // Iterator with bad filesystem
-    MockFilesystem mock_filesystem;
-    ON_CALL(mock_filesystem, GetFileSize(A<const char *>()))
-        .WillByDefault(Return(Filesystem::kBadFileSize));
-    FileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
-        mock_filesystem, file_path_, /*initial_offset=*/0);
-    ASSERT_THAT(bad_iterator.Advance(),
-                StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
-  }
-}
-
-TEST_F(FileBackedProtoLogTest, ComputeChecksum) {
-  DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-  Crc32 checksum;
-
-  {
-    ICING_ASSERT_OK_AND_ASSIGN(
-        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-        FileBackedProtoLog<DocumentProto>::Create(
-            &filesystem_, file_path_,
-            FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                       max_proto_size_)));
-    auto proto_log = std::move(create_result.proto_log);
-    ASSERT_FALSE(create_result.has_data_loss());
-
-    ICING_EXPECT_OK(proto_log->WriteProto(document));
-
-    ICING_ASSERT_OK_AND_ASSIGN(checksum, proto_log->ComputeChecksum());
-
-    // Calling it twice with no changes should get us the same checksum
-    EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
-  }
-
-  {
-    ICING_ASSERT_OK_AND_ASSIGN(
-        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-        FileBackedProtoLog<DocumentProto>::Create(
-            &filesystem_, file_path_,
-            FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                       max_proto_size_)));
-    auto proto_log = std::move(create_result.proto_log);
-    ASSERT_FALSE(create_result.has_data_loss());
-
-    // Checksum should be consistent across instances
-    EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
-
-    // PersistToDisk shouldn't affect the checksum value
-    ICING_EXPECT_OK(proto_log->PersistToDisk());
-    EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
-
-    // Check that modifying the log leads to a different checksum
-    ICING_EXPECT_OK(proto_log->WriteProto(document));
-    EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Not(Eq(checksum))));
-  }
-}
-
-TEST_F(FileBackedProtoLogTest, EraseProtoShouldSetZero) {
-  DocumentProto document1 =
-      DocumentBuilder().SetKey("namespace", "uri1").Build();
-
-  ICING_ASSERT_OK_AND_ASSIGN(
-      FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-      FileBackedProtoLog<DocumentProto>::Create(
-          &filesystem_, file_path_,
-          FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                     max_proto_size_)));
-  auto proto_log = std::move(create_result.proto_log);
-  ASSERT_FALSE(create_result.has_data_loss());
-
-  // Writes and erases proto
-  ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
-                             proto_log->WriteProto(document1));
-  ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
-
-  // Checks if the erased area is set to 0.
-  int64_t file_size = filesystem_.GetFileSize(file_path_.c_str());
-  MemoryMappedFile mmapped_file(filesystem_, file_path_,
-                                MemoryMappedFile::Strategy::READ_ONLY);
-
-  // document1_offset + sizeof(int) is the start byte of the proto where
-  // sizeof(int) is the size of the proto metadata.
-  mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1);
-  for (size_t i = 0; i < mmapped_file.region_size(); ++i) {
-    ASSERT_THAT(mmapped_file.region()[i], Eq(0));
-  }
-}
-
-TEST_F(FileBackedProtoLogTest, EraseProtoShouldReturnNotFound) {
-  DocumentProto document1 =
-      DocumentBuilder().SetKey("namespace", "uri1").Build();
-  DocumentProto document2 =
-      DocumentBuilder().SetKey("namespace", "uri2").Build();
-
-  ICING_ASSERT_OK_AND_ASSIGN(
-      FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-      FileBackedProtoLog<DocumentProto>::Create(
-          &filesystem_, file_path_,
-          FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                     max_proto_size_)));
-  auto proto_log = std::move(create_result.proto_log);
-  ASSERT_FALSE(create_result.has_data_loss());
-
-  // Writes 2 protos
-  ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
-                             proto_log->WriteProto(document1));
-  ICING_ASSERT_OK_AND_ASSIGN(int64_t document2_offset,
-                             proto_log->WriteProto(document2));
-
-  // Erases the first proto
-  ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
-
-  // The first proto has been erased.
-  ASSERT_THAT(proto_log->ReadProto(document1_offset),
-              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-  // The second proto should be returned.
-  ASSERT_THAT(proto_log->ReadProto(document2_offset),
-              IsOkAndHolds(EqualsProto(document2)));
-}
-
-TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) {
-  DocumentProto document1 =
-      DocumentBuilder().SetKey("namespace", "uri1").Build();
-  DocumentProto document2 =
-      DocumentBuilder().SetKey("namespace", "uri2").Build();
-  DocumentProto document3 =
-      DocumentBuilder().SetKey("namespace", "uri3").Build();
-  DocumentProto document4 =
-      DocumentBuilder().SetKey("namespace", "uri4").Build();
-
-  int64_t document2_offset;
-  int64_t document3_offset;
-
-  {
-    // Erase data after the rewind position. This won't update the checksum
-    // immediately.
-    ICING_ASSERT_OK_AND_ASSIGN(
-        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-        FileBackedProtoLog<DocumentProto>::Create(
-            &filesystem_, file_path_,
-            FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                       max_proto_size_)));
-    auto proto_log = std::move(create_result.proto_log);
-    ASSERT_FALSE(create_result.has_data_loss());
-
-    // Writes 3 protos
-    ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
-                               proto_log->WriteProto(document1));
-    ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
-                               proto_log->WriteProto(document2));
-    ICING_ASSERT_OK_AND_ASSIGN(document3_offset,
-                               proto_log->WriteProto(document3));
-
-    // Erases the 1st proto, checksum won't be updated immediately because the
-    // rewind position is 0.
-    ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
-
-    EXPECT_THAT(proto_log->ComputeChecksum(),
-                IsOkAndHolds(Eq(Crc32(2293202502))));
-  }  // New checksum is updated in destructor.
-
-  {
-    // Erase data before the rewind position. This will update the checksum
-    // immediately.
-    ICING_ASSERT_OK_AND_ASSIGN(
-        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-        FileBackedProtoLog<DocumentProto>::Create(
-            &filesystem_, file_path_,
-            FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                       max_proto_size_)));
-    auto proto_log = std::move(create_result.proto_log);
-    ASSERT_FALSE(create_result.has_data_loss());
-
-    // Erases the 2nd proto that is now before the rewind position. Checksum is
-    // updated.
-    ICING_ASSERT_OK(proto_log->EraseProto(document2_offset));
-
-    EXPECT_THAT(proto_log->ComputeChecksum(),
-                IsOkAndHolds(Eq(Crc32(639634028))));
-  }
-
-  {
-    // Append data and erase data before the rewind position. This will update
-    // the checksum twice: in EraseProto() and destructor.
-    ICING_ASSERT_OK_AND_ASSIGN(
-        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-        FileBackedProtoLog<DocumentProto>::Create(
-            &filesystem_, file_path_,
-            FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                       max_proto_size_)));
-    auto proto_log = std::move(create_result.proto_log);
-    ASSERT_FALSE(create_result.has_data_loss());
-
-    // Append a new document which is after the rewind position.
-    ICING_ASSERT_OK(proto_log->WriteProto(document4));
-
-    // Erases the 3rd proto that is now before the rewind position. Checksum is
-    // updated.
-    ICING_ASSERT_OK(proto_log->EraseProto(document3_offset));
-
-    EXPECT_THAT(proto_log->ComputeChecksum(),
-                IsOkAndHolds(Eq(Crc32(1990198693))));
-  }  // Checksum is updated with the newly appended document.
-
-  {
-    // A successful creation means that the checksum matches.
-    ICING_ASSERT_OK_AND_ASSIGN(
-        FileBackedProtoLog<DocumentProto>::CreateResult create_result,
-        FileBackedProtoLog<DocumentProto>::Create(
-            &filesystem_, file_path_,
-            FileBackedProtoLog<DocumentProto>::Options(compress_,
-                                                       max_proto_size_)));
-    auto proto_log = std::move(create_result.proto_log);
-    EXPECT_FALSE(create_result.has_data_loss());
-  }
-}
-
 }  // namespace
 }  // namespace lib
 }  // namespace icing
diff --git a/icing/icing-search-engine_benchmark.cc b/icing/icing-search-engine_benchmark.cc
index ba9aed1..5e610d5 100644
--- a/icing/icing-search-engine_benchmark.cc
+++ b/icing/icing-search-engine_benchmark.cc
@@ -43,7 +43,6 @@
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/document-generator.h"
 #include "icing/testing/random-string.h"
-#include "icing/testing/recorder-test-utils.h"
 #include "icing/testing/schema-generator.h"
 #include "icing/testing/tmp-directory.h"
 
@@ -178,12 +177,12 @@ class DestructibleDirectory {
 };
 
 std::vector<DocumentProto> GenerateRandomDocuments(
-    EvenDistributionTypeSelector* type_selector, int num_docs) {
+    EvenDistributionTypeSelector* type_selector, int num_docs,
+    const std::vector<std::string>& language) {
   std::vector<std::string> namespaces = CreateNamespaces(kAvgNumNamespaces);
   EvenDistributionNamespaceSelector namespace_selector(namespaces);
 
   std::default_random_engine random;
-  std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
   UniformDistributionLanguageTokenGenerator<std::default_random_engine>
       token_generator(language, &random);
 
@@ -227,8 +226,9 @@ void BM_IndexLatency(benchmark::State& state) {
   ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
 
   int num_docs = state.range(0);
+  std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
   const std::vector<DocumentProto> random_docs =
-      GenerateRandomDocuments(&type_selector, num_docs);
+      GenerateRandomDocuments(&type_selector, num_docs, language);
   Timer timer;
   for (const DocumentProto& doc : random_docs) {
     ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk());
@@ -271,6 +271,56 @@ BENCHMARK(BM_IndexLatency)
     ->ArgPair(1 << 15, 10)
     ->ArgPair(1 << 17, 10);
 
+// Measures Search() latency. state.range(0) is the number of documents put
+// into the index; state.range(1) is the num_properties given to the schema.
+void BM_QueryLatency(benchmark::State& state) {
+  // Initialize the filesystem
+  std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+  Filesystem filesystem;
+  DestructibleDirectory ddir(filesystem, test_dir);
+
+  // Create the schema.
+  std::default_random_engine random;
+  int num_types = kAvgNumNamespaces * kAvgNumTypes;
+  ExactStringPropertyGenerator property_generator;
+  SchemaGenerator<ExactStringPropertyGenerator> schema_generator(
+      /*num_properties=*/state.range(1), &property_generator);
+  SchemaProto schema = schema_generator.GenerateSchema(num_types);
+  EvenDistributionTypeSelector type_selector(schema);
+
+  // Create the index.
+  IcingSearchEngineOptions options;
+  options.set_base_dir(test_dir);
+  options.set_index_merge_size(kIcingFullIndexSize);
+  auto icing = std::make_unique<IcingSearchEngine>(options);
+  ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+  ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+  // Index all documents up front; only Search() runs inside the timed loop.
+  int num_docs = state.range(0);
+  std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
+  const std::vector<DocumentProto> random_docs =
+      GenerateRandomDocuments(&type_selector, num_docs, language);
+  for (const DocumentProto& doc : random_docs) {
+    ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk());
+  }
+  // Prefix query for language.at(0), ranked by creation timestamp.
+  SearchSpecProto search_spec = CreateSearchSpec(
+      language.at(0), std::vector<std::string>(), TermMatchType::PREFIX);
+  ResultSpecProto result_spec = CreateResultSpec(1000000, 1000000, 1000000);
+  ScoringSpecProto scoring_spec =
+      CreateScoringSpec(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+  for (auto _ : state) {
+    SearchResultProto results =
+        icing->Search(search_spec, scoring_spec, result_spec);
+  }
+}
+BENCHMARK(BM_QueryLatency)
+    // Arguments: num_indexed_documents, num_sections
+    ->ArgPair(32, 2)
+    ->ArgPair(128, 2)
+    ->ArgPair(1 << 10, 2)
+    ->ArgPair(1 << 13, 2);
+
 void BM_IndexThroughput(benchmark::State& state) {
   // Initialize the filesystem
   std::string test_dir = GetTestTempDir() + "/icing/benchmark";
@@ -297,8 +347,9 @@ void BM_IndexThroughput(benchmark::State& state) {
   ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
 
   int num_docs = state.range(0);
+  std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
   const std::vector<DocumentProto> random_docs =
-      GenerateRandomDocuments(&type_selector, num_docs);
+      GenerateRandomDocuments(&type_selector, num_docs, language);
   for (auto s : state) {
     for (const DocumentProto& doc : random_docs) {
       ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk());
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index 33b343e..c46762e 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -109,7 +109,7 @@ class TokenMatcherExact : public TokenMatcher {
     }
     if (itr != unrestricted_query_terms_.end() &&
         itr != restricted_query_terms_.end()) {
-      return normalizer_.CalculateNormalizedMatchLength(token.text, *itr);
+      return normalizer_.FindNormalizedMatchEndPosition(token.text, *itr);
     }
     return CharacterIterator(token.text, -1, -1, -1);
   }
@@ -135,14 +135,14 @@ class TokenMatcherPrefix : public TokenMatcher {
     for (const std::string& query_term : unrestricted_query_terms_) {
       if (query_term.length() <= s.length() &&
           s.compare(0, query_term.length(), query_term) == 0) {
-        return normalizer_.CalculateNormalizedMatchLength(token.text,
+        return normalizer_.FindNormalizedMatchEndPosition(token.text,
                                                           query_term);
       }
     }
     for (const std::string& query_term : restricted_query_terms_) {
       if (query_term.length() <= s.length() &&
           s.compare(0, query_term.length(), query_term) == 0) {
-        return normalizer_.CalculateNormalizedMatchLength(token.text,
+        return normalizer_.FindNormalizedMatchEndPosition(token.text,
                                                           query_term);
       }
     }
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index ad70038..f811941 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -691,10 +691,7 @@ TEST_F(SnippetRetrieverTest, PrefixSnippeting) {
   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
               ElementsAre("subject foo"));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
-
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("f"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("f"));
 }
 
 TEST_F(SnippetRetrieverTest, ExactSnippeting) {
@@ -738,9 +735,7 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) {
       GetString(&document, snippet.entries(0).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre(""));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));
 }
 
 TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) {
@@ -787,19 +782,15 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) {
           "we need to begin considering our options regarding body bar."));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
               ElementsAre("foo", "bar"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
-                ElementsAre("foo", "bar"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+              ElementsAre("foo", "bar"));
 
   EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
   content = GetString(&document, snippet.entries(1).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(1)),
               ElementsAre("subject foo"));
   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
 }
 
 TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) {
@@ -849,10 +840,8 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) {
           "we need to begin considering our options regarding body bar."));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
               ElementsAre("foo", "bar"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
-                ElementsAre("foo", "bar"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+              ElementsAre("foo", "bar"));
 }
 
 TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
@@ -903,20 +892,16 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
           "Concerning the subject of foo, we need to begin considering our"));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
               ElementsAre("subject", "foo"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
-                ElementsAre("subject", "foo"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+              ElementsAre("subject", "foo"));
 
   EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
   content = GetString(&document, snippet.entries(1).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(1)),
               ElementsAre("subject foo"));
   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("subject"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
-                ElementsAre("subject"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
+              ElementsAre("subject"));
 }
 
 TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) {
@@ -960,18 +945,14 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) {
       ElementsAre(
           "Concerning the subject of foo, we need to begin considering our"));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));
 
   EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
   content = GetString(&document, snippet.entries(1).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(1)),
               ElementsAre("subject foo"));
   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
 }
 
 TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) {
@@ -993,9 +974,7 @@ TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) {
       GetString(&document, snippet.entries(0).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("MDI team"));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("MDI"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("MD"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("MD"));
 }
 
 TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) {
@@ -1020,10 +999,8 @@ TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) {
               ElementsAre("Some members are in Zürich."));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("Zürich"));
 
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
-                ElementsAre("Zürich"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+              ElementsAre("Zürich"));
 }
 
 TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) {
@@ -1084,20 +1061,13 @@ TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) {
       GetString(&document, snippet.entries(0).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
-                ElementsAre("polo"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
 
   EXPECT_THAT(snippet.entries(1).property_name(), Eq("X[3]"));
   content = GetString(&document, snippet.entries(1).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
-
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
-                ElementsAre("polo"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
 
   EXPECT_THAT(GetPropertyPaths(snippet),
               ElementsAre("X[1]", "X[3]", "Y[1]", "Y[3]", "Z[1]", "Z[3]"));
@@ -1194,19 +1164,13 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) {
       GetString(&document, snippet.entries(0).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
-                ElementsAre("polo"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
 
   EXPECT_THAT(snippet.entries(1).property_name(), Eq("A.X[3]"));
   content = GetString(&document, snippet.entries(1).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
-                ElementsAre("polo"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
 
   EXPECT_THAT(
       GetPropertyPaths(snippet),
@@ -1309,19 +1273,13 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) {
       GetString(&document, snippet.entries(0).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
-                ElementsAre("polo"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
 
   EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[0].X[3]"));
   content = GetString(&document, snippet.entries(1).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
-                ElementsAre("polo"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
 
   EXPECT_THAT(GetPropertyPaths(snippet),
               ElementsAre("A[0].X[1]", "A[0].X[3]", "A[1].X[1]", "A[1].X[3]",
@@ -1422,19 +1380,13 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) {
       GetString(&document, snippet.entries(0).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
-                ElementsAre("polo"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
 
   EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[1].X"));
   content = GetString(&document, snippet.entries(1).property_name());
   EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
-                ElementsAre("polo"));
-  }
+  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
 
   EXPECT_THAT(
       GetPropertyPaths(snippet),
@@ -1478,16 +1430,12 @@ TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) {
 
   // Ensure that the match is correct.
   EXPECT_THAT(GetMatches(content, *entry), ElementsAre("走路"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("走"));
-  }
+  EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("走"));
 
   // Ensure that the utf-16 values are also as expected
   EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3));
   EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(1));
-  }
+  EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(1));
 }
 
 TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
@@ -1587,16 +1535,12 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) {
 
   // Ensure that the match is correct.
   EXPECT_THAT(GetMatches(content, *entry), ElementsAre("𐀂𐀃"));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("𐀂"));
-  }
+  EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("𐀂"));
 
   // Ensure that the utf-16 values are also as expected
   EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5));
   EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4));
-  if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
-    EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
-  }
+  EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
 }
 
 TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
index cb31441..598ede7 100644
--- a/icing/tokenization/icu/icu-language-segmenter.cc
+++ b/icing/tokenization/icu/icu-language-segmenter.cc
@@ -300,9 +300,10 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
 
     UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
                                                term_start_index_);
-    // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
-    // We know it's an alphabetic term by checking the first unicode character.
-    if (u_isUAlphabetic(uchar32)) {
+    // Rule 2: for non-ASCII terms, only the alphanumeric terms are returned.
+    // We know it's an alphanumeric term by checking the first unicode
+    // character.
+    if (i18n_utils::IsAlphaNumeric(uchar32)) {
       return true;
     }
     return false;
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index 01eb7d8..3090087 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -372,6 +372,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) {
               IsOkAndHolds(ElementsAre("-", "123")));
 }
 
+TEST_P(IcuLanguageSegmenterAllLocalesTest, FullWidthNumbers) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+  EXPECT_THAT(language_segmenter->GetAllTerms("０１２３４５６７８９"),
+              IsOkAndHolds(ElementsAre("０１２３４５６７８９")));
+}
+
 TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index 76219b5..b936f2b 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -291,9 +291,12 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
       return true;
     }
 
-    // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
-    // We know it's an alphabetic term by checking the first unicode character.
-    if (i18n_utils::IsAlphabeticAt(text_, term_start_.utf8_index())) {
+    UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
+                                               term_start_.utf8_index());
+    // Rule 2: for non-ASCII terms, only the alphanumeric terms are returned.
+    // We know it's an alphanumeric term by checking the first unicode
+    // character.
+    if (i18n_utils::IsAlphaNumeric(uchar32)) {
       return true;
     }
     return false;
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
index b1a8f72..45d6475 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
@@ -366,6 +366,17 @@ TEST_P(ReverseJniLanguageSegmenterTest, Number) {
               IsOkAndHolds(ElementsAre("-", "123")));
 }
 
+TEST_P(ReverseJniLanguageSegmenterTest, FullWidthNumbers) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      auto language_segmenter,
+      language_segmenter_factory::Create(
+          GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+
+  EXPECT_THAT(language_segmenter->GetAllTerms("０１２３４５６７８９"),
+              IsOkAndHolds(ElementsAre("０", "１", "２", "３", "４", "５", "６",
+                                       "７", "８", "９")));
+}
+
 TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespaces) {
   ICING_ASSERT_OK_AND_ASSIGN(
       auto language_segmenter,
diff --git a/icing/transform/icu/icu-normalizer-factory.cc b/icing/transform/icu/icu-normalizer-factory.cc
index 9951325..493aeb5 100644
--- a/icing/transform/icu/icu-normalizer-factory.cc
+++ b/icing/transform/icu/icu-normalizer-factory.cc
@@ -44,8 +44,6 @@ libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
   return IcuNormalizer::Create(max_term_byte_size);
 }
 
-std::string_view GetNormalizerName() { return IcuNormalizer::kName; }
-
 }  // namespace normalizer_factory
 
 }  // namespace lib
diff --git a/icing/transform/icu/icu-normalizer.cc b/icing/transform/icu/icu-normalizer.cc
index eb0eead..250d6cf 100644
--- a/icing/transform/icu/icu-normalizer.cc
+++ b/icing/transform/icu/icu-normalizer.cc
@@ -29,6 +29,7 @@
 #include "icing/util/status-macros.h"
 #include "unicode/umachine.h"
 #include "unicode/unorm2.h"
+#include "unicode/ustring.h"
 #include "unicode/utrans.h"
 
 namespace icing {
@@ -157,14 +158,18 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2,
                                           const std::string_view term) const {
   std::string result;
   result.reserve(term.length());
-  for (int i = 0; i < term.length(); i++) {
-    if (i18n_utils::IsAscii(term[i])) {
-      result.push_back(std::tolower(term[i]));
-    } else if (i18n_utils::IsLeadUtf8Byte(term[i])) {
-      UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i);
+  int current_pos = 0;
+  while (current_pos < term.length()) {
+    if (i18n_utils::IsAscii(term[current_pos])) {
+      result.push_back(std::tolower(term[current_pos]));
+      ++current_pos;
+    } else {
+      UChar32 uchar32 =
+          i18n_utils::GetUChar32At(term.data(), term.length(), current_pos);
       if (uchar32 == i18n_utils::kInvalidUChar32) {
         ICING_LOG(WARNING) << "Unable to get uchar32 from " << term
-                           << " at position" << i;
+                           << " at position" << current_pos;
+        current_pos += i18n_utils::GetUtf8Length(uchar32);
         continue;
       }
       char ascii_char;
@@ -177,8 +182,9 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2,
         // tokenized. We handle it here in case there're something wrong with
         // the tokenizers.
         int utf8_length = i18n_utils::GetUtf8Length(uchar32);
-        absl_ports::StrAppend(&result, term.substr(i, utf8_length));
+        absl_ports::StrAppend(&result, term.substr(current_pos, utf8_length));
       }
+      current_pos += i18n_utils::GetUtf8Length(uchar32);
     }
   }
 
@@ -261,5 +267,103 @@ std::string IcuNormalizer::TermTransformer::Transform(
   return std::move(utf8_term_or).ValueOrDie();
 }
 
+CharacterIterator FindNormalizedLatinMatchEndPosition(
+    const UNormalizer2* normalizer2, std::string_view term,
+    CharacterIterator char_itr, std::string_view normalized_term) {
+  CharacterIterator normalized_char_itr(normalized_term);
+  char ascii_char;
+  while (char_itr.utf8_index() < term.length() &&
+         normalized_char_itr.utf8_index() < normalized_term.length()) {
+    UChar32 c = char_itr.GetCurrentChar();
+    if (i18n_utils::IsAscii(c)) {
+      c = std::tolower(c);
+    } else if (DiacriticCharToAscii(normalizer2, c, &ascii_char)) {
+      c = ascii_char;
+    }
+    UChar32 normalized_c = normalized_char_itr.GetCurrentChar();
+    if (c != normalized_c) {
+      return char_itr;
+    }
+    char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
+    normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1);
+  }
+  return char_itr;
+}
+
+CharacterIterator
+IcuNormalizer::TermTransformer::FindNormalizedNonLatinMatchEndPosition(
+    std::string_view term, CharacterIterator char_itr,
+    std::string_view normalized_term) const {
+  CharacterIterator normalized_char_itr(normalized_term);
+  UErrorCode status = U_ZERO_ERROR;
+
+  constexpr int kUtf16CharBufferLength = 6;
+  UChar c16[kUtf16CharBufferLength];
+  int32_t c16_length;
+  int32_t limit;
+
+  constexpr int kUtf32CharBufferLength = 3;
+  UChar32 normalized_buffer[kUtf32CharBufferLength];
+  int32_t c32_length;
+  while (char_itr.utf8_index() < term.length() &&
+         normalized_char_itr.utf8_index() < normalized_term.length()) {
+    UChar32 c = char_itr.GetCurrentChar();
+    u_strFromUTF32(c16, kUtf16CharBufferLength, &c16_length, &c,
+                   /*srcLength=*/1, &status);
+    if (U_FAILURE(status)) {
+      break;
+    }
+
+    limit = c16_length;
+    utrans_transUChars(u_transliterator_, c16, &c16_length,
+                       kUtf16CharBufferLength,
+                       /*start=*/0, &limit, &status);
+    if (U_FAILURE(status)) {
+      break;
+    }
+
+    u_strToUTF32(normalized_buffer, kUtf32CharBufferLength, &c32_length, c16,
+                 c16_length, &status);
+    if (U_FAILURE(status)) {
+      break;
+    }
+
+    for (int i = 0; i < c32_length; ++i) {
+      UChar32 normalized_c = normalized_char_itr.GetCurrentChar();
+      if (normalized_buffer[i] != normalized_c) {
+        return char_itr;
+      }
+      normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1);
+    }
+    char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
+  }
+  if (U_FAILURE(status)) {
+    // Failed to transform; return the position matched so far.
+    ICING_LOG(WARNING) << "Failed to normalize UTF8 term: " << term;
+  }
+  return char_itr;
+}
+
+CharacterIterator IcuNormalizer::FindNormalizedMatchEndPosition(
+    std::string_view term, std::string_view normalized_term) const {
+  UErrorCode status = U_ZERO_ERROR;
+  // ICU manages the singleton instance
+  const UNormalizer2* normalizer2 = unorm2_getNFCInstance(&status);
+  if (U_FAILURE(status)) {
+    ICING_LOG(WARNING) << "Failed to create a UNormalizer2 instance";
+  }
+
+  CharacterIterator char_itr(term);
+  UChar32 first_uchar32 = char_itr.GetCurrentChar();
+  if (normalizer2 != nullptr && first_uchar32 != i18n_utils::kInvalidUChar32 &&
+      DiacriticCharToAscii(normalizer2, first_uchar32, /*char_out=*/nullptr)) {
+    return FindNormalizedLatinMatchEndPosition(normalizer2, term, char_itr,
+                                               normalized_term);
+  } else {
+    return term_transformer_->FindNormalizedNonLatinMatchEndPosition(
+        term, char_itr, normalized_term);
+  }
+}
+
 }  // namespace lib
 }  // namespace icing
diff --git a/icing/transform/icu/icu-normalizer.h b/icing/transform/icu/icu-normalizer.h
index 4442f3b..d4f1ebd 100644
--- a/icing/transform/icu/icu-normalizer.h
+++ b/icing/transform/icu/icu-normalizer.h
@@ -21,6 +21,7 @@
 
 #include "icing/text_classifier/lib3/utils/base/statusor.h"
 #include "icing/transform/normalizer.h"
+#include "icing/util/character-iterator.h"
 #include "unicode/unorm2.h"
 #include "unicode/utrans.h"
 
@@ -39,8 +40,6 @@ namespace lib {
 // details.
 class IcuNormalizer : public Normalizer {
  public:
-  static constexpr std::string_view kName = "IcuNormalizer";
-
   // Creates a normalizer with the subcomponents it needs. max_term_byte_size
   // enforces the max size of text after normalization, text will be truncated
   // if exceeds the max size.
@@ -58,6 +57,17 @@ class IcuNormalizer : public Normalizer {
   // result in the non-Latin characters not properly being normalized
   std::string NormalizeTerm(std::string_view term) const override;
 
+  // Returns a CharacterIterator pointing to one past the end of the segment of
+  // term that (once normalized) matches with normalized_term.
+  //
+  // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return
+  // CharacterIterator(u8:4, u16:4, u32:4).
+  //
+  // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return
+  // CharacterIterator(u8:0, u16:0, u32:0).
+  CharacterIterator FindNormalizedMatchEndPosition(
+      std::string_view term, std::string_view normalized_term) const override;
+
  private:
   // A handler class that helps manage the lifecycle of UTransliterator. It's
   // used in IcuNormalizer to transform terms into the formats we need.
@@ -77,6 +87,12 @@ class IcuNormalizer : public Normalizer {
     // Transforms the text based on our rules described at top of this file
     std::string Transform(std::string_view term) const;
 
+    // Returns a CharacterIterator pointing to one past the end of the segment
+    // of a non-latin term that (once normalized) matches with normalized_term.
+    CharacterIterator FindNormalizedNonLatinMatchEndPosition(
+        std::string_view term, CharacterIterator char_itr,
+        std::string_view normalized_term) const;
+
    private:
     explicit TermTransformer(UTransliterator* u_transliterator);
 
diff --git a/icing/transform/icu/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc
index b037538..8d09be2 100644
--- a/icing/transform/icu/icu-normalizer_benchmark.cc
+++ b/icing/transform/icu/icu-normalizer_benchmark.cc
@@ -161,6 +161,124 @@ BENCHMARK(BM_NormalizeHiragana)
     ->Arg(2048000)
     ->Arg(4096000);
 
+void BM_UppercaseSubTokenLength(benchmark::State& state) {
+  bool run_via_adb = absl::GetFlag(FLAGS_adb);
+  if (!run_via_adb) {
+    ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+        GetTestFilePath("icing/icu.dat")));
+  }
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Normalizer> normalizer,
+      normalizer_factory::Create(
+
+          /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+  std::string input_string(state.range(0), 'A');
+  std::string normalized_input_string(state.range(0), 'a');
+  for (auto _ : state) {
+    normalizer->FindNormalizedMatchEndPosition(input_string,
+                                               normalized_input_string);
+  }
+}
+BENCHMARK(BM_UppercaseSubTokenLength)
+    ->Arg(1000)
+    ->Arg(2000)
+    ->Arg(4000)
+    ->Arg(8000)
+    ->Arg(16000)
+    ->Arg(32000)
+    ->Arg(64000)
+    ->Arg(128000)
+    ->Arg(256000)
+    ->Arg(384000)
+    ->Arg(512000)
+    ->Arg(1024000)
+    ->Arg(2048000)
+    ->Arg(4096000);
+
+void BM_AccentSubTokenLength(benchmark::State& state) {
+  bool run_via_adb = absl::GetFlag(FLAGS_adb);
+  if (!run_via_adb) {
+    ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+        GetTestFilePath("icing/icu.dat")));
+  }
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Normalizer> normalizer,
+      normalizer_factory::Create(
+
+          /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+  std::string input_string;
+  std::string normalized_input_string;
+  while (input_string.length() < state.range(0)) {
+    input_string.append("àáâãā");
+    normalized_input_string.append("aaaaa");
+  }
+
+  for (auto _ : state) {
+    normalizer->FindNormalizedMatchEndPosition(input_string,
+                                               normalized_input_string);
+  }
+}
+BENCHMARK(BM_AccentSubTokenLength)
+    ->Arg(1000)
+    ->Arg(2000)
+    ->Arg(4000)
+    ->Arg(8000)
+    ->Arg(16000)
+    ->Arg(32000)
+    ->Arg(64000)
+    ->Arg(128000)
+    ->Arg(256000)
+    ->Arg(384000)
+    ->Arg(512000)
+    ->Arg(1024000)
+    ->Arg(2048000)
+    ->Arg(4096000);
+
+void BM_HiraganaSubTokenLength(benchmark::State& state) {
+  bool run_via_adb = absl::GetFlag(FLAGS_adb);
+  if (!run_via_adb) {
+    ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+        GetTestFilePath("icing/icu.dat")));
+  }
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Normalizer> normalizer,
+      normalizer_factory::Create(
+
+          /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+  std::string input_string;
+  std::string normalized_input_string;
+  while (input_string.length() < state.range(0)) {
+    input_string.append("あいうえお");
+    normalized_input_string.append("アイウエオ");
+  }
+
+  for (auto _ : state) {
+    normalizer->FindNormalizedMatchEndPosition(input_string,
+                                               normalized_input_string);
+  }
+}
+BENCHMARK(BM_HiraganaSubTokenLength)
+    ->Arg(1000)
+    ->Arg(2000)
+    ->Arg(4000)
+    ->Arg(8000)
+    ->Arg(16000)
+    ->Arg(32000)
+    ->Arg(64000)
+    ->Arg(128000)
+    ->Arg(256000)
+    ->Arg(384000)
+    ->Arg(512000)
+    ->Arg(1024000)
+    ->Arg(2048000)
+    ->Arg(4096000);
+
 }  // namespace
 
 }  // namespace lib
diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc
index f5d20ff..a46fcc7 100644
--- a/icing/transform/icu/icu-normalizer_test.cc
+++ b/icing/transform/icu/icu-normalizer_test.cc
@@ -231,6 +231,104 @@ TEST_F(IcuNormalizerTest, Truncate) {
   }
 }
 
+TEST_F(IcuNormalizerTest, PrefixMatchLength) {
+  // Verify that FindNormalizedMatchEndPosition will properly find the length of
+  // the prefix match when given a non-normalized term and a normalized term
+  // that is a prefix of the non-normalized one (once it is normalized).
+  ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+                                                  /*max_term_byte_size=*/1000));
+
+  // Upper to lower
+  std::string term = "MDI";
+  CharacterIterator match_end =
+      normalizer->FindNormalizedMatchEndPosition(term, "md");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("MD"));
+
+  term = "Icing";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "icin");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Icin"));
+
+  // Full-width
+  term = "５２５６００";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "525");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("５２５"));
+
+  term = "ＦＵＬＬＷＩＤＴＨ";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "full");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ＦＵＬＬ"));
+
+  // Hiragana to Katakana
+  term = "あいうえお";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイ");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
+
+  term = "かきくけこ";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "カ");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
+
+  // Latin accents
+  term = "Zürich";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "zur");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
+
+  term = "après-midi";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "apre");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
+
+  term = "Buenos días";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "buenos di");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Buenos dí"));
+}
+
+TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) {
+  // Verify that FindNormalizedMatchEndPosition will properly find the length of
+  // the prefix match when given a non-normalized term and a normalized term
+  // that share a common prefix.
+  ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+                                                  /*max_term_byte_size=*/1000));
+
+  // Upper to lower
+  std::string term = "MDI";
+  CharacterIterator match_end =
+      normalizer->FindNormalizedMatchEndPosition(term, "mgm");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("M"));
+
+  term = "Icing";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "icky");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Ic"));
+
+  // Full-width
+  term = "５２５６００";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "525788");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("５２５"));
+
+  term = "ＦＵＬＬＷＩＤＴＨ";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "fully");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ＦＵＬＬ"));
+
+  // Hiragana to Katakana
+  term = "あいうえお";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイエオ");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
+
+  term = "かきくけこ";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "カケコ");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
+
+  // Latin accents
+  term = "Zürich";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "zurg");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
+
+  term = "après-midi";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "apreciate");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
+
+  term = "días";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "diamond");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("día"));
+}
+
 }  // namespace
 }  // namespace lib
 }  // namespace icing
diff --git a/icing/transform/map/map-normalizer-factory.cc b/icing/transform/map/map-normalizer-factory.cc
index 286b8f6..3bf84b3 100644
--- a/icing/transform/map/map-normalizer-factory.cc
+++ b/icing/transform/map/map-normalizer-factory.cc
@@ -42,8 +42,6 @@ libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
   return std::make_unique<MapNormalizer>(max_term_byte_size);
 }
 
-std::string_view GetNormalizerName() { return MapNormalizer::kName; }
-
 }  // namespace normalizer_factory
 
 }  // namespace lib
diff --git a/icing/transform/map/map-normalizer.cc b/icing/transform/map/map-normalizer.cc
index 4ad5dec..95aa633 100644
--- a/icing/transform/map/map-normalizer.cc
+++ b/icing/transform/map/map-normalizer.cc
@@ -42,10 +42,16 @@ UChar32 NormalizeChar(UChar32 c) {
   }
 
   // The original character can be encoded into a single char16_t.
-  const std::unordered_map<char16_t, char16_t>& normalization_map =
+  const std::unordered_map<char16_t, char16_t>* normalization_map =
       GetNormalizationMap();
-  auto iterator = normalization_map.find(static_cast<char16_t>(c));
-  if (iterator == normalization_map.end()) {
+  if (normalization_map == nullptr) {
+    // Normalization map couldn't be properly initialized, return the original
+    // character.
+    ICING_LOG(WARNING) << "Unable to get a valid pointer to normalization map!";
+    return c;
+  }
+  auto iterator = normalization_map->find(static_cast<char16_t>(c));
+  if (iterator == normalization_map->end()) {
     // Normalization mapping not found, append the original character.
     return c;
   }
@@ -99,7 +105,7 @@ std::string MapNormalizer::NormalizeTerm(std::string_view term) const {
   return normalized_text;
 }
 
-CharacterIterator MapNormalizer::CalculateNormalizedMatchLength(
+CharacterIterator MapNormalizer::FindNormalizedMatchEndPosition(
     std::string_view term, std::string_view normalized_term) const {
   CharacterIterator char_itr(term);
   CharacterIterator normalized_char_itr(normalized_term);
diff --git a/icing/transform/map/map-normalizer.h b/icing/transform/map/map-normalizer.h
index 8fbe83b..ed996ae 100644
--- a/icing/transform/map/map-normalizer.h
+++ b/icing/transform/map/map-normalizer.h
@@ -26,8 +26,6 @@ namespace lib {
 
 class MapNormalizer : public Normalizer {
  public:
-  static constexpr std::string_view kName = "MapNormalizer";
-
   explicit MapNormalizer(int max_term_byte_size)
       : max_term_byte_size_(max_term_byte_size){};
 
@@ -45,12 +43,12 @@ class MapNormalizer : public Normalizer {
   // Returns a CharacterIterator pointing to one past the end of the segment of
   // term that (once normalized) matches with normalized_term.
   //
-  // Ex. CalculateNormalizedMatchLength("YELLOW", "yell") will return
+  // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return
   // CharacterIterator(u8:4, u16:4, u32:4).
   //
-  // Ex. CalculateNormalizedMatchLength("YELLOW", "red") will return
+  // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return
   // CharacterIterator(u8:0, u16:0, u32:0).
-  CharacterIterator CalculateNormalizedMatchLength(
+  CharacterIterator FindNormalizedMatchEndPosition(
       std::string_view term, std::string_view normalized_term) const override;
 
  private:
diff --git a/icing/transform/map/map-normalizer_benchmark.cc b/icing/transform/map/map-normalizer_benchmark.cc
index 691afc6..8268541 100644
--- a/icing/transform/map/map-normalizer_benchmark.cc
+++ b/icing/transform/map/map-normalizer_benchmark.cc
@@ -143,6 +143,104 @@ BENCHMARK(BM_NormalizeHiragana)
     ->Arg(2048000)
     ->Arg(4096000);
 
+void BM_UppercaseSubTokenLength(benchmark::State& state) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Normalizer> normalizer,
+      normalizer_factory::Create(
+
+          /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+  std::string input_string(state.range(0), 'A');
+  std::string normalized_input_string(state.range(0), 'a');
+  for (auto _ : state) {
+    normalizer->FindNormalizedMatchEndPosition(input_string,
+                                               normalized_input_string);
+  }
+}
+BENCHMARK(BM_UppercaseSubTokenLength)
+    ->Arg(1000)
+    ->Arg(2000)
+    ->Arg(4000)
+    ->Arg(8000)
+    ->Arg(16000)
+    ->Arg(32000)
+    ->Arg(64000)
+    ->Arg(128000)
+    ->Arg(256000)
+    ->Arg(384000)
+    ->Arg(512000)
+    ->Arg(1024000)
+    ->Arg(2048000)
+    ->Arg(4096000);
+
+void BM_AccentSubTokenLength(benchmark::State& state) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Normalizer> normalizer,
+      normalizer_factory::Create(
+          /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+  std::string input_string;
+  std::string normalized_input_string;
+  while (input_string.length() < state.range(0)) {
+    input_string.append("àáâãā");
+    normalized_input_string.append("aaaaa");
+  }
+
+  for (auto _ : state) {
+    normalizer->FindNormalizedMatchEndPosition(input_string,
+                                               normalized_input_string);
+  }
+}
+BENCHMARK(BM_AccentSubTokenLength)
+    ->Arg(1000)
+    ->Arg(2000)
+    ->Arg(4000)
+    ->Arg(8000)
+    ->Arg(16000)
+    ->Arg(32000)
+    ->Arg(64000)
+    ->Arg(128000)
+    ->Arg(256000)
+    ->Arg(384000)
+    ->Arg(512000)
+    ->Arg(1024000)
+    ->Arg(2048000)
+    ->Arg(4096000);
+
+void BM_HiraganaSubTokenLength(benchmark::State& state) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Normalizer> normalizer,
+      normalizer_factory::Create(
+          /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+  std::string input_string;
+  std::string normalized_input_string;
+  while (input_string.length() < state.range(0)) {
+    input_string.append("あいうえお");
+    normalized_input_string.append("アイウエオ");
+  }
+
+  for (auto _ : state) {
+    normalizer->FindNormalizedMatchEndPosition(input_string,
+                                               normalized_input_string);
+  }
+}
+BENCHMARK(BM_HiraganaSubTokenLength)
+    ->Arg(1000)
+    ->Arg(2000)
+    ->Arg(4000)
+    ->Arg(8000)
+    ->Arg(16000)
+    ->Arg(32000)
+    ->Arg(64000)
+    ->Arg(128000)
+    ->Arg(256000)
+    ->Arg(384000)
+    ->Arg(512000)
+    ->Arg(1024000)
+    ->Arg(2048000)
+    ->Arg(4096000);
+
 }  // namespace
 
 }  // namespace lib
diff --git a/icing/transform/map/map-normalizer_test.cc b/icing/transform/map/map-normalizer_test.cc
index 26fdd4a..adc5623 100644
--- a/icing/transform/map/map-normalizer_test.cc
+++ b/icing/transform/map/map-normalizer_test.cc
@@ -201,51 +201,103 @@ TEST(MapNormalizerTest, Truncate) {
 }
 
 TEST(MapNormalizerTest, PrefixMatchLength) {
+  // Verify that FindNormalizedMatchEndPosition will properly find the length of
+  // the prefix match when given a non-normalized term and a normalized term
+  // that is a prefix of the non-normalized one (once it is normalized).
   ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
                                                   /*max_term_byte_size=*/1000));
 
   // Upper to lower
   std::string term = "MDI";
   CharacterIterator match_end =
-      normalizer->CalculateNormalizedMatchLength(term, "md");
+      normalizer->FindNormalizedMatchEndPosition(term, "md");
   EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("MD"));
 
   term = "Icing";
-  match_end = normalizer->CalculateNormalizedMatchLength(term, "icin");
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "icin");
   EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Icin"));
 
   // Full-width
   term = "５２５６００";
-  match_end = normalizer->CalculateNormalizedMatchLength(term, "525");
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "525");
   EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("５２５"));
 
   term = "ＦＵＬＬＷＩＤＴＨ";
-  match_end = normalizer->CalculateNormalizedMatchLength(term, "full");
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "full");
   EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ＦＵＬＬ"));
 
   // Hiragana to Katakana
   term = "あいうえお";
-  match_end = normalizer->CalculateNormalizedMatchLength(term, "アイ");
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイ");
   EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
 
   term = "かきくけこ";
-  match_end = normalizer->CalculateNormalizedMatchLength(term, "カ");
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "カ");
   EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
 
   // Latin accents
   term = "Zürich";
-  match_end = normalizer->CalculateNormalizedMatchLength(term, "zur");
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "zur");
   EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
 
   term = "après-midi";
-  match_end = normalizer->CalculateNormalizedMatchLength(term, "apre");
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "apre");
   EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
 
   term = "Buenos días";
-  match_end = normalizer->CalculateNormalizedMatchLength(term, "buenos di");
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "buenos di");
   EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Buenos dí"));
 }
 
+TEST(MapNormalizerTest, SharedPrefixMatchLength) {
+  // Verify that FindNormalizedMatchEndPosition will properly find the length of
+  // the prefix match when given a non-normalized term and a normalized term
+  // that share a common prefix.
+  ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+                                                  /*max_term_byte_size=*/1000));
+
+  // Upper to lower
+  std::string term = "MDI";
+  CharacterIterator match_end =
+      normalizer->FindNormalizedMatchEndPosition(term, "mgm");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("M"));
+
+  term = "Icing";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "icky");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Ic"));
+
+  // Full-width
+  term = "５２５６００";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "525788");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("５２５"));
+
+  term = "ＦＵＬＬＷＩＤＴＨ";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "fully");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ＦＵＬＬ"));
+
+  // Hiragana to Katakana
+  term = "あいうえお";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイエオ");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
+
+  term = "かきくけこ";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "カケコ");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
+
+  // Latin accents
+  term = "Zürich";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "zurg");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
+
+  term = "après-midi";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "apreciate");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
+
+  term = "días";
+  match_end = normalizer->FindNormalizedMatchEndPosition(term, "diamond");
+  EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("día"));
+}
+
 }  // namespace
 
 }  // namespace lib
diff --git a/icing/transform/map/normalization-map.cc b/icing/transform/map/normalization-map.cc
index c318036..0994ab8 100644
--- a/icing/transform/map/normalization-map.cc
+++ b/icing/transform/map/normalization-map.cc
@@ -691,19 +691,21 @@ constexpr NormalizationPair kNormalizationMappings[] = {
 
 }  // namespace
 
-const std::unordered_map<char16_t, char16_t>& GetNormalizationMap() {
+const std::unordered_map<char16_t, char16_t> *GetNormalizationMap() {
   // The map is allocated dynamically the first time this function is executed.
-  static const std::unordered_map<char16_t, char16_t> normalization_map = [] {
-    std::unordered_map<char16_t, char16_t> map;
-    // Size of all the mappings is about 2.5 KiB.
-    constexpr int numMappings =
-        sizeof(kNormalizationMappings) / sizeof(NormalizationPair);
-    map.reserve(numMappings);
-    for (size_t i = 0; i < numMappings; ++i) {
-      map.emplace(kNormalizationMappings[i].from, kNormalizationMappings[i].to);
-    }
-    return map;
-  }();
+  static const std::unordered_map<char16_t, char16_t> *const normalization_map =
+      [] {
+        auto *map = new std::unordered_map<char16_t, char16_t>();
+        // Size of all the mappings is about 2.5 KiB.
+        constexpr int numMappings =
+            sizeof(kNormalizationMappings) / sizeof(NormalizationPair);
+        map->reserve(numMappings);
+        for (size_t i = 0; i < numMappings; ++i) {
+          map->emplace(kNormalizationMappings[i].from,
+                       kNormalizationMappings[i].to);
+        }
+        return map;
+      }();
 
   return normalization_map;
 }
diff --git a/icing/transform/map/normalization-map.h b/icing/transform/map/normalization-map.h
index aea85bd..ac7872b 100644
--- a/icing/transform/map/normalization-map.h
+++ b/icing/transform/map/normalization-map.h
@@ -23,7 +23,7 @@ namespace lib {
 // Returns a map containing normalization mappings. A mapping (A -> B) means
 // that we'll transform every character 'A' into 'B'. See normalization-map.cc
 // for mapping details.
-const std::unordered_map<char16_t, char16_t>& GetNormalizationMap();
+const std::unordered_map<char16_t, char16_t>* GetNormalizationMap();
 
 }  // namespace lib
 }  // namespace icing
diff --git a/icing/transform/normalizer-factory.h b/icing/transform/normalizer-factory.h
index 1db9915..f1f3f62 100644
--- a/icing/transform/normalizer-factory.h
+++ b/icing/transform/normalizer-factory.h
@@ -36,9 +36,6 @@ namespace normalizer_factory {
 libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
     int max_term_byte_size);
 
-// Returns the name of the normalizer being used.
-std::string_view GetNormalizerName();
-
 }  // namespace normalizer_factory
 
 }  // namespace lib
diff --git a/icing/transform/normalizer.h b/icing/transform/normalizer.h
index 7305c46..2110f0f 100644
--- a/icing/transform/normalizer.h
+++ b/icing/transform/normalizer.h
@@ -44,17 +44,13 @@ class Normalizer {
   // Returns a CharacterIterator pointing to one past the end of the segment of
   // term that (once normalized) matches with normalized_term.
   //
-  // Ex. CalculateNormalizedMatchLength("YELLOW", "yell") will return
+  // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return
   // CharacterIterator(u8:4, u16:4, u32:4).
   //
-  // Ex. CalculateNormalizedMatchLength("YELLOW", "red") will return
+  // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return
   // CharacterIterator(u8:0, u16:0, u32:0).
-  virtual CharacterIterator CalculateNormalizedMatchLength(
-      std::string_view term, std::string_view normalized_term) const {
-    // TODO(b/195720764) Remove this default impl and implement in all
-    // subclasses.
-    return CharacterIterator(term, 0, 0, 0);
-  }
+  virtual CharacterIterator FindNormalizedMatchEndPosition(
+      std::string_view term, std::string_view normalized_term) const = 0;
 };
 
 }  // namespace lib
diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc
index cd0a227..ec327ad 100644
--- a/icing/util/i18n-utils.cc
+++ b/icing/util/i18n-utils.cc
@@ -116,6 +116,8 @@ bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); }
 
 bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; }
 
+bool IsAlphaNumeric(UChar32 c) { return u_isalnum(c); }
+
 int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); }
 
 int GetUtf16Length(UChar32 c) { return U16_LENGTH(c); }
diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h
index 82ae828..491df6b 100644
--- a/icing/util/i18n-utils.h
+++ b/icing/util/i18n-utils.h
@@ -67,6 +67,9 @@ bool IsAscii(char c);
 // Checks if the Unicode char is within ASCII range.
 bool IsAscii(UChar32 c);
 
+// Checks if the Unicode char is alphanumeric. Not limited to the ASCII range.
+bool IsAlphaNumeric(UChar32 c);
+
 // Returns how many code units (char) are used for the UTF-8 encoding of this
 // Unicode character. Returns 0 if not valid.
 int GetUtf8Length(UChar32 c);
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index f0c066f..d57de81 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=390638574)
+set(synced_AOSP_CL_number=395331611)
author	My Name <dsaadati@google.com>	2021-09-09 11:11:03 -0700
committer	Dan Saadati <dsaadati@google.com>	2021-09-09 11:27:04 -0700
commit	34fc8c85b9f690ffd0a095a4bbcac9aaacfa387b (patch)
tree	0b9bbc9ca107551327b396e7c355b65c4baa43ef
parent	14ee9a8eb8f3ed47f68117208626045878c943ac (diff)
parent	39f59853b980d94a55e9b0f76185b0d3fff88455 (diff)
download	icing-34fc8c85b9f690ffd0a095a4bbcac9aaacfa387b.tar.gz