diff options
author | Tim Barron <tjbarron@google.com> | 2020-06-05 13:55:31 -0700 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2020-06-05 14:04:31 -0700 |
commit | a4a63ec8e7e70912ef04019e7dc9f3c3ecf2eabf (patch) | |
tree | 090955adb6f2abfc09f5275d6bab35a2c0d74198 /icing | |
parent | 79321d1f286ac650cc99fcf795a67c5dde8c0597 (diff) | |
download | icing-a4a63ec8e7e70912ef04019e7dc9f3c3ecf2eabf.tar.gz |
Copy over changes made to Google3 codebase in Icing.
Change-Id: Ia36edb0a1b085e249dabfc220a5b72418063604f
Diffstat (limited to 'icing')
102 files changed, 7414 insertions, 1342 deletions
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h index 0b36e18..62943b8 100644 --- a/icing/file/file-backed-proto-log.h +++ b/icing/file/file-backed-proto-log.h @@ -210,13 +210,23 @@ class FileBackedProtoLog { // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const; - // Calculates and returns the disk usage in bytes. + // Calculates and returns the disk usage in bytes. Rounds up to the nearest + // block size. // // Returns: // Disk usage on success // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const; + // Returns the file size of all the elements held in the log. File size is in + // bytes. This excludes the size of any internal metadata of the log, e.g. the + // log's header. + // + // Returns: + // File size on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const; + // An iterator helping to find offsets of all the protos in file. 
// Example usage: // @@ -736,6 +746,17 @@ libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage() } template <typename ProtoT> +libtextclassifier3::StatusOr<int64_t> +FileBackedProtoLog<ProtoT>::GetElementsFileSize() const { + int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str()); + if (total_file_size == Filesystem::kBadFileSize) { + return absl_ports::InternalError( + "Failed to get file size of elments in the proto log"); + } + return total_file_size - sizeof(Header); +} + +template <typename ProtoT> FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem, const std::string& file_path, int64_t initial_offset) diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h index f13b67b..27d03b2 100644 --- a/icing/file/file-backed-vector.h +++ b/icing/file/file-backed-vector.h @@ -194,13 +194,23 @@ class FileBackedVector { // INTERNAL on I/O error libtextclassifier3::Status PersistToDisk(); - // Calculates and returns the disk usage in bytes. + // Calculates and returns the disk usage in bytes. Rounds up to the nearest + // block size. // // Returns: // Disk usage on success // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const; + // Returns the file size of the all the elements held in the vector. File size + // is in bytes. This excludes the size of any internal metadata of the vector, + // e.g. the vector's header. + // + // Returns: + // File size on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const; + // Accessors. 
const T* array() const { return reinterpret_cast<const T*>(mmapped_file_->region()); @@ -705,6 +715,17 @@ libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetDiskUsage() return size; } +template <typename T> +libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetElementsFileSize() + const { + int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str()); + if (total_file_size == Filesystem::kBadFileSize) { + return absl_ports::InternalError( + "Failed to get file size of elements in the file-backed vector"); + } + return total_file_size - sizeof(Header); +} + } // namespace lib } // namespace icing diff --git a/icing/icu-data-file-helper.cc b/icing/helpers/icu/icu-data-file-helper.cc index 9741dbb..5cf6a1d 100644 --- a/icing/icu-data-file-helper.cc +++ b/icing/helpers/icu/icu-data-file-helper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include <sys/mman.h> diff --git a/icing/icu-data-file-helper.h b/icing/helpers/icu/icu-data-file-helper.h index e92491d..90f5bc7 100644 --- a/icing/icu-data-file-helper.h +++ b/icing/helpers/icu/icu-data-file-helper.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef ICING_ICU_DATA_FILE_HELPER -#define ICING_ICU_DATA_FILE_HELPER +#ifndef ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER +#define ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER #include "icing/text_classifier/lib3/utils/base/status.h" @@ -40,4 +40,4 @@ libtextclassifier3::Status SetUpICUDataFile( } // namespace lib } // namespace icing -#endif // ICING_ICU_DATA_FILE_HELPER +#endif // ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc index 6dc535c..01a2922 100644 --- a/icing/icing-search-engine.cc +++ b/icing/icing-search-engine.cc @@ -194,19 +194,22 @@ void TransformStatus(const libtextclassifier3::Status& internal_status, } // namespace -IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options) +IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options, + std::unique_ptr<const JniCache> jni_cache) : IcingSearchEngine(options, std::make_unique<Filesystem>(), - std::make_unique<Clock>()) {} + std::make_unique<Clock>(), std::move(jni_cache)) {} IcingSearchEngine::IcingSearchEngine( IcingSearchEngineOptions options, - std::unique_ptr<const Filesystem> filesystem, std::unique_ptr<Clock> clock) + std::unique_ptr<const Filesystem> filesystem, std::unique_ptr<Clock> clock, + std::unique_ptr<const JniCache> jni_cache) : options_(std::move(options)), filesystem_(std::move(filesystem)), icing_filesystem_(std::make_unique<IcingFilesystem>()), clock_(std::move(clock)), result_state_manager_(performance_configuration_.max_num_hits_per_query, - performance_configuration_.max_num_cache_results) { + performance_configuration_.max_num_cache_results), + jni_cache_(std::move(jni_cache)) { ICING_VLOG(1) << "Creating IcingSearchEngine in dir: " << options_.base_dir(); } @@ -220,23 +223,25 @@ IcingSearchEngine::~IcingSearchEngine() { } InitializeResultProto IcingSearchEngine::Initialize() { + // This method does both read and write so we need a writer lock. 
Using two + // locks (reader and writer) has the chance to be interrupted during + // switching. + absl_ports::unique_lock l(&mutex_); + return InternalInitialize(); +} + +InitializeResultProto IcingSearchEngine::InternalInitialize() { ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: " << options_.base_dir(); InitializeResultProto result_proto; StatusProto* result_status = result_proto.mutable_status(); - if (initialized_) { // Already initialized. result_status->set_code(StatusProto::OK); return result_proto; } - // This method does both read and write so we need a writer lock. Using two - // locks (reader and writer) has the chance to be interrupted during - // switching. - absl_ports::unique_lock l(&mutex_); - // Releases result / query cache if any result_state_manager_.InvalidateAllResultStates(); @@ -269,14 +274,14 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers() { ICING_RETURN_IF_ERROR(InitializeSchemaStore()); ICING_RETURN_IF_ERROR(InitializeDocumentStore()); - TC3_ASSIGN_OR_RETURN( - language_segmenter_, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + // TODO(b/156383798) : Resolve how to specify the locale. 
+ language_segmenter_factory::SegmenterOptions segmenter_options( + ULOC_US, jni_cache_.get()); + TC3_ASSIGN_OR_RETURN(language_segmenter_, language_segmenter_factory::Create( + std::move(segmenter_options))); - TC3_ASSIGN_OR_RETURN( - normalizer_, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - options_.max_token_length())); + TC3_ASSIGN_OR_RETURN(normalizer_, + normalizer_factory::Create(options_.max_token_length())); ICING_RETURN_IF_ERROR(InitializeIndex()); @@ -416,14 +421,19 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( SetSchemaResultProto result_proto; StatusProto* result_status = result_proto.mutable_status(); + absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } + libtextclassifier3::Status status = SchemaUtil::Validate(new_schema); if (!status.ok()) { TransformStatus(status, result_status); return result_proto; } - absl_ports::unique_lock l(&mutex_); - auto lost_previous_schema_or = LostPreviousSchema(); if (!lost_previous_schema_or.ok()) { TransformStatus(lost_previous_schema_or.status(), result_status); @@ -498,6 +508,11 @@ GetSchemaResultProto IcingSearchEngine::GetSchema() { StatusProto* result_status = result_proto.mutable_status(); absl_ports::shared_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } auto schema_or = schema_store_->GetSchema(); if (!schema_or.ok()) { @@ -516,6 +531,11 @@ GetSchemaTypeResultProto IcingSearchEngine::GetSchemaType( StatusProto* result_status = result_proto.mutable_status(); absl_ports::shared_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been 
initialized!"); + return result_proto; + } auto type_config_or = schema_store_->GetSchemaTypeConfig(schema_type); if (!type_config_or.ok()) { @@ -542,6 +562,11 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) { // the schema file to validate, and the schema could be changed in // SetSchema() which is protected by the same mutex. absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } auto document_id_or = document_store_->Put(document); if (!document_id_or.ok()) { @@ -576,6 +601,11 @@ GetResultProto IcingSearchEngine::Get(const std::string_view name_space, StatusProto* result_status = result_proto.mutable_status(); absl_ports::shared_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } auto document_or = document_store_->Get(name_space, uri); if (!document_or.ok()) { @@ -596,6 +626,11 @@ DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space, StatusProto* result_status = result_proto.mutable_status(); absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. 
@@ -616,14 +651,20 @@ DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace( const std::string_view name_space) { ICING_VLOG(1) << "Deleting namespace from doc store"; + DeleteByNamespaceResultProto delete_result; + StatusProto* result_status = delete_result.mutable_status(); absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return delete_result; + } // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = document_store_->DeleteByNamespace(name_space); - DeleteByNamespaceResultProto delete_result; - TransformStatus(status, delete_result.mutable_status()); + TransformStatus(status, result_status); if (!status.ok()) { ICING_LOG(ERROR) << status.error_message() << "Failed to delete Namespace: " << name_space; @@ -636,14 +677,20 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType( const std::string_view schema_type) { ICING_VLOG(1) << "Deleting type from doc store"; + DeleteBySchemaTypeResultProto delete_result; + StatusProto* result_status = delete_result.mutable_status(); absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return delete_result; + } // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. 
libtextclassifier3::Status status = document_store_->DeleteBySchemaType(schema_type); - DeleteBySchemaTypeResultProto delete_result; - TransformStatus(status, delete_result.mutable_status()); + TransformStatus(status, result_status); if (!status.ok()) { ICING_LOG(ERROR) << status.error_message() << "Failed to delete SchemaType: " << schema_type; @@ -659,6 +706,11 @@ PersistToDiskResultProto IcingSearchEngine::PersistToDisk() { StatusProto* result_status = result_proto.mutable_status(); absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } auto status = InternalPersistToDisk(); TransformStatus(status, result_status); @@ -678,6 +730,11 @@ OptimizeResultProto IcingSearchEngine::Optimize() { StatusProto* result_status = result_proto.mutable_status(); absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } // Releases result / query cache if any result_state_manager_.InvalidateAllResultStates(); @@ -729,6 +786,54 @@ OptimizeResultProto IcingSearchEngine::Optimize() { return result_proto; } +GetOptimizeInfoResultProto IcingSearchEngine::GetOptimizeInfo() { + ICING_VLOG(1) << "Getting optimize info from IcingSearchEngine"; + + GetOptimizeInfoResultProto result_proto; + StatusProto* result_status = result_proto.mutable_status(); + + absl_ports::shared_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } + + // Get stats from DocumentStore + auto doc_store_optimize_info_or = document_store_->GetOptimizeInfo(); + if (!doc_store_optimize_info_or.ok()) { + 
TransformStatus(doc_store_optimize_info_or.status(), result_status); + return result_proto; + } + DocumentStore::OptimizeInfo doc_store_optimize_info = + doc_store_optimize_info_or.ValueOrDie(); + result_proto.set_optimizable_docs(doc_store_optimize_info.optimizable_docs); + + if (doc_store_optimize_info.optimizable_docs == 0) { + // Can return early since there's nothing to calculate on the index side + result_proto.set_estimated_optimizable_bytes(0); + result_status->set_code(StatusProto::OK); + return result_proto; + } + + // Get stats from Index. + auto index_elements_size_or = index_->GetElementsSize(); + if (!index_elements_size_or.ok()) { + TransformStatus(index_elements_size_or.status(), result_status); + return result_proto; + } + int64_t index_elements_size = index_elements_size_or.ValueOrDie(); + + // Sum up the optimizable sizes from DocumentStore and Index + result_proto.set_estimated_optimizable_bytes( + index_elements_size * doc_store_optimize_info.optimizable_docs / + doc_store_optimize_info.total_docs + + doc_store_optimize_info.estimated_optimizable_bytes); + + result_status->set_code(StatusProto::OK); + return result_proto; +} + libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk() { ICING_RETURN_IF_ERROR(schema_store_->PersistToDisk()); ICING_RETURN_IF_ERROR(document_store_->PersistToDisk()); @@ -808,6 +913,13 @@ SearchResultProto IcingSearchEngine::Search( const ResultSpecProto& result_spec) { SearchResultProto result_proto; StatusProto* result_status = result_proto.mutable_status(); + // TODO(b/146008613) Explore ideas to make this function read-only. 
+ absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } libtextclassifier3::Status status = ValidateResultSpec(result_spec); if (!status.ok()) { @@ -820,9 +932,6 @@ SearchResultProto IcingSearchEngine::Search( return result_proto; } - // TODO(b/146008613) Explore ideas to make this function read-only. - absl_ports::unique_lock l(&mutex_); - // Gets unordered results from query processor auto query_processor_or = QueryProcessor::Create( index_.get(), language_segmenter_.get(), normalizer_.get(), @@ -917,6 +1026,11 @@ SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) { // ResultStateManager has its own writer lock, so here we only need a reader // lock for other components. absl_ports::shared_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } libtextclassifier3::StatusOr<PageResultState> page_result_state_or = result_state_manager_.GetNextPage(next_page_token); @@ -969,6 +1083,11 @@ SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) { } void IcingSearchEngine::InvalidateNextPageToken(uint64_t next_page_token) { + absl_ports::shared_lock l(&mutex_); + if (!initialized_) { + ICING_LOG(ERROR) << "IcingSearchEngine has not been initialized!"; + return; + } result_state_manager_.InvalidateResultState(next_page_token); } @@ -1138,8 +1257,9 @@ ResetResultProto IcingSearchEngine::Reset() { return result_proto; } + absl_ports::unique_lock l(&mutex_); initialized_ = false; - if (Initialize().status().code() != StatusProto::OK) { + if (InternalInitialize().status().code() != StatusProto::OK) { // We shouldn't hit the following Initialize errors: // NOT_FOUND: all data was cleared, we aren't expecting anything // 
DATA_LOSS: all data was cleared, we aren't expecting anything diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h index 196f243..746b5b4 100644 --- a/icing/icing-search-engine.h +++ b/icing/icing-search-engine.h @@ -20,6 +20,7 @@ #include <string> #include <string_view> +#include "icing/jni/jni-cache.h" #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/mutex.h" @@ -60,7 +61,12 @@ class IcingSearchEngine { uint32_t checksum; }; - explicit IcingSearchEngine(const IcingSearchEngineOptions& options); + // Note: It is only required to provide a pointer to a valid instance of + // JniCache if this instance needs to perform reverse-jni calls. Users on + // Linux and iOS should always provide a nullptr. + explicit IcingSearchEngine( + const IcingSearchEngineOptions& options, + std::unique_ptr<const JniCache> jni_cache = nullptr); // Calculates integrity checks and persists files to disk. ~IcingSearchEngine(); @@ -117,14 +123,17 @@ class IcingSearchEngine { // So, callers should only have to call this if the schema changed. // However, calling it multiple times with the same schema is a no-op. // - // On any error, Icing will keep using the older schema. + // On some errors, Icing will keep using the older schema, but on + // INTERNAL_ERROR, it is undefined to continue using Icing. // // Returns: // OK on success // INVALID_ARGUMENT if 'new_schema' is invalid - // FAILED_PRECONDITION if 'new_schema' is incompatible + // FAILED_PRECONDITION if 'new_schema' is incompatible, or IcingSearchEngine + // has not been initialized yet. // INTERNAL_ERROR if Icing failed to store the new schema or upgrade - // existing data based on the new schema. + // existing data based on the new schema. Using Icing beyond this error is + // undefined and may cause crashes. 
// // TODO(cassiewang) Figure out, document (and maybe even enforce) the best // way ordering of calls between Initialize() and SetSchema(), both when @@ -149,6 +158,7 @@ class IcingSearchEngine { // Returns: // SchemaProto on success // NOT_FOUND if a schema has not been set yet + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet. // INTERNAL_ERROR on IO error GetSchemaResultProto GetSchema() ICING_LOCKS_EXCLUDED(mutex_); @@ -156,7 +166,8 @@ class IcingSearchEngine { // // Returns: // SchemaTypeConfigProto on success - // FAILED_PRECONDITION if a schema has not been set yet + // FAILED_PRECONDITION if a schema has not been set yet, IcingSearchEngine + // has not been initialized yet. // NOT_FOUND if there is no SchemaTypeConfig of schema_type in the // SchemaProto // INTERNAL_ERROR on IO error @@ -169,7 +180,8 @@ class IcingSearchEngine { // // Returns: // OK on success - // FAILED_PRECONDITION if a schema has not been set yet + // FAILED_PRECONDITION if a schema has not been set yet, IcingSearchEngine + // has not been initialized yet. 
// NOT_FOUND if there is no SchemaTypeConfig in the SchemaProto that matches // the document's schema // INTERNAL_ERROR on IO error @@ -189,6 +201,7 @@ class IcingSearchEngine { // Returns: // The document found on success // NOT_FOUND if the key doesn't exist or doc has been deleted + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL_ERROR on IO error GetResultProto Get(std::string_view name_space, std::string_view uri); @@ -202,6 +215,7 @@ class IcingSearchEngine { // Returns: // OK on success // NOT_FOUND if no document exists with namespace, uri + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL_ERROR on IO error DeleteResultProto Delete(std::string_view name_space, std::string_view uri) ICING_LOCKS_EXCLUDED(mutex_); @@ -216,6 +230,7 @@ class IcingSearchEngine { // Returns: // OK on success // NOT_FOUND if namespace doesn't exist + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL_ERROR on IO error DeleteByNamespaceResultProto DeleteByNamespace(std::string_view name_space) ICING_LOCKS_EXCLUDED(mutex_); @@ -230,6 +245,7 @@ class IcingSearchEngine { // Returns: // OK on success // NOT_FOUND if schema type doesn't exist + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL_ERROR on IO error DeleteBySchemaTypeResultProto DeleteBySchemaType(std::string_view schema_type) ICING_LOCKS_EXCLUDED(mutex_); @@ -246,6 +262,7 @@ class IcingSearchEngine { // OK with results on success // INVALID_ARGUMENT if any of specs is invalid // ABORTED if failed to perform search but existing data is not affected + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL_ERROR on any other errors SearchResultProto Search(const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec, @@ -258,6 +275,7 @@ class IcingSearchEngine { // Returns a SearchResultProto with status: // OK with results on success // ABORTED if failed to get 
results but existing data is not affected + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL_ERROR on any other errors SearchResultProto GetNextPage(uint64_t next_page_token) ICING_LOCKS_EXCLUDED(mutex_); @@ -276,6 +294,7 @@ class IcingSearchEngine { // // Returns: // OK on success + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL on I/O error PersistToDiskResultProto PersistToDisk() ICING_LOCKS_EXCLUDED(mutex_); @@ -284,25 +303,35 @@ class IcingSearchEngine { // resource-efficient. This method purely optimizes the internal files and // has no functional impact on what gets accepted/returned. // - // NOTE: This method should be called about once every 24 hours when the - // device is idle and charging. It can also be called when the system needs - // to free up extra disk-space. - // // WARNING: This method is CPU and IO intensive and depending on the // contents stored, it can take from a few seconds to a few minutes. // This call also blocks all read/write operations on Icing. // + // SUGGESTION: Assuming the client has no restrictions on their side, it's + // recommended to call this method about once every 24 hours when the + // device is idle and charging. It can also be called when the system needs + // to free up extra disk-space. + // // Returns: // OK on success // ABORTED_ERROR if optimization is aborted due to non-fatal errors before // actual modifications are made. // DATA_LOSS_ERROR on errors that could potentially cause data loss, // IcingSearchEngine is still functioning. - // INTERNAL_ERROR on any IO errors or other unrecoverable errors. Icing - // could be in an inconsistent state and might not be usable. + // INTERNAL_ERROR on any IO errors or other unrecoverable errors. Continued + // use of Icing is undefined. // Clients could clear and reinitialize IcingSearchEngine. 
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet OptimizeResultProto Optimize() ICING_LOCKS_EXCLUDED(mutex_); + // Returns potential size and document savings if Optimize were called. + // + // Returns: + // OK on success + // FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet + // INTERNAL_ERROR on IO error + GetOptimizeInfoResultProto GetOptimizeInfo() ICING_LOCKS_EXCLUDED(mutex_); + // Clears all data from Icing and re-initializes. Clients DO NOT need to call // Initialize again. // @@ -319,13 +348,14 @@ class IcingSearchEngine { protected: IcingSearchEngine(IcingSearchEngineOptions options, std::unique_ptr<const Filesystem> filesystem, - std::unique_ptr<Clock> clock); + std::unique_ptr<Clock> clock, + std::unique_ptr<const JniCache> jni_cache = nullptr); private: const IcingSearchEngineOptions options_; const std::unique_ptr<const Filesystem> filesystem_; const std::unique_ptr<const IcingFilesystem> icing_filesystem_; - bool initialized_ = false; + bool initialized_ ICING_GUARDED_BY(mutex_) = false; // Abstraction for accessing time values. std::unique_ptr<Clock> clock_; @@ -355,6 +385,9 @@ class IcingSearchEngine { // Storage for all hits of content from the document store. std::unique_ptr<Index> index_ ICING_GUARDED_BY(mutex_); + // Pointer to JNI class references + const std::unique_ptr<const JniCache> jni_cache_; + // Helper method to do the actual work to persist data to disk. We need this // separate method so that other public methods don't need to call // PersistToDisk(). Public methods calling each other may cause deadlock @@ -362,6 +395,12 @@ class IcingSearchEngine { libtextclassifier3::Status InternalPersistToDisk() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Helper method to the actual work to Initialize. We need this separate + // method so that other public methods don't need to call Initialize(). Public + // methods calling each other may cause deadlock issues. 
+ InitializeResultProto InternalInitialize() + ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Helper method to initialize member variables. // // Returns: diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc index 632fd01..d31f836 100644 --- a/icing/icing-search-engine_fuzz_test.cc +++ b/icing/icing-search-engine_fuzz_test.cc @@ -18,8 +18,8 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/document-builder.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/icing-search-engine.h" -#include "icing/icu-data-file-helper.h" #include "icing/proto/document.pb.h" #include "icing/proto/initialize.pb.h" #include "icing/proto/scoring.pb.h" diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc index 17795a3..baa469e 100644 --- a/icing/icing-search-engine_test.cc +++ b/icing/icing-search-engine_test.cc @@ -26,7 +26,7 @@ #include "icing/document-builder.h" #include "icing/file/filesystem.h" #include "icing/file/mock-filesystem.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/proto/document.pb.h" #include "icing/proto/initialize.pb.h" @@ -1367,6 +1367,72 @@ TEST_F(IcingSearchEngineTest, OptimizationShouldDeleteTemporaryDirectory) { EXPECT_FALSE(filesystem()->FileExists(tmp_file.c_str())); } +TEST_F(IcingSearchEngineTest, GetOptimizeInfoHasCorrectStats) { + DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .SetCreationTimestampMs(100) + .SetTtlMs(500) + .Build(); + + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetSystemTimeMilliseconds(1000); + + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + 
std::make_unique<Filesystem>(), + std::move(fake_clock)); + ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); + + // Just initialized, nothing is optimizable yet. + GetOptimizeInfoResultProto optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK)); + EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0)); + + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(), + Eq(StatusProto::OK)); + ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK)); + + // Only have active documents, nothing is optimizable yet. + optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK)); + EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0)); + + // Deletes document1 + ASSERT_THAT(icing.Delete("namespace", "uri1").status().code(), + Eq(StatusProto::OK)); + + optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK)); + EXPECT_THAT(optimize_info.optimizable_docs(), Eq(1)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Gt(0)); + int64_t first_estimated_optimizable_bytes = + optimize_info.estimated_optimizable_bytes(); + + // Add a second document, but it'll be expired since the time (1000) is + // greater than the document's creation timestamp (100) + the document's ttl + // (500) + ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK)); + + optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK)); + EXPECT_THAT(optimize_info.optimizable_docs(), Eq(2)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), + Gt(first_estimated_optimizable_bytes)); + + // Optimize + ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::OK)); + + // Nothing is optimizable now that everything has 
been optimized away. + optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK)); + EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0)); +} + TEST_F(IcingSearchEngineTest, GetAndPutShouldWorkAfterOptimization) { DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); DocumentProto document2 = CreateMessageDocument("namespace", "uri2"); @@ -1861,7 +1927,7 @@ TEST_F(IcingSearchEngineTest, SearchIncludesDocumentsBeforeTtl) { document; // Time just has to be less than the document's creation timestamp (100) + the - // schema's ttl (500) + // document's ttl (500) auto fake_clock = std::make_unique<FakeClock>(); fake_clock->SetSystemTimeMilliseconds(400); @@ -1908,7 +1974,7 @@ TEST_F(IcingSearchEngineTest, SearchDoesntIncludeDocumentsPastTtl) { expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); // Time just has to be greater than the document's creation timestamp (100) + - // the schema's ttl (500) + // the document's ttl (500) auto fake_clock = std::make_unique<FakeClock>(); fake_clock->SetSystemTimeMilliseconds(700); @@ -3150,6 +3216,49 @@ TEST_F(IcingSearchEngineTest, SnippetSectionRestrict) { IsEmpty()); } +TEST_F(IcingSearchEngineTest, UninitializedInstanceFailsSafely) { + IcingSearchEngine icing(GetDefaultIcingOptions()); + + SchemaProto email_schema = CreateMessageSchema(); + EXPECT_THAT(icing.SetSchema(email_schema).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(icing.GetSchema().status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT( + icing.GetSchemaType(email_schema.types(0).schema_type()).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + + DocumentProto doc = CreateMessageDocument("namespace", "uri"); + EXPECT_THAT(icing.Put(doc).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(icing.Get(doc.namespace_(), doc.uri()).status().code(), + 
Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(icing.Delete(doc.namespace_(), doc.uri()).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(icing.DeleteByNamespace(doc.namespace_()).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(icing.DeleteBySchemaType(email_schema.types(0).schema_type()) + .status() + .code(), + Eq(StatusProto::FAILED_PRECONDITION)); + + SearchSpecProto search_spec = SearchSpecProto::default_instance(); + ScoringSpecProto scoring_spec = ScoringSpecProto::default_instance(); + ResultSpecProto result_spec = ResultSpecProto::default_instance(); + EXPECT_THAT( + icing.Search(search_spec, scoring_spec, result_spec).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + constexpr int kSomePageToken = 12; + EXPECT_THAT(icing.GetNextPage(kSomePageToken).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + icing.InvalidateNextPageToken(kSomePageToken); // Verify this doesn't crash. + + EXPECT_THAT(icing.PersistToDisk().status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(icing.Optimize().status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); +} + } // namespace } // namespace lib } // namespace icing diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc index 835478d..00d116f 100644 --- a/icing/index/index-processor_benchmark.cc +++ b/icing/index/index-processor_benchmark.cc @@ -16,7 +16,7 @@ #include "gmock/gmock.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/index-processor.h" #include "icing/index/index.h" #include "icing/legacy/core/icing-string-util.h" @@ -140,7 +140,7 @@ std::unique_ptr<Index> CreateIndex(const IcingFilesystem& filesystem, std::unique_ptr<Normalizer> CreateNormalizer() { return normalizer_factory::Create( - normalizer_factory::NormalizerType::ICU4C, + 
/*max_term_byte_size=*/std::numeric_limits<int>::max()) .ValueOrDie(); } @@ -193,8 +193,7 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(); std::unique_ptr<IndexProcessor> index_processor = @@ -241,8 +240,7 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(); std::unique_ptr<IndexProcessor> index_processor = @@ -290,8 +288,7 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(); std::unique_ptr<IndexProcessor> index_processor = @@ -339,8 +336,7 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + 
language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(); std::unique_ptr<IndexProcessor> index_processor = diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index 126ea29..8dfb9c2 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -27,7 +27,7 @@ #include "icing/absl_ports/str_cat.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index.h" #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -91,14 +91,13 @@ class IndexProcessorTest : public Test { ICING_ASSERT_OK_AND_ASSIGN(index_, Index::Create(options, &icing_filesystem_)); - ICING_ASSERT_OK_AND_ASSIGN( - lang_segmenter_, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(lang_segmenter_, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( normalizer_, normalizer_factory::Create( - normalizer_factory::NormalizerType::ICU4C, + /*max_term_byte_size=*/std::numeric_limits<int32_t>::max())); ICING_ASSERT_OK_AND_ASSIGN( @@ -415,9 +414,8 @@ TEST_F(IndexProcessorTest, TooLongTokens) { IndexProcessor::Options options; options.max_tokens_per_document = 1000; - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Normalizer> normalizer, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( /*max_term_byte_size=*/4)); ICING_ASSERT_OK_AND_ASSIGN( diff --git a/icing/index/index.h b/icing/index/index.h index f287081..f30c8ad 100644 --- a/icing/index/index.h +++ b/icing/index/index.h @@ -113,6 +113,17 @@ class Index { lite_index_->GetDebugInfo(verbosity, out); } + // 
Returns the byte size of the all the elements held in the index. This + // excludes the size of any internal metadata of the index, e.g. the index's + // header. + // + // Returns: + // Byte size on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetElementsSize() const { + return lite_index_->GetElementsSize(); + } + // Create an iterator to iterate through all doc hit infos in the index that // match the term. section_id_mask can be set to ignore hits from sections not // listed in the mask. Eg. section_id_mask = 1U << 3; would only return hits diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc index ff29135..070e82a 100644 --- a/icing/index/index_test.cc +++ b/icing/index/index_test.cc @@ -45,6 +45,7 @@ namespace { using ::testing::ElementsAre; using ::testing::Eq; +using ::testing::Gt; using ::testing::IsEmpty; using ::testing::IsTrue; using ::testing::NiceMock; @@ -621,12 +622,13 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInOneNamespace) { EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, /*num_to_return=*/10), IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1), - EqualsTermMetadata("foo", 1)))); + EqualsTermMetadata("foo", 1)))); // namespace with id 1 has 1 result. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fool", 1)))); + EXPECT_THAT( + index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fool", 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) { @@ -650,7 +652,7 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) { index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2}, /*num_to_return=*/10), IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), - EqualsTermMetadata("fool", 1)))); + EqualsTermMetadata("fool", 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) { @@ -672,9 +674,9 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) { // Should return "fo", "foo" and "fool" across all namespaces. EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{}, /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1), - EqualsTermMetadata("foo", 1), - EqualsTermMetadata("fool", 1)))); + IsOkAndHolds(UnorderedElementsAre( + EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) { @@ -690,10 +692,22 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) { EXPECT_THAT(edit2.AddHit("fool"), IsOk()); // 'foo' has 1 hit, 'fool' has 2 hits. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), - EqualsTermMetadata("fool", 2)))); + EXPECT_THAT( + index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 2)))); +} + +TEST_F(IndexTest, GetElementsSize) { + // Check empty index. + EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(Eq(0))); + + // Add an element. + Index::Editor edit = index_->Edit( + kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(Gt(0))); } } // namespace diff --git a/icing/index/lite-index.cc b/icing/index/lite-index.cc index c9f68b5..489c53d 100644 --- a/icing/index/lite-index.cc +++ b/icing/index/lite-index.cc @@ -391,6 +391,29 @@ void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const { lexicon_.GetDebugInfo(verbosity, out); } +libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const { + int64_t header_and_hit_buffer_file_size = + filesystem_->GetFileSize(hit_buffer_fd_.get()); + + if (header_and_hit_buffer_file_size == Filesystem::kBadFileSize) { + return absl_ports::InternalError( + "Failed to get element size of the LiteIndex's header and hit buffer"); + } + + int64_t lexicon_disk_usage = lexicon_.GetElementsSize(); + if (lexicon_disk_usage == IcingFilesystem::kBadFileSize) { + return absl_ports::InternalError( + "Failed to get element size of LiteIndex's lexicon"); + } + + // On initialization, we grow the file to a padded size first. 
So this size + // won't count towards the size taken up by elements + size_t header_padded_size = IcingMMapper::page_aligned_size(header_size()); + + return header_and_hit_buffer_file_size - header_padded_size + + lexicon_disk_usage; +} + uint32_t LiteIndex::Seek(uint32_t term_id) { // Make searchable by sorting by hit buffer. uint32_t sort_len = header_->cur_size() - header_->searchable_end(); diff --git a/icing/index/lite-index.h b/icing/index/lite-index.h index 6d01f42..b60a947 100644 --- a/icing/index/lite-index.h +++ b/icing/index/lite-index.h @@ -205,6 +205,14 @@ class LiteIndex { // verbosity > 0, more detailed debug information from the lexicon. void GetDebugInfo(int verbosity, std::string* out) const; + // Returns the byte size of all the elements held in the index. This excludes + // the size of any internal metadata of the index, e.g. the index's header. + // + // Returns: + // Byte size on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetElementsSize() const; + private: static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions(); @@ -228,15 +236,29 @@ class LiteIndex { // hit buffer if term_id is not present. uint32_t Seek(uint32_t term_id); + // File descriptor that points to where the header and hit buffer are written + // to. ScopedFd hit_buffer_fd_; + // Mmapped region past the header that stores the hits. IcingArrayStorage hit_buffer_; + + // Crc checksum of the hits, excludes the header. uint32_t hit_buffer_crc_; + + // Trie that maps indexed terms to their term id IcingDynamicTrie lexicon_; + // TODO(b/140437260): Port over to MemoryMappedFile + // Memory mapped region of the underlying file that reflects the header. IcingMMapper header_mmap_; + + // Wrapper around the mmapped header that contains stats on the lite index. std::unique_ptr<IcingLiteIndex_Header> header_; + + // Options used to initialize the LiteIndex. 
const Options options_; + // TODO(b/139087650) Move to icing::Filesystem const IcingFilesystem* const filesystem_; }; diff --git a/icing/icing-search-engine-jni.cc b/icing/jni/icing-search-engine-jni.cc index 109f717..b1b5420 100644 --- a/icing/icing-search-engine-jni.cc +++ b/icing/jni/icing-search-engine-jni.cc @@ -16,6 +16,7 @@ #include <string> +#include "icing/jni/jni-cache.h" #include <google/protobuf/message_lite.h> #include "icing/absl_ports/status_imports.h" #include "icing/icing-search-engine.h" @@ -26,6 +27,7 @@ #include "icing/proto/schema.pb.h" #include "icing/proto/scoring.pb.h" #include "icing/proto/search.pb.h" +#include "icing/util/status-macros.h" namespace { bool ParseProtoFromJniByteArray(JNIEnv* env, jbyteArray bytes, @@ -85,8 +87,12 @@ Java_com_google_android_icing_IcingSearchEngine_nativeCreate( return 0; } + std::unique_ptr<const icing::lib::JniCache> jni_cache; +#ifdef ICING_REVERSE_JNI_SEGMENTATION + ICING_ASSIGN_OR_RETURN(jni_cache, icing::lib::JniCache::Create(env), 0); +#endif // ICING_REVERSE_JNI_SEGMENTATION icing::lib::IcingSearchEngine* icing = - new icing::lib::IcingSearchEngine(options); + new icing::lib::IcingSearchEngine(options, std::move(jni_cache)); return reinterpret_cast<jlong>(icing); } @@ -282,6 +288,18 @@ Java_com_google_android_icing_IcingSearchEngine_nativeOptimize( } JNIEXPORT jbyteArray JNICALL +Java_com_google_android_icing_IcingSearchEngine_nativeGetOptimizeInfo( + JNIEnv* env, jclass clazz, jlong native_pointer) { + icing::lib::IcingSearchEngine* icing = + GetIcingSearchEnginePointer(native_pointer); + + icing::lib::GetOptimizeInfoResultProto get_optimize_info_result_proto = + icing->GetOptimizeInfo(); + + return SerializeProtoToJniByteArray(env, get_optimize_info_result_proto); +} + +JNIEXPORT jbyteArray JNICALL Java_com_google_android_icing_IcingSearchEngine_nativeReset( JNIEnv* env, jclass clazz, jlong native_pointer) { icing::lib::IcingSearchEngine* icing = diff --git a/icing/jni/jni-cache.cc 
b/icing/jni/jni-cache.cc new file mode 100644 index 0000000..a186222 --- /dev/null +++ b/icing/jni/jni-cache.cc @@ -0,0 +1,216 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/jni/jni-cache.h" + +#include "icing/text_classifier/lib3/utils/java/jni-base.h" +#include "icing/text_classifier/lib3/utils/java/jni-helper.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/util/logging.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +JniCache::JniCache(JavaVM* jvm) + : jvm(jvm), + string_class(nullptr, jvm), + string_utf8(nullptr, jvm), + locale_class(nullptr, jvm), + locale_us(nullptr, jvm), + breakiterator_class(nullptr, jvm) {} + +// The macros below are intended to reduce the boilerplate in Create and avoid +// easily introduced copy/paste errors. 
+#define ICING_GET_CLASS_OR_RETURN_NULL(FIELD, NAME) \ + { \ + ICING_ASSIGN_OR_RETURN( \ + libtextclassifier3::ScopedLocalRef<jclass> clazz, \ + libtextclassifier3::JniHelper::FindClass(env, NAME), nullptr); \ + result->FIELD##_class = \ + libtextclassifier3::MakeGlobalRef(clazz.get(), env, jvm); \ + if (result->FIELD##_class == nullptr) { \ + ICING_LOG(ERROR) << "Error finding class: " << NAME; \ + return nullptr; \ + } \ + } + +#define ICING_GET_OPTIONAL_CLASS(FIELD, NAME) \ + { \ + libtextclassifier3::StatusOr<libtextclassifier3::ScopedLocalRef<jclass>> \ + status_or_class = libtextclassifier3::JniHelper::FindClass(env, NAME); \ + if (status_or_class.ok()) { \ + result->FIELD##_class = libtextclassifier3::MakeGlobalRef( \ + std::move(status_or_class).ValueOrDie().get(), env, jvm); \ + } \ + } + +#define ICING_GET_METHOD(CLASS, FIELD, NAME, SIGNATURE) \ + result->CLASS##_##FIELD = \ + env->GetMethodID(result->CLASS##_class.get(), NAME, SIGNATURE); \ + if (!result->CLASS##_##FIELD) { \ + ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \ + << "Error finding method: " << NAME; \ + return absl_ports::AbortedError("Unable to get Java method."); \ + } + +#define ICING_GET_OPTIONAL_STATIC_METHOD(CLASS, FIELD, NAME, SIGNATURE) \ + if (result->CLASS##_class != nullptr) { \ + result->CLASS##_##FIELD = \ + env->GetStaticMethodID(result->CLASS##_class.get(), NAME, SIGNATURE); \ + env->ExceptionClear(); \ + } + +#define ICING_GET_STATIC_METHOD(CLASS, FIELD, NAME, SIGNATURE) \ + result->CLASS##_##FIELD = \ + env->GetStaticMethodID(result->CLASS##_class.get(), NAME, SIGNATURE); \ + if (!result->CLASS##_##FIELD) { \ + ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \ + << "Error finding method: " << NAME; \ + return absl_ports::AbortedError("Unable to get Java static method."); \ + } + +#define ICING_GET_STATIC_OBJECT_FIELD_OR_RETURN_NULL(CLASS, FIELD, NAME, \ + SIGNATURE) \ + { \ + const jfieldID CLASS##_##FIELD##_field = \ + 
env->GetStaticFieldID(result->CLASS##_class.get(), NAME, SIGNATURE); \ + if (!CLASS##_##FIELD##_field) { \ + ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \ + << "Error finding field id: " << NAME; \ + return absl_ports::AbortedError("Unable to get Java field id."); \ + } \ + ICING_ASSIGN_OR_RETURN( \ + libtextclassifier3::ScopedLocalRef<jobject> static_object, \ + libtextclassifier3::JniHelper::GetStaticObjectField( \ + env, result->CLASS##_class.get(), CLASS##_##FIELD##_field), \ + nullptr); \ + result->CLASS##_##FIELD = \ + libtextclassifier3::MakeGlobalRef(static_object.get(), env, jvm); \ + if (result->CLASS##_##FIELD == nullptr) { \ + ICING_LOG(ERROR) << "Error finding field: " << NAME; \ + return nullptr; \ + } \ + } + +#define ICING_GET_STATIC_INT_FIELD(CLASS, FIELD, NAME) \ + const jfieldID CLASS##_##FIELD##_field = \ + env->GetStaticFieldID(result->CLASS##_class.get(), NAME, "I"); \ + << "Error finding field id: " << NAME; \ + if (!CLASS##_##FIELD##_field) { \ + ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \ + << "Error finding field id: " << NAME; \ + return absl_ports::AbortedError( \ + "Unable to get Java static int field id."); \ + } \ + result->CLASS##_##FIELD = env->GetStaticIntField( \ + result->CLASS##_class.get(), CLASS##_##FIELD##_field); \ + if (!result->CLASS##_##FIELD) { \ + ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \ + << "Error finding field: " << NAME; \ + return absl_ports::AbortedError("Unable to get Java static int field."); \ + } + +libtextclassifier3::StatusOr<std::unique_ptr<JniCache>> JniCache::Create( + JNIEnv* env) { + if (env == nullptr) { + return nullptr; + } + JavaVM* jvm = nullptr; + if (JNI_OK != env->GetJavaVM(&jvm) || jvm == nullptr) { + return nullptr; + } + std::unique_ptr<JniCache> result(new JniCache(jvm)); + + // String + ICING_GET_CLASS_OR_RETURN_NULL(string, "java/lang/String"); + ICING_GET_METHOD(string, constructor, "<init>", "([BLjava/lang/String;)V"); + ICING_GET_METHOD(string, code_point_count, 
"codePointCount", "(II)I"); + ICING_GET_METHOD(string, length, "length", "()I"); + ICING_ASSIGN_OR_RETURN( + libtextclassifier3::ScopedLocalRef<jstring> result_string, + libtextclassifier3::JniHelper::NewStringUTF(env, "UTF-8"), nullptr); + result->string_utf8 = + libtextclassifier3::MakeGlobalRef(result_string.get(), env, jvm); + if (result->string_utf8 == nullptr) { + return nullptr; + } + + // Locale + ICING_GET_CLASS_OR_RETURN_NULL(locale, "java/util/Locale"); + ICING_GET_STATIC_OBJECT_FIELD_OR_RETURN_NULL(locale, us, "US", + "Ljava/util/Locale;"); + ICING_GET_METHOD(locale, constructor, "<init>", "(Ljava/lang/String;)V"); + ICING_GET_OPTIONAL_STATIC_METHOD(locale, for_language_tag, "forLanguageTag", + "(Ljava/lang/String;)Ljava/util/Locale;"); + + // BreakIteratorBatcher + ICING_GET_CLASS_OR_RETURN_NULL( + breakiterator, + "com/google/android/libraries/mdi/search/BreakIteratorBatcher"); + ICING_GET_METHOD(breakiterator, constructor, "<init>", + "(Ljava/util/Locale;)V"); + ICING_GET_METHOD(breakiterator, settext, "setText", "(Ljava/lang/String;)V"); + ICING_GET_METHOD(breakiterator, next, "next", "(I)[I"); + ICING_GET_METHOD(breakiterator, first, "first", "()I"); + ICING_GET_METHOD(breakiterator, following, "following", "(I)I"); + ICING_GET_METHOD(breakiterator, preceding, "preceding", "(I)I"); + + return result; +} + +#undef ICING_GET_STATIC_INT_FIELD +#undef ICING_GET_STATIC_OBJECT_FIELD_OR_RETURN_NULL +#undef ICING_GET_STATIC_METHOD +#undef ICING_GET_METHOD +#undef ICING_GET_CLASS_OR_RETURN_NULL +#undef ICING_GET_OPTIONAL_CLASS + +JNIEnv* JniCache::GetEnv() const { + void* env; + if (JNI_OK == jvm->GetEnv(&env, JNI_VERSION_1_4)) { + return reinterpret_cast<JNIEnv*>(env); + } else { + ICING_LOG(ERROR) << "Icing JniCache used on unattached thread"; + return nullptr; + } +} + +bool JniCache::ExceptionCheckAndClear() const { + return libtextclassifier3::JniExceptionCheckAndClear(GetEnv()); +} + 
+libtextclassifier3::StatusOr<libtextclassifier3::ScopedLocalRef<jstring>> +JniCache::ConvertToJavaString(const char* utf8_text, + const int utf8_text_size_bytes) const { + // Create java byte array. + JNIEnv* jenv = GetEnv(); + ICING_ASSIGN_OR_RETURN( + libtextclassifier3::ScopedLocalRef<jbyteArray> text_java_utf8, + libtextclassifier3::JniHelper::NewByteArray(jenv, utf8_text_size_bytes)); + + jenv->SetByteArrayRegion(text_java_utf8.get(), 0, utf8_text_size_bytes, + reinterpret_cast<const jbyte*>(utf8_text)); + + // Create the string with a UTF-8 charset. + ICING_ASSIGN_OR_RETURN(libtextclassifier3::ScopedLocalRef<jstring> result, + libtextclassifier3::JniHelper::NewObject<jstring>( + jenv, string_class.get(), string_constructor, + text_java_utf8.get(), string_utf8.get())); + + return result; +} + +} // namespace lib +} // namespace icing diff --git a/icing/jni/jni-cache.h b/icing/jni/jni-cache.h new file mode 100644 index 0000000..a5f16c7 --- /dev/null +++ b/icing/jni/jni-cache.h @@ -0,0 +1,78 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_JNI_JNI_CACHE_H_ +#define ICING_JNI_JNI_CACHE_H_ + +#include <jni.h> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/text_classifier/lib3/utils/java/jni-base.h" + +namespace icing { +namespace lib { + +// A helper class to cache class and method pointers for calls from JNI to Java. 
+// (for implementations such as Java ICU that need to make calls from C++ to +// Java) +struct JniCache { + static libtextclassifier3::StatusOr<std::unique_ptr<JniCache>> Create( + JNIEnv* env); + + // Returns the correct JNIEnv of the current thread. This allows multiple + // threads, each accessing the same instance of JniCache, to retrieve their + // unique JNIEnv pointers. + JNIEnv* GetEnv() const; + + // Returns true if there are any pending exceptions from the execution of JNI + // calls. Also clears the exception if any existed. + bool ExceptionCheckAndClear() const; + + JavaVM* jvm = nullptr; + + // java.lang.String + libtextclassifier3::ScopedGlobalRef<jclass> string_class; + jmethodID string_constructor = nullptr; + jmethodID string_code_point_count = nullptr; + jmethodID string_length = nullptr; + libtextclassifier3::ScopedGlobalRef<jstring> string_utf8; + + // java.util.Locale + libtextclassifier3::ScopedGlobalRef<jclass> locale_class; + libtextclassifier3::ScopedGlobalRef<jobject> locale_us; + jmethodID locale_constructor = nullptr; + jmethodID locale_for_language_tag = nullptr; + + // BreakIteratorBatcher + libtextclassifier3::ScopedGlobalRef<jclass> breakiterator_class; + jmethodID breakiterator_constructor = nullptr; + jmethodID breakiterator_settext = nullptr; + jmethodID breakiterator_next = nullptr; + jmethodID breakiterator_first = nullptr; + jmethodID breakiterator_following = nullptr; + jmethodID breakiterator_preceding = nullptr; + + // Helper to convert lib3 UnicodeText to Java strings. 
+ libtextclassifier3::StatusOr<libtextclassifier3::ScopedLocalRef<jstring>> + ConvertToJavaString(const char* utf8_text, + const int utf8_text_size_bytes) const; + + private: + explicit JniCache(JavaVM* jvm); +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_JNI_JNI_CACHE_H_ diff --git a/icing/jni/jni.lds b/icing/jni/jni.lds new file mode 100644 index 0000000..401682a --- /dev/null +++ b/icing/jni/jni.lds @@ -0,0 +1,10 @@ +VERS_1.0 { + # Export JNI symbols. + global: + Java_*; + JNI_OnLoad; + + # Hide everything else + local: + *; +}; diff --git a/icing/jni/reverse-jni-break-iterator.cc b/icing/jni/reverse-jni-break-iterator.cc new file mode 100644 index 0000000..2a589c6 --- /dev/null +++ b/icing/jni/reverse-jni-break-iterator.cc @@ -0,0 +1,187 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/jni/reverse-jni-break-iterator.h" + +#include <math.h> + +#include <cassert> +#include <cctype> +#include <map> + +#include "icing/jni/jni-cache.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/text_classifier/lib3/utils/java/jni-base.h" +#include "icing/text_classifier/lib3/utils/java/jni-helper.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/util/status-macros.h" +#include <jni.h> + +namespace icing { +namespace lib { + +namespace { +// Chosen based on results in go/reverse-jni-benchmarks +static constexpr int kBatchSize = 100; +} // namespace + +// ----------------------------------------------------------------------------- +// Implementations that call out to JVM. Behold the beauty. +// ----------------------------------------------------------------------------- +libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>> +ReverseJniBreakIterator::Create(const JniCache* jni_cache, + std::string_view text, + std::string_view locale) { + if (jni_cache == nullptr) { + return absl_ports::InvalidArgumentError( + "Create must be called with a valid JniCache pointer!"); + } + + ICING_ASSIGN_OR_RETURN( + libtextclassifier3::ScopedLocalRef<jstring> java_text, + jni_cache->ConvertToJavaString(text.data(), text.length())); + if (java_text.get() == nullptr) { + return absl_ports::AbortedError("Failed to create Java String from input."); + } + + ICING_ASSIGN_OR_RETURN( + libtextclassifier3::ScopedLocalRef<jstring> java_locale_string, + jni_cache->ConvertToJavaString(locale.data(), locale.length())); + if (java_locale_string.get() == nullptr) { + return absl_ports::AbortedError( + "Failed to create Java String from locale."); + } + + JNIEnv* jenv = jni_cache->GetEnv(); + ICING_ASSIGN_OR_RETURN( + libtextclassifier3::ScopedLocalRef<jobject> java_locale, + libtextclassifier3::JniHelper::NewObject( + jenv, jni_cache->locale_class.get(), jni_cache->locale_constructor, + 
java_locale_string.get())); + if (java_locale.get() == nullptr) { + return absl_ports::AbortedError( + "Failed to create Java Locale from locale."); + } + + ICING_ASSIGN_OR_RETURN( + libtextclassifier3::ScopedLocalRef<jobject> local_iterator_batcher, + libtextclassifier3::JniHelper::NewObject( + jenv, jni_cache->breakiterator_class.get(), + jni_cache->breakiterator_constructor, java_locale.get())); + libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher = + libtextclassifier3::MakeGlobalRef(local_iterator_batcher.get(), jenv, + jni_cache->jvm); + if (iterator_batcher.get() == nullptr) { + return absl_ports::AbortedError( + "Failed to create Java BreakIteratorBatcher."); + } + + ICING_RETURN_IF_ERROR(libtextclassifier3::JniHelper::CallVoidMethod( + jenv, iterator_batcher.get(), jni_cache->breakiterator_settext, + java_text.get())); + return std::unique_ptr<ReverseJniBreakIterator>( + new ReverseJniBreakIterator(jni_cache, std::move(iterator_batcher))); +} + +ReverseJniBreakIterator::ReverseJniBreakIterator( + const JniCache* jni_cache, + libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher) + : jni_cache_(jni_cache), + iterator_batcher_(std::move(iterator_batcher)), + is_done_(false), + is_almost_done_(false) {} + +int ReverseJniBreakIterator::Next() { + if (is_done_) { + return ReverseJniBreakIterator::kDone; + } + if (break_indices_cache_.empty()) { + if (FetchNextBatch() == ReverseJniBreakIterator::kDone) { + // Either there were no more results or an error occurred. Either way, + // mark ourselves as done and return. 
+ is_done_ = true; + return ReverseJniBreakIterator::kDone; + } + is_almost_done_ = break_indices_cache_.size() < kBatchSize; + } + int break_index = break_indices_cache_.front(); + break_indices_cache_.pop(); + is_done_ = is_almost_done_ && break_indices_cache_.empty(); + return break_index; +} + +int ReverseJniBreakIterator::First() { + const int first_index = jni_cache_->GetEnv()->CallIntMethod( + iterator_batcher_.get(), jni_cache_->breakiterator_first); + if (jni_cache_->ExceptionCheckAndClear()) { + return ReverseJniBreakIterator::kDone; + } + ClearCache(); + return first_index; +} + +int ReverseJniBreakIterator::Preceding(int offset) { + const int preceding_index = jni_cache_->GetEnv()->CallIntMethod( + iterator_batcher_.get(), jni_cache_->breakiterator_preceding, offset); + if (jni_cache_->ExceptionCheckAndClear()) { + return ReverseJniBreakIterator::kDone; + } + ClearCache(); + return preceding_index; +} + +int ReverseJniBreakIterator::Following(int offset) { + const int following_index = jni_cache_->GetEnv()->CallIntMethod( + iterator_batcher_.get(), jni_cache_->breakiterator_following, offset); + if (jni_cache_->ExceptionCheckAndClear()) { + return ReverseJniBreakIterator::kDone; + } + ClearCache(); + return following_index; +} + +int ReverseJniBreakIterator::FetchNextBatch() { + ICING_ASSIGN_OR_RETURN( + libtextclassifier3::ScopedLocalRef<jintArray> break_indices, + libtextclassifier3::JniHelper::CallObjectMethod<jintArray>( + jni_cache_->GetEnv(), iterator_batcher_.get(), + jni_cache_->breakiterator_next, kBatchSize), + ReverseJniBreakIterator::kDone); + if (break_indices == nullptr || jni_cache_->ExceptionCheckAndClear()) { + return ReverseJniBreakIterator::kDone; + } + jint num_indices = jni_cache_->GetEnv()->GetArrayLength(break_indices.get()); + if (num_indices == 0) { + return ReverseJniBreakIterator::kDone; + } + jint* break_indices_arr = + static_cast<jint*>(jni_cache_->GetEnv()->GetPrimitiveArrayCritical( + break_indices.get(), nullptr)); + for 
(int i = 0; i < num_indices; ++i) { + break_indices_cache_.push(break_indices_arr[i]); + } + jni_cache_->GetEnv()->ReleasePrimitiveArrayCritical(break_indices.get(), + break_indices_arr, + /*mode=*/0); + return num_indices; +} + +void ReverseJniBreakIterator::ClearCache() { + break_indices_cache_ = std::queue<int>(); + is_done_ = false; + is_almost_done_ = false; +} + +} // namespace lib +} // namespace icing diff --git a/icing/jni/reverse-jni-break-iterator.h b/icing/jni/reverse-jni-break-iterator.h new file mode 100644 index 0000000..c1f05f4 --- /dev/null +++ b/icing/jni/reverse-jni-break-iterator.h @@ -0,0 +1,124 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_ +#define ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_ + +#include <jni.h> + +#include <queue> +#include <string> + +#include "icing/jni/jni-cache.h" +#include "icing/text_classifier/lib3/utils/java/jni-base.h" + +namespace icing { +namespace lib { + +// A class that handles the cross-JNI interactions with BreakIteratorBatcher and +// hides the batching element to provide an interface akin to +// java.text.BreakIterator. 
+// +// Example: +// std::string text = "我每天走路去上班。"; +// ASSERT_THAT(text, SizeIs(27)); +// std::unique_ptr<ReverseJniBreakIterator> itr = +// ReverseJniBreakIterator::Create(jni_cache, text, locale); +// std::vector<int> nexts; +// int next = itr->Next(); +// while (next != ReverseJniBreakIterator::kDone) { +// nexts.push_back(next); +// next = itr->Next(); +// } +// EXPECT_THAT(nexts, ElementsAre(1, 3, 5, 6, 8)); +class ReverseJniBreakIterator { + public: + static constexpr int kDone = -1; + + // Creates a ReverseJniBreakiterator with the given text and locale. + // + // Returns: + // A ReverseJniBreakIterator on success + // INVALID_ARGUMENT if jni_cache isn't a valid JniCache pointer + // INTERNAL if unable to create any of the required Java objects + static libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>> + Create(const JniCache* jni_cache, std::string_view text, + std::string_view locale); + + // Returns the UTF-16 boundary following the current boundary. If the current + // boundary is the last text boundary, it returns + // ReverseJniBreakIterator::kDONE. + // + // NOTE: The 'boundary' refers to the UTF-16 boundary - NOT the UTF-8 + // boundary. Callers interested in the UTF-8 boundary are required to maintain + // whatever state is necessary to translate from UTF-16 to UTF-8 boundaries. + int Next(); + + // Returns the first UTF-16 boundary. The iterator's current position is set + // to the first text boundary and any cached data is cleared. + int First(); + + // Returns the position of the first UTF-16 boundary preceding the UTF-16 + // offset. If there is no boundary preceding the specified offset, then + // ReverseJniBreakIterator::kDone is returned. + // + // The iterator's current position is set to the segment whose boundary was + // returned and any cached data is cleared. + int Preceding(int offset); + + // Returns the position of the first UTF-16 boundary following the UTF-16 + // offset. 
If there is no boundary following the specified offset, then + // ReverseJniBreakIterator::kDone is returned. + // + // The iterator's current position is set to the segment whose boundary + // was returned and any cached data is cleared. + int Following(int offset); + + private: + ReverseJniBreakIterator( + const JniCache* jni_cache, + libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher); + + // Fetches the results of up to kBatchSize next calls and stores them in + // break_indices_cache_. Returns the number of results or kDone if no more + // results could be fetched. + int FetchNextBatch(); + + // Empties the cache and sets is_done_ and is_almost_done_ to false. + void ClearCache(); + + // Keeps track of references to Java classes and methods. Does NOT own. + const JniCache* jni_cache_; + + // The reference to the actual instance of BreakIteratorBatcher that + // this class interacts with. + libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher_; + + // The cache holding the most recent batch of return values from + // BreakIteratorBatcher#next. + std::queue<int> break_indices_cache_; + + bool is_done_; + + // The last batch was incomplete (< kBatchSize results were returned). The + // next call to BreakIteratorBatcher#next is guaranteed to return an + // empty array. Once the results from the last batch are evicted from + // break_indices_cache, ReverseJniBreakIterator will transition to is_done_. + bool is_almost_done_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_ diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc index 960d003..ee3d3a2 100644 --- a/icing/legacy/index/icing-dynamic-trie.cc +++ b/icing/legacy/index/icing-dynamic-trie.cc @@ -11,9 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- -// Copyright 2011 Google Inc. All Rights Reserved. -// Author: ulas@google.com (Ulas Kirazci) // // We store the trie in three areas: nodes, nexts and suffixes. // @@ -84,7 +81,7 @@ #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-flash-bitmap.h" #include "icing/legacy/index/icing-mmapper.h" -#include "icing/util/icu-i18n-utils.h" +#include "icing/util/i18n-utils.h" #include "icing/util/logging.h" #include "icing/util/math-util.h" @@ -250,6 +247,11 @@ class IcingDynamicTrie::IcingDynamicTrieStorage { const IcingFilesystem &filesystem); bool Sync(); uint64_t GetDiskUsage() const; + + // Returns the size of the elements held in the trie. This excludes the size + // of any internal metadata of the trie, e.g. the trie's header. + uint64_t GetElementsFileSize() const; + void Warm(); void Clear(); @@ -696,6 +698,18 @@ uint64_t IcingDynamicTrie::IcingDynamicTrieStorage::GetDiskUsage() const { return total; } +uint64_t IcingDynamicTrie::IcingDynamicTrieStorage::GetElementsFileSize() + const { + // Trie files themselves, exclude size of the header. These arrays are dense, + // not sparse, so use file size for more accurate numbers. + uint64_t total = 0; + for (int i = 0; i < NUM_ARRAY_TYPES; i++) { + IcingFilesystem::IncrementByOrSetInvalid( + filesystem_->GetFileSize(array_fds_[i].get()), &total); + } + return total; +} + IcingDynamicTrie::Node *IcingDynamicTrie::IcingDynamicTrieStorage::AllocNode() { if (nodes_left() == 0) { ICING_LOG(FATAL) << "No allocated nodes left"; @@ -1154,6 +1168,30 @@ uint64_t IcingDynamicTrie::GetDiskUsage() const { return total; } +uint64_t IcingDynamicTrie::GetElementsSize() const { + uint64_t total = 0; + + // Bitmaps are sparsely populated, so disk usage is more accurate for those. + // Property bitmaps. + IcingFilesystem::IncrementByOrSetInvalid(deleted_bitmap_->GetDiskUsage(), + &total); + // The deleted bitmap is always initially grown to kGrowSize, whether there + // are elements or not. 
So even if there are no elements in the trie, we'll + // still have the bitmap of size kGrowSize, so subtract that from the size of + // the trie's elements. + total -= IcingFlashBitmap::kGrowSize; + + for (auto &bitmap : property_bitmaps_) { + if (bitmap == nullptr) continue; + IcingFilesystem::IncrementByOrSetInvalid(bitmap->GetDiskUsage(), &total); + } + + // Storage. We can use file size here since the storage files aren't sparse. + IcingFilesystem::IncrementByOrSetInvalid(storage_->GetElementsFileSize(), + &total); + return total; +} + std::unique_ptr<IcingFlashBitmap> IcingDynamicTrie::OpenAndInitBitmap( const std::string &filename, bool verify, const IcingFilesystem *filesystem) { @@ -1868,7 +1906,7 @@ void IcingDynamicTrie::Utf8Iterator::LeftBranchToUtf8End() { // If we start with non-ascii, take all left branches while there is // a continuation byte. - if (!icu_i18n_utils::IsAscii(cur_[cur_len_ - 1])) { + if (!i18n_utils::IsAscii(cur_[cur_len_ - 1])) { while (!node->is_leaf()) { if (cur_len_ >= U8_MAX_LENGTH) break; @@ -1877,8 +1915,8 @@ void IcingDynamicTrie::Utf8Iterator::LeftBranchToUtf8End() { if (branch_end_->child->val() == 0) { // Check if we already have a valid cur_. 
cur_[cur_len_] = 0; - UChar32 uchar32 = icu_i18n_utils::GetUChar32At(cur_, cur_len_, 0); - if (uchar32 == icu_i18n_utils::kInvalidUChar32 && + UChar32 uchar32 = i18n_utils::GetUChar32At(cur_, cur_len_, 0); + if (uchar32 == i18n_utils::kInvalidUChar32 && node->log2_num_children() > 0) { branch_end_->child++; } else { diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h index 6b39c56..7136ef8 100644 --- a/icing/legacy/index/icing-dynamic-trie.h +++ b/icing/legacy/index/icing-dynamic-trie.h @@ -48,7 +48,8 @@ #include "icing/legacy/index/icing-mmapper.h" #include "icing/legacy/index/icing-storage.h" #include "icing/legacy/index/proto/icing-dynamic-trie-header.pb.h" -#include "icing/util/icu-i18n-utils.h" +#include "icing/util/i18n-utils.h" +#include "unicode/utf8.h" namespace icing { namespace lib { @@ -265,6 +266,10 @@ class IcingDynamicTrie : public IIcingStorage { bool Remove() override; uint64_t GetDiskUsage() const override; + // Returns the size of the elements held in the trie. This excludes the size + // of any internal metadata of the trie, e.g. the trie's header. + uint64_t GetElementsSize() const; + // REQUIRED: For all functions below is_initialized() == true. // Number of keys in trie. diff --git a/icing/legacy/index/icing-flash-bitmap.h b/icing/legacy/index/icing-flash-bitmap.h index 9abd369..3b3521a 100644 --- a/icing/legacy/index/icing-flash-bitmap.h +++ b/icing/legacy/index/icing-flash-bitmap.h @@ -11,9 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - -// Copyright 2012 Google Inc. All Rights Reserved. -// Author: ulas@google.com (Ulas Kirazci) // // A disk-backed bitmap. 
// diff --git a/icing/proto/document.proto b/icing/proto/document.proto index 0a8b6f8..bed33b0 100644 --- a/icing/proto/document.proto +++ b/icing/proto/document.proto @@ -20,6 +20,7 @@ import "icing/proto/status.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; // Defines a unit of data understood by the IcingSearchEngine. // Next tag: 9 @@ -108,6 +109,7 @@ message PutResultProto { message GetResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // NOT_FOUND // INTERNAL // @@ -127,6 +129,7 @@ message GetResultProto { message DeleteResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // NOT_FOUND // INTERNAL // @@ -142,6 +145,7 @@ message DeleteResultProto { message DeleteByNamespaceResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // NOT_FOUND // INTERNAL // @@ -157,6 +161,7 @@ message DeleteByNamespaceResultProto { message DeleteBySchemaTypeResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // NOT_FOUND // INTERNAL // diff --git a/icing/proto/document_wrapper.proto b/icing/proto/document_wrapper.proto index 0666e72..e8eb992 100644 --- a/icing/proto/document_wrapper.proto +++ b/icing/proto/document_wrapper.proto @@ -21,6 +21,8 @@ import "icing/proto/document.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + // DocumentWrapper as a wrapper of the user-facing DocumentProto is meant to // be used by icing team internally. 
It stores the original document provided // by library users and metadata of the document which shouldn't be exposed to diff --git a/icing/proto/initialize.proto b/icing/proto/initialize.proto index 813cdb5..eac88e6 100644 --- a/icing/proto/initialize.proto +++ b/icing/proto/initialize.proto @@ -21,6 +21,8 @@ import "icing/proto/status.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + // Next tag: 5 message IcingSearchEngineOptions { // Directory to persist files for Icing. Required. diff --git a/icing/proto/optimize.proto b/icing/proto/optimize.proto index 2bf28e8..1baa64c 100644 --- a/icing/proto/optimize.proto +++ b/icing/proto/optimize.proto @@ -20,12 +20,14 @@ import "icing/proto/status.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; // Result of a call to IcingSearchEngine.Optimize // Next tag: 2 message OptimizeResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // WARNING_DATA_LOSS // ABORTED // INTERNAL @@ -36,3 +38,23 @@ message OptimizeResultProto { // TODO(b/147699081): Add a field to indicate lost_schema and lost_documents. // go/icing-library-apis. } + +// Result of a call to IcingSearchEngine.GetOptimizeInfo +// Next tag: 4 +message GetOptimizeInfoResultProto { + // Status code can be one of: + // OK + // FAILED_PRECONDITION + // INTERNAL + // + // See status.proto for more details. + optional StatusProto status = 1; + + // Documents that have expired or been deleted, but are still taking up space + // in IcingSearchEngine. + optional int64 optimizable_docs = 2; + + // Estimated bytes that could be recovered. The exact size per document isn't + // tracked, so this is based off an average document size. 
+ optional int64 estimated_optimizable_bytes = 3; +} diff --git a/icing/proto/persist.proto b/icing/proto/persist.proto index 5b5a737..77cf987 100644 --- a/icing/proto/persist.proto +++ b/icing/proto/persist.proto @@ -20,12 +20,14 @@ import "icing/proto/status.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; // Result of a call to IcingSearchEngine.Persist // Next tag: 2 message PersistToDiskResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // INTERNAL // // See status.proto for more details. diff --git a/icing/proto/reset.proto b/icing/proto/reset.proto index 9a7fa9a..5e8b9f5 100644 --- a/icing/proto/reset.proto +++ b/icing/proto/reset.proto @@ -21,6 +21,8 @@ import "icing/proto/status.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + // Result of a call to IcingSearchEngine.Reset // Next tag: 2 message ResetResultProto { diff --git a/icing/proto/schema.proto b/icing/proto/schema.proto index cabccaa..3a7ee5d 100644 --- a/icing/proto/schema.proto +++ b/icing/proto/schema.proto @@ -21,6 +21,7 @@ import "icing/proto/term.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; // Defines the schema that every Document of a specific "type" should adhere // to. 
These can be considered as definitions of rich structured types for @@ -204,6 +205,7 @@ message SetSchemaResultProto { message GetSchemaResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // NOT_FOUND // INTERNAL // diff --git a/icing/proto/scoring.proto b/icing/proto/scoring.proto index ad536b4..667ff4f 100644 --- a/icing/proto/scoring.proto +++ b/icing/proto/scoring.proto @@ -19,6 +19,8 @@ package icing.lib; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + // Encapsulates the configurations on how Icing should score and rank the search // results. // Next tag: 3 @@ -26,9 +28,8 @@ message ScoringSpecProto { // OPTIONAL: Indicates how the search results will be ranked. message RankingStrategy { enum Code { - // No ranking strategy specified, documents will be returned in the - // default order that the most recent document inserted into Icing comes - // first. + // No ranking strategy specified, documents may be returned in an + // arbitrary order. NONE = 0; // Ranked by user-provided document scores. diff --git a/icing/proto/search.proto b/icing/proto/search.proto index 085575a..8ea5036 100644 --- a/icing/proto/search.proto +++ b/icing/proto/search.proto @@ -22,6 +22,7 @@ import "icing/proto/term.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; // Client-supplied specifications on what documents to retrieve. 
// Next tag: 5 @@ -148,6 +149,7 @@ message SnippetProto { message SearchResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // INVALID_ARGUMENT // ABORTED // INTERNAL diff --git a/icing/proto/status.proto b/icing/proto/status.proto index 418b2e8..2733a15 100644 --- a/icing/proto/status.proto +++ b/icing/proto/status.proto @@ -19,6 +19,8 @@ package icing.lib; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + // Canonical status to indicate the results of API calls. // Next tag: 3 message StatusProto { diff --git a/icing/proto/term.proto b/icing/proto/term.proto index 30cd1bc..adf2ad6 100644 --- a/icing/proto/term.proto +++ b/icing/proto/term.proto @@ -19,6 +19,8 @@ package icing.lib; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + // Encapsulates the configurations on how Icing should query/index these terms. 
// Next tag: 0 message TermMatchType { diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc index 5775e83..000bf3a 100644 --- a/icing/query/query-processor_benchmark.cc +++ b/icing/query/query-processor_benchmark.cc @@ -16,7 +16,7 @@ #include "gmock/gmock.h" #include "third_party/absl/flags/flag.h" #include "icing/document-builder.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/index.h" #include "icing/proto/term.pb.h" #include "icing/query/query-processor.h" @@ -80,7 +80,7 @@ std::unique_ptr<Index> CreateIndex(const IcingFilesystem& filesystem, std::unique_ptr<Normalizer> CreateNormalizer() { return normalizer_factory::Create( - normalizer_factory::NormalizerType::ICU4C, + /*max_term_byte_size=*/std::numeric_limits<int>::max()) .ValueOrDie(); } @@ -108,8 +108,7 @@ void BM_QueryOneTerm(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); FakeClock fake_clock; @@ -221,8 +220,7 @@ void BM_QueryFiveTerms(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); FakeClock fake_clock; @@ -352,8 +350,7 @@ void BM_QueryDiacriticTerm(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - 
.ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); FakeClock fake_clock; @@ -468,8 +465,7 @@ void BM_QueryHiragana(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); FakeClock fake_clock; diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc index 99a552e..7dfc326 100644 --- a/icing/query/query-processor_test.cc +++ b/icing/query/query-processor_test.cc @@ -22,7 +22,7 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index.h" #include "icing/index/iterator/doc-hit-info-iterator-test-util.h" @@ -102,14 +102,11 @@ class QueryProcessorTest : public Test { ICING_ASSERT_OK_AND_ASSIGN(index_, Index::Create(options, &icing_filesystem_)); - ICING_ASSERT_OK_AND_ASSIGN( - language_segmenter_, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_, + language_segmenter_factory::Create()); - ICING_ASSERT_OK_AND_ASSIGN( - normalizer_, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/1000)); + ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); } libtextclassifier3::Status AddTokenToIndex( diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc index cfce6e2..36dbfd9 100644 --- a/icing/result/result-retriever_test.cc +++ b/icing/result/result-retriever_test.cc @@ -20,7 
+20,7 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" @@ -59,16 +59,13 @@ class ResultRetrieverTest : public testing::Test { // File generated via icu_data_file rule in //icing/BUILD. icu_data_file_helper::SetUpICUDataFile( GetTestFilePath("icing/icu.dat"))); - ICING_ASSERT_OK_AND_ASSIGN( - language_segmenter_, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(schema_store_, SchemaStore::Create(&filesystem_, test_dir_)); - ICING_ASSERT_OK_AND_ASSIGN( - normalizer_, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/10000)); + ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create( + /*max_term_byte_size=*/10000)); SchemaProto schema; auto type_config = schema.add_types(); diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc index faf9e18..09d0f7a 100644 --- a/icing/result/snippet-retriever.cc +++ b/icing/result/snippet-retriever.cc @@ -35,7 +35,7 @@ #include "icing/tokenization/tokenizer-factory.h" #include "icing/tokenization/tokenizer.h" #include "icing/transform/normalizer.h" -#include "icing/util/icu-i18n-utils.h" +#include "icing/util/i18n-utils.h" #include "icing/util/status-macros.h" namespace icing { @@ -126,19 +126,18 @@ libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> CreateTokenMatcher( // Returns true if token matches any of the terms in query terms according to // the provided match type. 
- +// // Returns: // the position of the window start if successful // INTERNAL_ERROR - if a tokenizer error is encountered libtextclassifier3::StatusOr<int> DetermineWindowStart( const ResultSpecProto::SnippetSpecProto& snippet_spec, std::string_view value, int match_mid, Tokenizer::Iterator* iterator) { - int window_start_min = - std::max((match_mid - snippet_spec.max_window_bytes() / 2), 0); - if (window_start_min == 0) { + int window_start_min = (match_mid - snippet_spec.max_window_bytes() / 2) - 1; + if (window_start_min < 0) { return 0; } - if (!iterator->ResetToTokenAfter(window_start_min - 1)) { + if (!iterator->ResetToTokenAfter(window_start_min)) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); } @@ -152,8 +151,7 @@ int IncludeTrailingPunctuation(std::string_view value, int window_end_exclusive, int window_end_max_exclusive) { while (window_end_exclusive < window_end_max_exclusive) { int char_len = 0; - if (!icu_i18n_utils::IsPunctuationAt(value, window_end_exclusive, - &char_len)) { + if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive, &char_len)) { break; } if (window_end_exclusive + char_len > window_end_max_exclusive) { @@ -174,10 +172,9 @@ libtextclassifier3::StatusOr<int> DetermineWindowEnd( const ResultSpecProto::SnippetSpecProto& snippet_spec, std::string_view value, int match_mid, Tokenizer::Iterator* iterator) { int window_end_max_exclusive = - std::min((match_mid + snippet_spec.max_window_bytes() / 2), - static_cast<int>(value.length())); - if (window_end_max_exclusive == value.length()) { - return window_end_max_exclusive; + match_mid + snippet_spec.max_window_bytes() / 2; + if (window_end_max_exclusive >= value.length()) { + return value.length(); } if (!iterator->ResetToTokenBefore(window_end_max_exclusive)) { return absl_ports::InternalError( @@ -228,8 +225,11 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch( iterator)); 
snippet_match.set_window_bytes(window_end_exclusive - window_start); - // Reset the iterator back to the original position. - if (!iterator->ResetToTokenAfter(match_pos - 1)) { + // DetermineWindowStart/End may change the position of the iterator. So, + // reset the iterator back to the original position. + bool success = (match_pos > 0) ? iterator->ResetToTokenAfter(match_pos - 1) + : iterator->ResetToStart(); + if (!success) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); } diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc index 7037ede..3b3bf61 100644 --- a/icing/result/snippet-retriever_test.cc +++ b/icing/result/snippet-retriever_test.cc @@ -22,7 +22,7 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" @@ -60,9 +60,8 @@ class SnippetRetrieverTest : public testing::Test { // File generated via icu_data_file rule in //icing/BUILD. 
icu_data_file_helper::SetUpICUDataFile( GetTestFilePath("icing/icu.dat"))); - ICING_ASSERT_OK_AND_ASSIGN( - language_segmenter_, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_, + language_segmenter_factory::Create()); // Setup the schema ICING_ASSERT_OK_AND_ASSIGN(schema_store_, @@ -88,10 +87,8 @@ class SnippetRetrieverTest : public testing::Test { IndexingConfig::TokenizerType::PLAIN); ICING_ASSERT_OK(schema_store_->SetSchema(schema)); - ICING_ASSERT_OK_AND_ASSIGN( - normalizer_, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/10000)); + ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create( + /*max_term_byte_size=*/10000)); ICING_ASSERT_OK_AND_ASSIGN( snippet_retriever_, SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc index df5a820..7413d73 100644 --- a/icing/schema/schema-util.cc +++ b/icing/schema/schema-util.cc @@ -256,23 +256,29 @@ void SchemaUtil::BuildTypeConfigMap( } } -void SchemaUtil::BuildPropertyConfigMap( - const SchemaTypeConfigProto& type_config, - std::unordered_map<std::string_view, const PropertyConfigProto*>* - property_config_map, - int32_t* num_required_properties) { +SchemaUtil::ParsedPropertyConfigs SchemaUtil::ParsePropertyConfigs( + const SchemaTypeConfigProto& type_config) { + ParsedPropertyConfigs parsed_property_configs; + // TODO(samzheng): consider caching property_config_map for some properties, // e.g. using LRU cache. Or changing schema.proto to use go/protomap. 
- *num_required_properties = 0; - property_config_map->clear(); for (const PropertyConfigProto& property_config : type_config.properties()) { - property_config_map->emplace(property_config.property_name(), - &property_config); + parsed_property_configs.property_config_map.emplace( + property_config.property_name(), &property_config); if (property_config.cardinality() == PropertyConfigProto::Cardinality::REQUIRED) { - (*num_required_properties)++; + parsed_property_configs.num_required_properties++; + } + + // A non-default term_match_type indicates that this property is meant to be + // indexed. + if (property_config.indexing_config().term_match_type() != + TermMatchType::UNKNOWN) { + parsed_property_configs.num_indexed_properties++; } } + + return parsed_property_configs; } const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( @@ -298,22 +304,21 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( continue; } - std::unordered_map<std::string_view, const PropertyConfigProto*> - new_property_map; - int32_t new_required_properties = 0; - BuildPropertyConfigMap(new_schema_type_and_config->second, - &new_property_map, &new_required_properties); + ParsedPropertyConfigs new_parsed_property_configs = + ParsePropertyConfigs(new_schema_type_and_config->second); // We only need to check the old, existing properties to see if they're // compatible since we'll have old data that may be invalidated or need to - // be reindexed. New properties don't have any data that would be - // invalidated or incompatible, so we blanket accept all new properties. + // be reindexed. 
int32_t old_required_properties = 0; + int32_t old_indexed_properties = 0; for (const auto& old_property_config : old_type_config.properties()) { auto new_property_name_and_config = - new_property_map.find(old_property_config.property_name()); + new_parsed_property_configs.property_config_map.find( + old_property_config.property_name()); - if (new_property_name_and_config == new_property_map.end()) { + if (new_property_name_and_config == + new_parsed_property_configs.property_config_map.end()) { // Didn't find the old property ICING_VLOG(1) << absl_ports::StrCat("Previously defined property type ", old_type_config.schema_type(), ".", @@ -340,6 +345,13 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( ++old_required_properties; } + // A non-default term_match_type indicates that this property is meant to + // be indexed. + if (old_property_config.indexing_config().term_match_type() != + TermMatchType::UNKNOWN) { + ++old_indexed_properties; + } + // Any change in the indexed property requires a reindexing if (!IsTermMatchTypeCompatible(old_property_config.indexing_config(), new_property_config->indexing_config())) { @@ -352,7 +364,8 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( // guaranteed from our previous checks that all the old properties are also // present in the new property config, so we can do a simple int comparison // here to detect new required properties. - if (new_required_properties > old_required_properties) { + if (new_parsed_property_configs.num_required_properties > + old_required_properties) { ICING_VLOG(1) << absl_ports::StrCat( "New schema ", old_type_config.schema_type(), " has REQUIRED properties that are not " @@ -360,6 +373,18 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( schema_delta.schema_types_incompatible.insert( old_type_config.schema_type()); } + + // If we've gained any new indexed properties, then the section ids may + // change. 
Since the section ids are stored in the index, we'll need to + // reindex everything. + if (new_parsed_property_configs.num_indexed_properties > + old_indexed_properties) { + ICING_VLOG(1) << absl_ports::StrCat( + "Set of indexed properties in schema type '", + old_type_config.schema_type(), + "' has changed, required reindexing."); + schema_delta.index_incompatible = true; + } } return schema_delta; diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h index c547ad2..d65dd10 100644 --- a/icing/schema/schema-util.h +++ b/icing/schema/schema-util.h @@ -54,6 +54,18 @@ class SchemaUtil { } }; + struct ParsedPropertyConfigs { + // Mapping of property name to PropertyConfigProto + std::unordered_map<std::string_view, const PropertyConfigProto*> + property_config_map; + + // Total number of properties that have an indexing config + int32_t num_indexed_properties = 0; + + // Total number of properties that were REQUIRED + int32_t num_required_properties = 0; + }; + // This function validates: // 1. SchemaTypeConfigProto.schema_type's must be unique // 2. Properties within one SchemaTypeConfigProto must be unique @@ -81,14 +93,10 @@ class SchemaUtil { static void BuildTypeConfigMap(const SchemaProto& schema, TypeConfigMap* type_config_map); - // Calculate and return a hash map of (property name -> property config) - // from the given type config. The number of required properties will be - // assigned to output param num_required_properties. - static void BuildPropertyConfigMap( - const SchemaTypeConfigProto& type_config, - std::unordered_map<std::string_view, const PropertyConfigProto*>* - property_config_map, - int32_t* num_required_properties); + // Parses the given type_config and returns a struct of easily-parseable + // information about the properties. + static ParsedPropertyConfigs ParsePropertyConfigs( + const SchemaTypeConfigProto& type_config); // Computes the delta between the old and new schema. 
There are a few // differences that'll be reported: diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc index 64473b8..a3ab96f 100644 --- a/icing/schema/schema-util_test.cc +++ b/icing/schema/schema-util_test.cc @@ -502,6 +502,40 @@ TEST_F(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) { Eq(schema_delta)); } +TEST_F(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) { + // Configure old schema + SchemaProto old_schema; + auto old_type = old_schema.add_types(); + *old_type = CreateSchemaTypeConfig(kEmailType, kPersonType); + + auto old_property = old_type->add_properties(); + old_property->set_property_name("Property"); + old_property->set_data_type(PropertyConfigProto::DataType::STRING); + old_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + // Configure new schema + SchemaProto new_schema; + auto new_type = new_schema.add_types(); + *new_type = CreateSchemaTypeConfig(kEmailType, kPersonType); + + auto new_property = new_type->add_properties(); + new_property->set_property_name("Property"); + new_property->set_data_type(PropertyConfigProto::DataType::STRING); + new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + new_property = new_type->add_properties(); + new_property->set_property_name("NewIndexedProperty"); + new_property->set_data_type(PropertyConfigProto::DataType::STRING); + new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + new_property->mutable_indexing_config()->set_term_match_type( + TermMatchType::EXACT_ONLY); + + SchemaUtil::SchemaDelta schema_delta; + schema_delta.index_incompatible = true; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), + Eq(schema_delta)); +} + TEST_F(SchemaUtilTest, AddingTypeIsCompatible) { // Can add a new type, existing data isn't incompatible, since none of them // are of this new schema type diff --git a/icing/store/document-store.cc 
b/icing/store/document-store.cc index e2457d0..ae8360b 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -1235,6 +1235,59 @@ libtextclassifier3::Status DocumentStore::OptimizeInto( return libtextclassifier3::Status::OK; } +libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo> +DocumentStore::GetOptimizeInfo() const { + OptimizeInfo optimize_info; + + // Figure out our ratio of optimizable/total docs. + int32_t num_documents = document_id_mapper_->num_elements(); + for (DocumentId document_id = kMinDocumentId; document_id < num_documents; + ++document_id) { + if (!DoesDocumentExist(document_id)) { + ++optimize_info.optimizable_docs; + } + + ++optimize_info.total_docs; + } + + if (optimize_info.total_docs == 0) { + // Can exit early since there's nothing to calculate. + return optimize_info; + } + + // Get the total element size. + // + // We use file size instead of disk usage here because the files are not + // sparse, so it's more accurate. Disk usage rounds up to the nearest block + // size. + ICING_ASSIGN_OR_RETURN(const int64_t document_log_file_size, + document_log_->GetElementsFileSize()); + ICING_ASSIGN_OR_RETURN(const int64_t document_id_mapper_file_size, + document_id_mapper_->GetElementsFileSize()); + ICING_ASSIGN_OR_RETURN(const int64_t score_cache_file_size, + score_cache_->GetElementsFileSize()); + ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size, + filter_cache_->GetElementsFileSize()); + + // We use a combined disk usage and file size for the KeyMapper because it's + // backed by a trie, which has some sparse property bitmaps. + ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size, + document_key_mapper_->GetElementsSize()); + + // We don't include the namespace mapper because it's not clear if we could + // recover any space even if Optimize were called. Deleting 100s of documents + // could still leave a few documents of a namespace, and then there would be + // no change. 
+ + int64_t total_size = document_log_file_size + document_key_mapper_size + + document_id_mapper_file_size + score_cache_file_size + + filter_cache_file_size; + + optimize_info.estimated_optimizable_bytes = + total_size * optimize_info.optimizable_docs / optimize_info.total_docs; + return optimize_info; +} + libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache( DocumentId document_id, const DocumentAssociatedScoreData& score_data) { return score_cache_->Set(document_id, score_data); diff --git a/icing/store/document-store.h b/icing/store/document-store.h index 891b199..3547214 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -54,6 +54,20 @@ class DocumentStore { uint32_t checksum; }; + struct OptimizeInfo { + // The estimated size in bytes of the optimizable docs. We don't track the + // size of each document, so we estimate by taking the size of the entire + // DocumentStore and dividing that by the total number of documents we have. + // So we end up with an average document size. + int64_t estimated_optimizable_bytes = 0; + + // Number of total documents the DocumentStore tracks. + int32_t total_docs = 0; + + // Number of optimizable (deleted + expired) docs the DocumentStore tracks. + int32_t optimizable_docs = 0; + }; + // Not copyable DocumentStore(const DocumentStore&) = delete; DocumentStore& operator=(const DocumentStore&) = delete; @@ -208,7 +222,8 @@ class DocumentStore { // INTERNAL on I/O error libtextclassifier3::Status PersistToDisk(); - // Calculates and returns the disk usage in bytes. + // Calculates and returns the disk usage in bytes. Rounds up to the nearest + // block size. // // Returns: // Disk usage on success @@ -273,6 +288,15 @@ class DocumentStore { // INTERNAL_ERROR on IO error libtextclassifier3::Status OptimizeInto(const std::string& new_directory); + // Calculates status for a potential Optimize call. Includes how many docs + // there are vs how many would be optimized away. 
And also includes an + // estimated size gains, in bytes, if Optimize were called. + // + // Returns: + // OptimizeInfo on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const; + // Computes the combined checksum of the document store - includes the ground // truth and all derived files. // diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index 5ec062f..f59d2e2 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -1966,5 +1966,53 @@ TEST_F(DocumentStoreTest, IsOkAndHolds(EqualsProto(message_document))); } +TEST_F(DocumentStoreTest, GetOptimizeInfo) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocumentStore> document_store, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + + // Nothing should be optimizable yet + ICING_ASSERT_OK_AND_ASSIGN(DocumentStore::OptimizeInfo optimize_info, + document_store->GetOptimizeInfo()); + EXPECT_THAT(optimize_info.total_docs, Eq(0)); + EXPECT_THAT(optimize_info.optimizable_docs, Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0)); + + ICING_EXPECT_OK(document_store->Put(DocumentProto(test_document1_))); + + // Adding a document, still nothing is optimizable + ICING_ASSERT_OK_AND_ASSIGN(optimize_info, document_store->GetOptimizeInfo()); + EXPECT_THAT(optimize_info.total_docs, Eq(1)); + EXPECT_THAT(optimize_info.optimizable_docs, Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0)); + + // Delete a document. 
Now something is optimizable + ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(), + test_document1_.uri())); + ICING_ASSERT_OK_AND_ASSIGN(optimize_info, document_store->GetOptimizeInfo()); + EXPECT_THAT(optimize_info.total_docs, Eq(1)); + EXPECT_THAT(optimize_info.optimizable_docs, Eq(1)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Gt(0)); + + // Optimize it into a different directory, should bring us back to nothing + // since all documents were optimized away. + std::string optimized_dir = document_store_dir_ + "_optimize"; + EXPECT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); + EXPECT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); + ICING_ASSERT_OK(document_store->OptimizeInto(optimized_dir)); + document_store.reset(); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocumentStore> optimized_document_store, + DocumentStore::Create(&filesystem_, optimized_dir, &fake_clock_, + schema_store_.get())); + + ICING_ASSERT_OK_AND_ASSIGN(optimize_info, + optimized_document_store->GetOptimizeInfo()); + EXPECT_THAT(optimize_info.total_docs, Eq(0)); + EXPECT_THAT(optimize_info.optimizable_docs, Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0)); +} + } // namespace lib } // namespace icing diff --git a/icing/store/key-mapper.h b/icing/store/key-mapper.h index b01a8f1..a85b00d 100644 --- a/icing/store/key-mapper.h +++ b/icing/store/key-mapper.h @@ -99,13 +99,23 @@ class KeyMapper { // INTERNAL on I/O error libtextclassifier3::Status PersistToDisk(); - // Calculates and returns the disk usage in bytes. + // Calculates and returns the disk usage in bytes. Rounds up to the nearest + // block size. // // Returns: // Disk usage on success // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const; + // Returns the size of the elements held in the key mapper. This excludes the + // size of any internal metadata of the key mapper, e.g. 
the key mapper's + // header. + // + // Returns: + // File size on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetElementsSize() const; + // Computes and returns the checksum of the header and contents. Crc32 ComputeChecksum(); @@ -261,6 +271,16 @@ libtextclassifier3::StatusOr<int64_t> KeyMapper<T>::GetDiskUsage() const { } template <typename T> +libtextclassifier3::StatusOr<int64_t> KeyMapper<T>::GetElementsSize() const { + int64_t size = trie_.GetElementsSize(); + if (size == IcingFilesystem::kBadFileSize || size < 0) { + return absl_ports::InternalError( + "Failed to get disk usage of elements in the key mapper"); + } + return size; +} + +template <typename T> Crc32 KeyMapper<T>::ComputeChecksum() { return Crc32(trie_.UpdateCrc()); } diff --git a/icing/testing/logging-event-listener.cc b/icing/testing/logging-event-listener.cc new file mode 100644 index 0000000..4b42825 --- /dev/null +++ b/icing/testing/logging-event-listener.cc @@ -0,0 +1,121 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/testing/logging-event-listener.h" + +#include "icing/legacy/core/icing-string-util.h" +#include "icing/util/logging.h" + +namespace icing { +namespace lib { + +void LoggingEventListener::OnTestProgramStart( + const testing::UnitTest& /* unit_test */) {} + +void LoggingEventListener::OnTestIterationStart( + const testing::UnitTest& unit_test, int iteration) { + ICING_LOG(INFO) << "[==========] Running " << unit_test.test_to_run_count() + << " test(s) from " << unit_test.test_case_to_run_count() + << " test case(s)"; +} + +void LoggingEventListener::OnEnvironmentsSetUpStart( + const testing::UnitTest& unit_test) { + ICING_LOG(INFO) << "[----------] Global test environment set-up."; +} + +void LoggingEventListener::OnEnvironmentsSetUpEnd( + const testing::UnitTest& /* unit_test */) {} + +void LoggingEventListener::OnTestCaseStart(const testing::TestCase& test_case) { + std::string param_text; + if (test_case.type_param()) { + param_text = IcingStringUtil::StringPrintf(", where TypeParam = %s", + test_case.type_param()); + } + ICING_LOG(INFO) << "[----------] " << test_case.test_to_run_count() + << " test(s) from " << test_case.name() << param_text; +} + +void LoggingEventListener::OnTestStart(const testing::TestInfo& test_info) { + ICING_LOG(INFO) << "[ RUN ] " << test_info.test_case_name() << "." + << test_info.name(); +} + +void LoggingEventListener::OnTestPartResult( + const testing::TestPartResult& test_part_result) { + if (test_part_result.type() != testing::TestPartResult::kSuccess) { + ICING_LOG(ERROR) << test_part_result.file_name() << ":" + << test_part_result.line_number() << ": Failure " + << test_part_result.message(); + } +} + +void LoggingEventListener::OnTestEnd(const testing::TestInfo& test_info) { + if (test_info.result()->Passed()) { + ICING_LOG(INFO) << "[ OK ] " << test_info.test_case_name() << "." + << test_info.name(); + } else { + ICING_LOG(ERROR) << "[ FAILED ] " << test_info.test_case_name() << "." 
+ << test_info.name(); + } +} + +void LoggingEventListener::OnTestCaseEnd(const testing::TestCase& test_case) { + ICING_LOG(INFO) << "[----------] " << test_case.test_to_run_count() + << " test(s) from " << test_case.name() << " (" + << test_case.elapsed_time() << " ms total)"; +} + +void LoggingEventListener::OnEnvironmentsTearDownStart( + const testing::UnitTest& unit_test) { + ICING_LOG(INFO) << "[----------] Global test environment tear-down."; +} + +void LoggingEventListener::OnEnvironmentsTearDownEnd( + const testing::UnitTest& /* unit_test */) {} + +void LoggingEventListener::OnTestIterationEnd( + const testing::UnitTest& unit_test, int iteration) { + ICING_LOG(INFO) << "[==========] " << unit_test.test_to_run_count() + << " test(s) from " << unit_test.test_case_to_run_count() + << " test case(s) ran. (" << unit_test.elapsed_time() + << " ms total)"; + ICING_LOG(INFO) << "[ PASSED ] " << unit_test.successful_test_count() + << " test(s)"; + if (!unit_test.Passed()) { + ICING_LOG(ERROR) << "[ FAILED ] " << unit_test.failed_test_count() + << " test(s), listed below:"; + for (int i = 0; i < unit_test.total_test_case_count(); ++i) { + const testing::TestCase& test_case = *unit_test.GetTestCase(i); + if (!test_case.should_run() || (test_case.failed_test_count() == 0)) { + continue; + } + for (int j = 0; j < test_case.total_test_count(); ++j) { + const testing::TestInfo& test_info = *test_case.GetTestInfo(j); + if (!test_info.should_run() || test_info.result()->Passed()) { + continue; + } + ICING_LOG(ERROR) << "[ FAILED ] " << test_case.name() << "." 
+ << test_info.name(); + } + } + } +} + +void LoggingEventListener::OnTestProgramEnd( + const testing::UnitTest& /* unit_test */) {} + +} // namespace lib +} // namespace icing diff --git a/icing/testing/logging-event-listener.h b/icing/testing/logging-event-listener.h new file mode 100644 index 0000000..8024222 --- /dev/null +++ b/icing/testing/logging-event-listener.h @@ -0,0 +1,62 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TESTING_LOGGING_EVENT_LISTENER_H_ +#define ICING_TESTING_LOGGING_EVENT_LISTENER_H_ + +#include "gtest/gtest.h" + +namespace icing { +namespace lib { + +// TestEventListener that writes test results to the log so that they will be +// visible in the logcat output in Sponge. +// The formatting of the output is patterned after the output produced by the +// standard PrettyUnitTestResultPrinter. 
+class LoggingEventListener : public ::testing::TestEventListener { + public: + void OnTestProgramStart(const testing::UnitTest& unit_test) override; + + void OnTestIterationStart(const testing::UnitTest& unit_test, + int iteration) override; + + void OnEnvironmentsSetUpStart(const testing::UnitTest& unit_test) override; + + void OnEnvironmentsSetUpEnd(const testing::UnitTest& unit_test) override; + + void OnTestCaseStart(const testing::TestCase& test_case) override; + + void OnTestStart(const testing::TestInfo& test_info) override; + + void OnTestPartResult( + const testing::TestPartResult& test_part_result) override; + + void OnTestEnd(const testing::TestInfo& test_info) override; + + void OnTestCaseEnd(const testing::TestCase& test_case) override; + + void OnEnvironmentsTearDownStart(const testing::UnitTest& unit_test) override; + + void OnEnvironmentsTearDownEnd(const testing::UnitTest& unit_test) override; + + void OnTestIterationEnd(const testing::UnitTest& unit_test, + int iteration) override; + + void OnTestProgramEnd(const testing::UnitTest& unit_test) override; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_TESTING_LOGGING_EVENT_LISTENER_H_ diff --git a/icing/text_classifier/lib3/utils/java/jni-base.cc b/icing/text_classifier/lib3/utils/java/jni-base.cc new file mode 100644 index 0000000..3b6d09e --- /dev/null +++ b/icing/text_classifier/lib3/utils/java/jni-base.cc @@ -0,0 +1,44 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/text_classifier/lib3/utils/java/jni-base.h" + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/java/string_utils.h" + +namespace libtextclassifier3 { + +bool EnsureLocalCapacity(JNIEnv* env, int capacity) { + return env->EnsureLocalCapacity(capacity) == JNI_OK; +} + +bool JniExceptionCheckAndClear(JNIEnv* env) { + TC3_CHECK(env != nullptr); + const bool result = env->ExceptionCheck(); + if (result) { + env->ExceptionDescribe(); + env->ExceptionClear(); + } + return result; +} + +StatusOr<std::string> ToStlString(JNIEnv* env, const jstring& str) { + std::string result; + if (!JStringToUtf8String(env, str, &result)) { + return {Status::UNKNOWN}; + } + return result; +} + +} // namespace libtextclassifier3 diff --git a/icing/text_classifier/lib3/utils/java/jni-base.h b/icing/text_classifier/lib3/utils/java/jni-base.h new file mode 100644 index 0000000..7fd612a --- /dev/null +++ b/icing/text_classifier/lib3/utils/java/jni-base.h @@ -0,0 +1,217 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_BASE_H_ +#define ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_BASE_H_ + +#include <jni.h> + +#include <string> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" + +// When we use a macro as an argument for a macro, an additional level of +// indirection is needed, if the macro argument is used with # or ##. +#define TC3_ADD_QUOTES_HELPER(TOKEN) #TOKEN +#define TC3_ADD_QUOTES(TOKEN) TC3_ADD_QUOTES_HELPER(TOKEN) + +#ifndef TC3_PACKAGE_NAME +#define TC3_PACKAGE_NAME com_google_knowledge_cerebra_sense_textclassifier_lib3 +#endif + +#ifndef TC3_PACKAGE_PATH +#define TC3_PACKAGE_PATH \ + "com/google/knowledge/cerebra/sense/textclassifier/lib3/" +#endif + +#define TC3_JNI_METHOD_NAME_INTERNAL(package_name, class_name, method_name) \ + Java_##package_name##_##class_name##_##method_name + +#define TC3_JNI_METHOD_PRIMITIVE(return_type, package_name, class_name, \ + method_name) \ + JNIEXPORT return_type JNICALL TC3_JNI_METHOD_NAME_INTERNAL( \ + package_name, class_name, method_name) + +// The indirection is needed to correctly expand the TC3_PACKAGE_NAME macro. +// See the explanation near TC3_ADD_QUOTES macro. +#define TC3_JNI_METHOD2(return_type, package_name, class_name, method_name) \ + TC3_JNI_METHOD_PRIMITIVE(return_type, package_name, class_name, method_name) + +#define TC3_JNI_METHOD(return_type, class_name, method_name) \ + TC3_JNI_METHOD2(return_type, TC3_PACKAGE_NAME, class_name, method_name) + +#define TC3_JNI_METHOD_NAME2(package_name, class_name, method_name) \ + TC3_JNI_METHOD_NAME_INTERNAL(package_name, class_name, method_name) + +#define TC3_JNI_METHOD_NAME(class_name, method_name) \ + TC3_JNI_METHOD_NAME2(TC3_PACKAGE_NAME, class_name, method_name) + +namespace libtextclassifier3 { + +// Returns true if the requested capacity is available. +bool EnsureLocalCapacity(JNIEnv* env, int capacity); + +// Returns true if there was an exception. Also it clears the exception. 
+bool JniExceptionCheckAndClear(JNIEnv* env); + +StatusOr<std::string> ToStlString(JNIEnv* env, const jstring& str); + +// A deleter to be used with std::unique_ptr to delete JNI global references. +class GlobalRefDeleter { + public: + explicit GlobalRefDeleter(JavaVM* jvm) : jvm_(jvm) {} + + GlobalRefDeleter(const GlobalRefDeleter& orig) = default; + + // Copy assignment to allow move semantics in ScopedGlobalRef. + GlobalRefDeleter& operator=(const GlobalRefDeleter& rhs) { + TC3_CHECK_EQ(jvm_, rhs.jvm_); + return *this; + } + + // The delete operator. + void operator()(jobject object) const { + JNIEnv* env; + if (object != nullptr && jvm_ != nullptr && + JNI_OK == + jvm_->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_4)) { + env->DeleteGlobalRef(object); + } + } + + private: + // The jvm_ stashed to use for deletion. + JavaVM* const jvm_; +}; + +// A deleter to be used with std::unique_ptr to delete JNI local references. +class LocalRefDeleter { + public: + explicit LocalRefDeleter(JNIEnv* env) + : env_(env) {} // NOLINT(runtime/explicit) + + LocalRefDeleter(const LocalRefDeleter& orig) = default; + + // Copy assignment to allow move semantics in ScopedLocalRef. + LocalRefDeleter& operator=(const LocalRefDeleter& rhs) { + env_ = rhs.env_; + return *this; + } + + // The delete operator. + void operator()(jobject object) const { + if (env_) { + env_->DeleteLocalRef(object); + } + } + + private: + // The env_ stashed to use for deletion. Thread-local, don't share! + JNIEnv* env_; +}; + +// A smart pointer that deletes a reference when it goes out of scope. +// +// Note that this class is not thread-safe since it caches JNIEnv in +// the deleter. Do not use the same jobject across different threads. 
+template <typename T, typename Env, typename Deleter> +class ScopedRef { + public: + ScopedRef() : ptr_(nullptr, Deleter(nullptr)) {} + ScopedRef(T value, Env* env) : ptr_(value, Deleter(env)) {} + + T get() const { return ptr_.get(); } + + T release() { return ptr_.release(); } + + bool operator!() const { return !ptr_; } + + bool operator==(void* value) const { return ptr_.get() == value; } + + explicit operator bool() const { return ptr_ != nullptr; } + + void reset(T value, Env* env) { + ptr_.reset(value); + ptr_.get_deleter() = Deleter(env); + } + + private: + std::unique_ptr<typename std::remove_pointer<T>::type, Deleter> ptr_; +}; + +template <typename T, typename U, typename Env, typename Deleter> +inline bool operator==(const ScopedRef<T, Env, Deleter>& x, + const ScopedRef<U, Env, Deleter>& y) { + return x.get() == y.get(); +} + +template <typename T, typename Env, typename Deleter> +inline bool operator==(const ScopedRef<T, Env, Deleter>& x, std::nullptr_t) { + return x.get() == nullptr; +} + +template <typename T, typename Env, typename Deleter> +inline bool operator==(std::nullptr_t, const ScopedRef<T, Env, Deleter>& x) { + return nullptr == x.get(); +} + +template <typename T, typename U, typename Env, typename Deleter> +inline bool operator!=(const ScopedRef<T, Env, Deleter>& x, + const ScopedRef<U, Env, Deleter>& y) { + return x.get() != y.get(); +} + +template <typename T, typename Env, typename Deleter> +inline bool operator!=(const ScopedRef<T, Env, Deleter>& x, std::nullptr_t) { + return x.get() != nullptr; +} + +template <typename T, typename Env, typename Deleter> +inline bool operator!=(std::nullptr_t, const ScopedRef<T, Env, Deleter>& x) { + return nullptr != x.get(); +} + +template <typename T, typename U, typename Env, typename Deleter> +inline bool operator<(const ScopedRef<T, Env, Deleter>& x, + const ScopedRef<U, Env, Deleter>& y) { + return x.get() < y.get(); +} + +template <typename T, typename U, typename Env, typename Deleter> 
+inline bool operator>(const ScopedRef<T, Env, Deleter>& x, + const ScopedRef<U, Env, Deleter>& y) { + return x.get() > y.get(); +} + +// A smart pointer that deletes a JNI global reference when it goes out +// of scope. Usage is: +// ScopedGlobalRef<jobject> scoped_global(env->JniFunction(), jvm); +template <typename T> +using ScopedGlobalRef = ScopedRef<T, JavaVM, GlobalRefDeleter>; + +// Ditto, but usage is: +// ScopedLocalRef<jobject> scoped_local(env->JniFunction(), env); +template <typename T> +using ScopedLocalRef = ScopedRef<T, JNIEnv, LocalRefDeleter>; + +// A helper to create global references. +template <typename T> +ScopedGlobalRef<T> MakeGlobalRef(T object, JNIEnv* env, JavaVM* jvm) { + const jobject global_object = env->NewGlobalRef(object); + return ScopedGlobalRef<T>(reinterpret_cast<T>(global_object), jvm); +} + +} // namespace libtextclassifier3 + +#endif // ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_BASE_H_ diff --git a/icing/text_classifier/lib3/utils/java/jni-helper.cc b/icing/text_classifier/lib3/utils/java/jni-helper.cc new file mode 100644 index 0000000..0d9b0a0 --- /dev/null +++ b/icing/text_classifier/lib3/utils/java/jni-helper.cc @@ -0,0 +1,175 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/text_classifier/lib3/utils/java/jni-helper.h" + +namespace libtextclassifier3 { + +StatusOr<ScopedLocalRef<jclass>> JniHelper::FindClass(JNIEnv* env, + const char* class_name) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<jclass> result(env->FindClass(class_name), env); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +StatusOr<jmethodID> JniHelper::GetMethodID(JNIEnv* env, jclass clazz, + const char* method_name, + const char* return_type) { + jmethodID result = env->GetMethodID(clazz, method_name, return_type); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +StatusOr<ScopedLocalRef<jobject>> JniHelper::GetStaticObjectField( + JNIEnv* env, jclass class_name, jfieldID field_id) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<jobject> result( + env->GetStaticObjectField(class_name, field_id), env); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +StatusOr<ScopedLocalRef<jbyteArray>> JniHelper::NewByteArray(JNIEnv* env, + jsize length) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<jbyteArray> result(env->NewByteArray(length), env); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +Status JniHelper::CallVoidMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...) { + va_list args; + va_start(args, method_id); + env->CallVoidMethodV(object, method_id, args); + va_end(args); + + TC3_NO_EXCEPTION_OR_RETURN; + return Status::OK; +} + +StatusOr<bool> JniHelper::CallBooleanMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...) { + va_list args; + va_start(args, method_id); + bool result = env->CallBooleanMethodV(object, method_id, args); + va_end(args); + + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +StatusOr<int32> JniHelper::CallIntMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...) 
{ + va_list args; + va_start(args, method_id); + jint result = env->CallIntMethodV(object, method_id, args); + va_end(args); + + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +StatusOr<int64> JniHelper::CallLongMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...) { + va_list args; + va_start(args, method_id); + jlong result = env->CallLongMethodV(object, method_id, args); + va_end(args); + + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +StatusOr<float> JniHelper::CallFloatMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...) { + va_list args; + va_start(args, method_id); + jfloat result = env->CallFloatMethodV(object, method_id, args); + va_end(args); + + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +StatusOr<double> JniHelper::CallDoubleMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...) { + va_list args; + va_start(args, method_id); + jdouble result = env->CallDoubleMethodV(object, method_id, args); + va_end(args); + + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +StatusOr<ScopedLocalRef<jintArray>> JniHelper::NewIntArray(JNIEnv* env, + jsize length) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<jintArray> result(env->NewIntArray(length), env); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +StatusOr<ScopedLocalRef<jfloatArray>> JniHelper::NewFloatArray(JNIEnv* env, + jsize length) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<jfloatArray> result(env->NewFloatArray(length), env); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +Status JniHelper::SetObjectArrayElement(JNIEnv* env, jobjectArray array, + jsize index, jobject val) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + env->SetObjectArrayElement(array, index, val); + TC3_NO_EXCEPTION_OR_RETURN; + return Status::OK; +} + +StatusOr<ScopedLocalRef<jobjectArray>> JniHelper::NewObjectArray( + JNIEnv* env, jsize length, jclass element_class, jobject initial_element) { + 
TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<jobjectArray> result( + env->NewObjectArray(length, element_class, initial_element), env); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +StatusOr<jsize> JniHelper::GetArrayLength(JNIEnv* env, + jarray jinput_fragments) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + jsize result = env->GetArrayLength(jinput_fragments); + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +StatusOr<ScopedLocalRef<jstring>> JniHelper::NewStringUTF(JNIEnv* env, + const char* bytes) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<jstring> result(env->NewStringUTF(bytes), env); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +} // namespace libtextclassifier3 diff --git a/icing/text_classifier/lib3/utils/java/jni-helper.h b/icing/text_classifier/lib3/utils/java/jni-helper.h new file mode 100644 index 0000000..ea4ba3b --- /dev/null +++ b/icing/text_classifier/lib3/utils/java/jni-helper.h @@ -0,0 +1,156 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Utility class that provides similar calls like JNIEnv, but performs +// additional checks on them, so that it's harder to use them incorrectly. 
+ +#ifndef ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_HELPER_H_ +#define ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_HELPER_H_ + +#include <jni.h> + +#include <string> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/text_classifier/lib3/utils/java/jni-base.h" + +#define TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN \ + if (!EnsureLocalCapacity(env, 1)) { \ + TC3_LOG(ERROR) << "EnsureLocalCapacity(1) failed."; \ + return {Status::UNKNOWN}; \ + } + +#define TC3_NO_EXCEPTION_OR_RETURN \ + if (JniExceptionCheckAndClear(env)) { \ + return {Status::UNKNOWN}; \ + } + +#define TC3_NOT_NULL_OR_RETURN \ + if (result == nullptr) { \ + return {Status::UNKNOWN}; \ + } + +#define TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD( \ + METHOD_NAME, RETURN_TYPE, INPUT_TYPE, POST_CHECK) \ + template <typename T = RETURN_TYPE> \ + static StatusOr<ScopedLocalRef<T>> METHOD_NAME( \ + JNIEnv* env, INPUT_TYPE object, jmethodID method_id, ...) { \ + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; \ + \ + va_list args; \ + va_start(args, method_id); \ + ScopedLocalRef<T> result( \ + reinterpret_cast<T>(env->METHOD_NAME##V(object, method_id, args)), \ + env); \ + POST_CHECK \ + va_end(args); \ + \ + TC3_NO_EXCEPTION_OR_RETURN; \ + return result; \ + } + +#define TC3_JNI_NO_CHECK \ + {} + +namespace libtextclassifier3 { + +class JniHelper { + public: + // Misc methods. + static StatusOr<ScopedLocalRef<jclass>> FindClass(JNIEnv* env, + const char* class_name); + + template <typename T = jobject> + static StatusOr<ScopedLocalRef<T>> GetObjectArrayElement(JNIEnv* env, + jobjectArray array, + jsize index); + static StatusOr<jmethodID> GetMethodID(JNIEnv* env, jclass clazz, + const char* method_name, + const char* return_type); + + static StatusOr<ScopedLocalRef<jobject>> GetStaticObjectField( + JNIEnv* env, jclass class_name, jfieldID field_id); + + // New* methods. 
+ TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD(NewObject, jobject, jclass, + TC3_NOT_NULL_OR_RETURN); + static StatusOr<ScopedLocalRef<jobjectArray>> NewObjectArray( + JNIEnv* env, jsize length, jclass element_class, + jobject initial_element = nullptr); + static StatusOr<ScopedLocalRef<jbyteArray>> NewByteArray(JNIEnv* env, + jsize length); + static StatusOr<ScopedLocalRef<jintArray>> NewIntArray(JNIEnv* env, + jsize length); + static StatusOr<ScopedLocalRef<jstring>> NewStringUTF(JNIEnv* env, + const char* bytes); + static StatusOr<ScopedLocalRef<jfloatArray>> NewFloatArray(JNIEnv* env, + jsize length); + + static StatusOr<jsize> GetArrayLength(JNIEnv* env, jarray jinput_fragments); + + static Status SetObjectArrayElement(JNIEnv* env, jobjectArray array, + jsize index, jobject val); + + // Call* methods. + TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD(CallObjectMethod, jobject, + jobject, TC3_JNI_NO_CHECK); + TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD(CallStaticObjectMethod, + jobject, jclass, + TC3_JNI_NO_CHECK); + static Status CallVoidMethod(JNIEnv* env, jobject object, jmethodID method_id, + ...); + static StatusOr<bool> CallBooleanMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...); + static StatusOr<int32> CallIntMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...); + static StatusOr<int64> CallLongMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...); + static StatusOr<float> CallFloatMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...); + static StatusOr<double> CallDoubleMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...); + + template <class T> + static StatusOr<T> CallStaticIntMethod(JNIEnv* env, jclass clazz, + jmethodID method_id, ...); +}; + +template <typename T> +StatusOr<ScopedLocalRef<T>> JniHelper::GetObjectArrayElement(JNIEnv* env, + jobjectArray array, + jsize index) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<T> result( + 
reinterpret_cast<T>(env->GetObjectArrayElement(array, index)), env); + + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +template <class T> +StatusOr<T> JniHelper::CallStaticIntMethod(JNIEnv* env, jclass clazz, + jmethodID method_id, ...) { + va_list args; + va_start(args, method_id); + jint result = env->CallStaticIntMethodV(clazz, method_id, args); + va_end(args); + + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +} // namespace libtextclassifier3 + +#endif // ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_HELPER_H_ diff --git a/icing/text_classifier/lib3/utils/java/string_utils.cc b/icing/text_classifier/lib3/utils/java/string_utils.cc new file mode 100644 index 0000000..2384ba4 --- /dev/null +++ b/icing/text_classifier/lib3/utils/java/string_utils.cc @@ -0,0 +1,73 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/text_classifier/lib3/utils/java/string_utils.h" + +#include "icing/text_classifier/lib3/utils/base/logging.h" + +namespace libtextclassifier3 { + +bool JByteArrayToString(JNIEnv* env, const jbyteArray& array, + std::string* result) { + jbyte* const array_bytes = env->GetByteArrayElements(array, JNI_FALSE); + if (array_bytes == nullptr) { + return false; + } + + const int array_length = env->GetArrayLength(array); + *result = std::string(reinterpret_cast<char*>(array_bytes), array_length); + + env->ReleaseByteArrayElements(array, array_bytes, JNI_ABORT); + + return true; +} + +bool JStringToUtf8String(JNIEnv* env, const jstring& jstr, + std::string* result) { + if (jstr == nullptr) { + *result = std::string(); + return true; + } + + jclass string_class = env->FindClass("java/lang/String"); + if (!string_class) { + TC3_LOG(ERROR) << "Can't find String class"; + return false; + } + + jmethodID get_bytes_id = + env->GetMethodID(string_class, "getBytes", "(Ljava/lang/String;)[B"); + + jstring encoding = env->NewStringUTF("UTF-8"); + + jbyteArray array = reinterpret_cast<jbyteArray>( + env->CallObjectMethod(jstr, get_bytes_id, encoding)); + + JByteArrayToString(env, array, result); + + // Release the array. 
+ env->DeleteLocalRef(array); + env->DeleteLocalRef(string_class); + env->DeleteLocalRef(encoding); + + return true; +} + +ScopedStringChars GetScopedStringChars(JNIEnv* env, jstring string, + jboolean* is_copy) { + return ScopedStringChars(env->GetStringUTFChars(string, is_copy), + StringCharsReleaser(env, string)); +} + +} // namespace libtextclassifier3 diff --git a/icing/text_classifier/lib3/utils/java/string_utils.h b/icing/text_classifier/lib3/utils/java/string_utils.h new file mode 100644 index 0000000..dddef57 --- /dev/null +++ b/icing/text_classifier/lib3/utils/java/string_utils.h @@ -0,0 +1,74 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_STRING_UTILS_H_ +#define ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_STRING_UTILS_H_ + +#include <jni.h> +#include <memory> +#include <string> + +#include "icing/text_classifier/lib3/utils/base/logging.h" + +namespace libtextclassifier3 { + +bool JByteArrayToString(JNIEnv* env, const jbyteArray& array, + std::string* result); +bool JStringToUtf8String(JNIEnv* env, const jstring& jstr, std::string* result); + +// A deleter to be used with std::unique_ptr to release Java string chars. 
+class StringCharsReleaser { + public: + StringCharsReleaser() : env_(nullptr) {} + + StringCharsReleaser(JNIEnv* env, jstring jstr) : env_(env), jstr_(jstr) {} + + StringCharsReleaser(const StringCharsReleaser& orig) = default; + + // Copy assignment to allow move semantics in StringCharsReleaser. + StringCharsReleaser& operator=(const StringCharsReleaser& rhs) { + // As the releaser and its state are thread-local, it's enough to only + // ensure the envs are consistent but do nothing. + TC3_CHECK_EQ(env_, rhs.env_); + return *this; + } + + // The delete operator. + void operator()(const char* chars) const { + if (env_ != nullptr) { + env_->ReleaseStringUTFChars(jstr_, chars); + } + } + + private: + // The env_ stashed to use for deletion. Thread-local, don't share! + JNIEnv* const env_; + + // The referenced jstring. + jstring jstr_; +}; + +// A smart pointer that releases string chars when it goes out of scope. +// of scope. +// Note that this class is not thread-safe since it caches JNIEnv in +// the deleter. Do not use the same jobject across different threads. +using ScopedStringChars = std::unique_ptr<const char, StringCharsReleaser>; + +// Returns a scoped pointer to the array of Unicode characters of a string. +ScopedStringChars GetScopedStringChars(JNIEnv* env, jstring string, + jboolean* is_copy = nullptr); + +} // namespace libtextclassifier3 + +#endif // ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_STRING_UTILS_H_ diff --git a/icing/tokenization/icu-language-segmenter_test.cc b/icing/tokenization/icu-language-segmenter_test.cc deleted file mode 100644 index fd4755a..0000000 --- a/icing/tokenization/icu-language-segmenter_test.cc +++ /dev/null @@ -1,374 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "icing/absl_ports/str_cat.h" -#include "icing/icu-data-file-helper.h" -#include "icing/testing/common-matchers.h" -#include "icing/testing/icu-i18n-test-utils.h" -#include "icing/testing/test-data.h" -#include "icing/tokenization/language-segmenter-factory.h" -#include "icing/tokenization/language-segmenter.h" -#include "unicode/uloc.h" - -namespace icing { -namespace lib { -namespace { -using ::testing::ElementsAre; -using ::testing::Eq; -using ::testing::IsEmpty; - -class IcuLanguageSegmenterAllLocalesTest - : public testing::TestWithParam<const char*> { - protected: - void SetUp() override { - ICING_ASSERT_OK( - // File generated via icu_data_file rule in //icing/BUILD. 
- icu_data_file_helper::SetUpICUDataFile( - GetTestFilePath("icing/icu.dat"))); - } - - static std::string GetLocale() { return GetParam(); } -}; - -TEST_P(IcuLanguageSegmenterAllLocalesTest, EmptyText) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty())); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, SimpleText) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"), - IsOkAndHolds(ElementsAre("Hello", " ", "World"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_Punctuation) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // ASCII punctuation marks are kept - EXPECT_THAT( - language_segmenter->GetAllTerms("Hello, World!!!"), - IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!"))); - EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"), - IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project"))); - EXPECT_THAT(language_segmenter->GetAllTerms("100%"), - IsOkAndHolds(ElementsAre("100", "%"))); - EXPECT_THAT(language_segmenter->GetAllTerms("A&B"), - IsOkAndHolds(ElementsAre("A", "&", "B"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // ASCII special characters are kept - EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"), - IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000"))); - EXPECT_THAT(language_segmenter->GetAllTerms("A+B"), - IsOkAndHolds(ElementsAre("A", "+", "B"))); - // 0x0009 is the unicode for tab (within 
ASCII range). - std::string text_with_tab = absl_ports::StrCat( - "Hello", UCharToString(0x0009), UCharToString(0x0009), "World"); - EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab), - IsOkAndHolds(ElementsAre("Hello", UCharToString(0x0009), - UCharToString(0x0009), "World"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // Full-width (non-ASCII) punctuation marks and special characters are left - // out. - EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"), - IsOkAndHolds(ElementsAre("Hello"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - EXPECT_THAT(language_segmenter->GetAllTerms("U.S. Bank"), - IsOkAndHolds(ElementsAre("U.S", ".", " ", "Bank"))); - EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."), - IsOkAndHolds(ElementsAre("I.B.M", "."))); - EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"), - IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M"))); - EXPECT_THAT(language_segmenter->GetAllTerms("I B M"), - IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // According to unicode word break rules - // WB6(https://unicode.org/reports/tr29/#WB6), - // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some - // punctuation characters are used as word connecters. That is, words don't - // break before and after them. Here we just test some that we care about. 
- - // Word connecters - EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"), - IsOkAndHolds(ElementsAre("com.google.android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"), - IsOkAndHolds(ElementsAre("com:google:android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"), - IsOkAndHolds(ElementsAre("com'google'android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"), - IsOkAndHolds(ElementsAre("com_google_android"))); - - // Word connecters can be mixed - EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"), - IsOkAndHolds(ElementsAre("com.google.android:icing"))); - - // Any heading and trailing characters are not connecters - EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."), - IsOkAndHolds(ElementsAre(".", "com.google.android", "."))); - - // Not word connecters - EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"), - IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"), - IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"), - IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"), - IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"), - IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"), - IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"), - IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"), - IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android"))); - 
EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"), - IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"), - IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android"))); - EXPECT_THAT( - language_segmenter->GetAllTerms("com\"google\"android"), - IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."), - IsOkAndHolds(ElementsAre("It's", " ", "ok", "."))); - EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."), - IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", "."))); - EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."), - IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", "."))); - EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"), - IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone"))); - // 0x2019 is the single right quote, should be treated the same as "'" - std::string token_with_quote = - absl_ports::StrCat("He", UCharToString(0x2019), "ll"); - std::string text_with_quote = - absl_ports::StrCat(token_with_quote, " be back."); - EXPECT_THAT( - language_segmenter->GetAllTerms(text_with_quote), - IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", "."))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, Parentheses) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - - EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"), - IsOkAndHolds(ElementsAre("(", "Hello", ")"))); - - EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("), - IsOkAndHolds(ElementsAre(")", "Hello", "("))); -} - 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, Quotes) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - - EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""), - IsOkAndHolds(ElementsAre("\"", "Hello", "\""))); - - EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"), - IsOkAndHolds(ElementsAre("'", "Hello", "'"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, Alphanumeric) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - - // Alphanumeric terms are allowed - EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"), - IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - - // Alphanumeric terms are allowed - EXPECT_THAT( - language_segmenter->GetAllTerms("3.141592653589793238462643383279"), - IsOkAndHolds(ElementsAre("3.141592653589793238462643383279"))); - - EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"), - IsOkAndHolds(ElementsAre("3,456.789"))); - - EXPECT_THAT(language_segmenter->GetAllTerms("-123"), - IsOkAndHolds(ElementsAre("-", "123"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // Multiple continuous whitespaces are treated as one. 
- const int kNumSeparators = 256; - const std::string text_with_spaces = - absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World"); - EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces), - IsOkAndHolds(ElementsAre("Hello", " ", "World"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't - // have whitespaces as word delimiter. - - // Chinese - EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"), - IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班"))); - // Japanese - EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"), - IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩", - "い", "てい", "ます"))); - // Khmer - EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), - IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ"))); - // Thai - EXPECT_THAT( - language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"), - IsOkAndHolds(ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุก", "วัน"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, LatinLettersWithAccents) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"), - IsOkAndHolds(ElementsAre("āăąḃḅḇčćç"))); -} - -// TODO(samzheng): test cases for more languages (e.g. 
top 20 in the world) -TEST_P(IcuLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // Turkish - EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"), - IsOkAndHolds(ElementsAre("merhaba", " ", "dünya"))); - // Korean - EXPECT_THAT( - language_segmenter->GetAllTerms("나는 매일 출근합니다."), - IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", "."))); -} - -// TODO(samzheng): more mixed languages test cases -TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguages) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"), - IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好", - "吗", "お", "元気", "です", "か"))); - - EXPECT_THAT( - language_segmenter->GetAllTerms("나는 California에 산다"), - IsOkAndHolds(ElementsAre("나는", " ", "California", "에", " ", "산다"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // Validates that the input strings are not copied - const std::string text = "Hello World"; - const char* word1_address = text.c_str(); - const char* word2_address = text.c_str() + 6; - ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms, - language_segmenter->GetAllTerms(text)); - ASSERT_THAT(terms, ElementsAre("Hello", " ", "World")); - const char* word1_result_address = terms.at(0).data(); - const char* word2_result_address = terms.at(2).data(); - - // The underlying char* should be the same - EXPECT_THAT(word1_address, Eq(word1_result_address)); - EXPECT_THAT(word2_address, Eq(word2_result_address)); -} - -INSTANTIATE_TEST_SUITE_P( - LocaleName, 
IcuLanguageSegmenterAllLocalesTest, - testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH, - ULOC_FRANCE, ULOC_GERMANY, ULOC_ITALY, ULOC_JAPAN, - ULOC_KOREA, ULOC_SIMPLIFIED_CHINESE, - ULOC_TRADITIONAL_CHINESE, - "es_ES", // Spanish - "hi_IN", // Hindi - "th_TH", // Thai - "lo_LA", // Lao - "km_KH", // Khmer - "ar_DZ", // Arabic - "ru_RU", // Russian - "pt_PT", // Portuguese - "en_US_POSIX" // American English (Computer) - "wrong_locale" // Will fall back to ICU default locale - "" // Will fall back to ICU default locale - )); - -} // namespace -} // namespace lib -} // namespace icing diff --git a/icing/tokenization/language-segmenter-factory.cc b/icing/tokenization/icu/icu-language-segmenter-factory.cc index 92d06fe..0ef1824 100644 --- a/icing/tokenization/language-segmenter-factory.cc +++ b/icing/tokenization/icu/icu-language-segmenter-factory.cc @@ -12,10 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "icing/tokenization/icu/icu-language-segmenter.h" #include "icing/tokenization/language-segmenter-factory.h" - -#include "icing/tokenization/icu-language-segmenter.h" -#include "icing/tokenization/space-language-segmenter.h" #include "icing/util/logging.h" namespace icing { @@ -37,23 +35,18 @@ constexpr std::string_view kLocaleAmericanEnglishComputer = "en_US_POSIX"; // users. Right now illegal locale strings will be ignored by ICU. ICU // components will be created with its default locale. libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create( - SegmenterType type, std::string locale) { + SegmenterOptions options) { // Word connector rules for "en_US_POSIX" (American English (Computer)) are // different from other locales. E.g. "email.subject" will be split into 3 // terms in "en_US_POSIX": "email", ".", and "subject", while it's just one // term in other locales. 
Our current LanguageSegmenter doesn't handle this // special rule, so we replace it with "en_US". - if (locale == kLocaleAmericanEnglishComputer) { + if (options.locale == kLocaleAmericanEnglishComputer) { ICING_LOG(WARNING) << "Locale " << kLocaleAmericanEnglishComputer << " not supported. Converting to locale " << ULOC_US; - locale = ULOC_US; - } - switch (type) { - case ICU4C: - return std::make_unique<IcuLanguageSegmenter>(std::move(locale)); - case SPACE: - return std::make_unique<SpaceLanguageSegmenter>(); + options.locale = ULOC_US; } + return std::make_unique<IcuLanguageSegmenter>(std::move(options.locale)); } } // namespace language_segmenter_factory diff --git a/icing/tokenization/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc index 8d6aa76..d43a78d 100644 --- a/icing/tokenization/icu-language-segmenter.cc +++ b/icing/tokenization/icu/icu-language-segmenter.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/tokenization/icu-language-segmenter.h" +#include "icing/tokenization/icu/icu-language-segmenter.h" #include <cstdint> #include <memory> @@ -24,7 +24,8 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" -#include "icing/util/icu-i18n-utils.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/util/i18n-utils.h" #include "icing/util/status-macros.h" #include "unicode/ubrk.h" #include "unicode/uchar.h" @@ -61,7 +62,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { } // Advances to the next term. Returns false if it has reached the end. 
- bool Advance() { + bool Advance() override { // Prerequisite check if (term_end_index_exclusive_ == UBRK_DONE) { return false; @@ -77,52 +78,66 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Reached the end if (term_end_index_exclusive_ == UBRK_DONE) { + MarkAsDone(); return false; } - // Rule 1: all ASCII terms will be returned. - // We know it's a ASCII term by checking the first char. - if (icu_i18n_utils::IsAscii(text_[term_start_index_])) { - return true; - } - - UChar32 uchar32 = icu_i18n_utils::GetUChar32At(text_.data(), text_.length(), - term_start_index_); - // Rule 2: for non-ASCII terms, only the alphabetic terms are returned. - // We know it's an alphabetic term by checking the first unicode character. - if (u_isUAlphabetic(uchar32)) { - return true; - } else { + if (!IsValidSegment()) { return Advance(); } + return true; } // Returns the current term. It can be called only when Advance() returns // true. - std::string_view GetTerm() const { - if (text_[term_start_index_] == kASCIISpace) { + std::string_view GetTerm() const override { + int term_length = term_end_index_exclusive_ - term_start_index_; + if (term_end_index_exclusive_ == UBRK_DONE) { + term_length = 0; + } else if (text_[term_start_index_] == kASCIISpace) { // Rule 3: multiple continuous whitespaces are treated as one. - return std::string_view(&text_[term_start_index_], 1); + term_length = 1; } - return text_.substr(term_start_index_, - term_end_index_exclusive_ - term_start_index_); + return text_.substr(term_start_index_, term_length); } libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter( - int32_t offset) { + int32_t offset) override { + if (offset < 0 || offset >= text_.length()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Illegal offset provided! 
Offset %d is not within bounds of string " + "of length %zu", + offset, text_.length())); + } term_start_index_ = ubrk_following(break_iterator_, offset); if (term_start_index_ == UBRK_DONE) { - return absl_ports::NotFoundError(""); + MarkAsDone(); + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "No segments begin after provided offset %d.", offset)); } term_end_index_exclusive_ = ubrk_next(break_iterator_); if (term_end_index_exclusive_ == UBRK_DONE) { - return absl_ports::NotFoundError(""); + MarkAsDone(); + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "No segments begin after provided offset %d.", offset)); + } + if (!IsValidSegment()) { + if (!Advance()) { + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "No segments begin after provided offset %d.", offset)); + } } return term_start_index_; } libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore( - int32_t offset) { + int32_t offset) override { + if (offset < 0 || offset >= text_.length()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Illegal offset provided! Offset %d is not within bounds of string " + "of length %zu", + offset, text_.length())); + } ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(offset)); if (term_end_index_exclusive_ > offset) { // This term ends after offset. 
So we need to get the term just before @@ -132,6 +147,15 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { return term_start_index_; } + libtextclassifier3::StatusOr<int32_t> ResetToStart() override { + term_start_index_ = 0; + term_end_index_exclusive_ = 0; + if (!Advance()) { + return absl_ports::NotFoundError(""); + } + return term_start_index_; + } + private: explicit IcuLanguageSegmenterIterator(std::string_view text, std::string_view locale) @@ -155,15 +179,43 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { libtextclassifier3::Status ResetToTermStartingBefore(int32_t offset) { term_start_index_ = ubrk_preceding(break_iterator_, offset); if (term_start_index_ == UBRK_DONE) { + MarkAsDone(); return absl_ports::NotFoundError(""); } term_end_index_exclusive_ = ubrk_next(break_iterator_); if (term_end_index_exclusive_ == UBRK_DONE) { + MarkAsDone(); return absl_ports::NotFoundError(""); } return libtextclassifier3::Status::OK; } + // Ensures that all members are consistent with the 'Done' state. + // In the 'Done' state, term_start_index_ will point to the first character + // and term_end_index_exclusive_ will be marked with the kDone value. + // break_iterator_ may be in any state. + void MarkAsDone() { + term_end_index_exclusive_ = UBRK_DONE; + term_start_index_ = 0; + } + + bool IsValidSegment() const { + // Rule 1: all ASCII terms will be returned. + // We know it's a ASCII term by checking the first char. + if (i18n_utils::IsAscii(text_[term_start_index_])) { + return true; + } + + UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(), + term_start_index_); + // Rule 2: for non-ASCII terms, only the alphabetic terms are returned. + // We know it's an alphabetic term by checking the first unicode character. + if (u_isUAlphabetic(uchar32)) { + return true; + } + return false; + } + // The underlying class that does the segmentation, ubrk_close() must be // called after using. 
UBreakIterator* break_iterator_; diff --git a/icing/tokenization/icu-language-segmenter.h b/icing/tokenization/icu/icu-language-segmenter.h index b3d1acc..4115461 100644 --- a/icing/tokenization/icu-language-segmenter.h +++ b/icing/tokenization/icu/icu-language-segmenter.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_TOKENIZATION_ICU_LANGUAGE_SEGMENTER_H_ -#define ICING_TOKENIZATION_ICU_LANGUAGE_SEGMENTER_H_ +#ifndef ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_ +#define ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_ #include <cstdint> #include <memory> @@ -76,4 +76,4 @@ class IcuLanguageSegmenter : public LanguageSegmenter { } // namespace lib } // namespace icing -#endif // ICING_TOKENIZATION_ICU_LANGUAGE_SEGMENTER_H_ +#endif // ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_ diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc new file mode 100644 index 0000000..31c2726 --- /dev/null +++ b/icing/tokenization/icu/icu-language-segmenter_test.cc @@ -0,0 +1,1016 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/helpers/icu/icu-data-file-helper.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/icu-i18n-test-utils.h" +#include "icing/testing/test-data.h" +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/language-segmenter.h" +#include "unicode/uloc.h" + +namespace icing { +namespace lib { +namespace { +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::IsEmpty; + +// Returns a vector containing all terms retrieved by Advancing on the iterator. +std::vector<std::string_view> GetAllTermsAdvance( + LanguageSegmenter::Iterator* itr) { + std::vector<std::string_view> terms; + while (itr->Advance()) { + terms.push_back(itr->GetTerm()); + } + return terms; +} + +// Returns a vector containing all terms retrieved by calling +// ResetToStart/ResetAfter with the current position to simulate Advancing on +// the iterator. +std::vector<std::string_view> GetAllTermsResetAfter( + LanguageSegmenter::Iterator* itr) { + std::vector<std::string_view> terms; + if (!itr->ResetToStart().ok()) { + return terms; + } + terms.push_back(itr->GetTerm()); + const char* text_begin = itr->GetTerm().data(); + // Calling ResetToTermStartingAfter with the current position should get the + // very next term in the sequence. + for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok(); + current_pos = itr->GetTerm().data() - text_begin) { + terms.push_back(itr->GetTerm()); + } + return terms; +} + +// Returns a vector containing all terms retrieved by alternating calls to +// Advance and calls to ResetAfter with the current position to simulate +// Advancing. 
+std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter( + LanguageSegmenter::Iterator* itr) { + const char* text_begin = itr->GetTerm().data(); + std::vector<std::string_view> terms; + + bool is_ok = true; + int current_pos = 0; + while (is_ok) { + // Alternate between using Advance and ResetToTermAfter. + if (terms.size() % 2 == 0) { + is_ok = itr->Advance(); + } else { + // Calling ResetToTermStartingAfter with the current position should get + // the very next term in the sequence. + current_pos = itr->GetTerm().data() - text_begin; + is_ok = itr->ResetToTermStartingAfter(current_pos).ok(); + } + if (is_ok) { + terms.push_back(itr->GetTerm()); + } + } + return terms; +} + +// Returns a vector containing all terms retrieved by calling ResetBefore with +// the current position, starting at the end of the text. This vector should be +// in reverse order of GetAllTerms and missing the last term. +std::vector<std::string_view> GetAllTermsResetBefore( + LanguageSegmenter::Iterator* itr) { + const char* text_begin = itr->GetTerm().data(); + int last_pos = 0; + while (itr->Advance()) { + last_pos = itr->GetTerm().data() - text_begin; + } + std::vector<std::string_view> terms; + // Calling ResetToTermEndingBefore with the current position should get the + // previous term in the sequence. + for (int current_pos = last_pos; + itr->ResetToTermEndingBefore(current_pos).ok(); + current_pos = itr->GetTerm().data() - text_begin) { + terms.push_back(itr->GetTerm()); + } + return terms; +} + +class IcuLanguageSegmenterAllLocalesTest + : public testing::TestWithParam<const char*> { + protected: + void SetUp() override { + ICING_ASSERT_OK( + // File generated via icu_data_file rule in //icing/BUILD. 
+ icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + } + + static std::string GetLocale() { return GetParam(); } + static language_segmenter_factory::SegmenterOptions GetOptions() { + return language_segmenter_factory::SegmenterOptions(GetLocale()); + } +}; + +TEST_P(IcuLanguageSegmenterAllLocalesTest, EmptyText) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, SimpleText) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"), + IsOkAndHolds(ElementsAre("Hello", " ", "World"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_Punctuation) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // ASCII punctuation marks are kept + EXPECT_THAT( + language_segmenter->GetAllTerms("Hello, World!!!"), + IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!"))); + EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"), + IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project"))); + EXPECT_THAT(language_segmenter->GetAllTerms("100%"), + IsOkAndHolds(ElementsAre("100", "%"))); + EXPECT_THAT(language_segmenter->GetAllTerms("A&B"), + IsOkAndHolds(ElementsAre("A", "&", "B"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // ASCII special characters are kept + EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"), + IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000"))); + EXPECT_THAT(language_segmenter->GetAllTerms("A+B"), + IsOkAndHolds(ElementsAre("A", "+", "B"))); + // 0x0009 is the unicode for tab (within ASCII range). 
+ std::string text_with_tab = absl_ports::StrCat( + "Hello", UCharToString(0x0009), UCharToString(0x0009), "World"); + EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab), + IsOkAndHolds(ElementsAre("Hello", UCharToString(0x0009), + UCharToString(0x0009), "World"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Full-width (non-ASCII) punctuation marks and special characters are left + // out. + EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"), + IsOkAndHolds(ElementsAre("Hello"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + EXPECT_THAT(language_segmenter->GetAllTerms("U.S. Bank"), + IsOkAndHolds(ElementsAre("U.S", ".", " ", "Bank"))); + EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."), + IsOkAndHolds(ElementsAre("I.B.M", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"), + IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M"))); + EXPECT_THAT(language_segmenter->GetAllTerms("I B M"), + IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // According to unicode word break rules + // WB6(https://unicode.org/reports/tr29/#WB6), + // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some + // punctuation characters are used as word connecters. That is, words don't + // break before and after them. Here we just test some that we care about. 
+ + // Word connecters + EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"), + IsOkAndHolds(ElementsAre("com.google.android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"), + IsOkAndHolds(ElementsAre("com:google:android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"), + IsOkAndHolds(ElementsAre("com'google'android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"), + IsOkAndHolds(ElementsAre("com_google_android"))); + + // Word connecters can be mixed + EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"), + IsOkAndHolds(ElementsAre("com.google.android:icing"))); + + // Any heading and trailing characters are not connecters + EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."), + IsOkAndHolds(ElementsAre(".", "com.google.android", "."))); + + // Not word connecters + EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"), + IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"), + IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"), + IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"), + IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"), + IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"), + IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"), + IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"), + IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android"))); + 
EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"), + IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"), + IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android"))); + EXPECT_THAT( + language_segmenter->GetAllTerms("com\"google\"android"), + IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."), + IsOkAndHolds(ElementsAre("It's", " ", "ok", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."), + IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."), + IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"), + IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone"))); + // 0x2019 is the single right quote, should be treated the same as "'" + std::string token_with_quote = + absl_ports::StrCat("He", UCharToString(0x2019), "ll"); + std::string text_with_quote = + absl_ports::StrCat(token_with_quote, " be back."); + EXPECT_THAT( + language_segmenter->GetAllTerms(text_with_quote), + IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", "."))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, Parentheses) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + + EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"), + IsOkAndHolds(ElementsAre("(", "Hello", ")"))); + + EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("), + IsOkAndHolds(ElementsAre(")", "Hello", "("))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, Quotes) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + 
language_segmenter_factory::Create(GetOptions())); + + EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""), + IsOkAndHolds(ElementsAre("\"", "Hello", "\""))); + + EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"), + IsOkAndHolds(ElementsAre("'", "Hello", "'"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, Alphanumeric) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + + // Alphanumeric terms are allowed + EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"), + IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + + // Alphanumeric terms are allowed + EXPECT_THAT( + language_segmenter->GetAllTerms("3.141592653589793238462643383279"), + IsOkAndHolds(ElementsAre("3.141592653589793238462643383279"))); + + EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"), + IsOkAndHolds(ElementsAre("3,456.789"))); + + EXPECT_THAT(language_segmenter->GetAllTerms("-123"), + IsOkAndHolds(ElementsAre("-", "123"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Multiple continuous whitespaces are treated as one. + const int kNumSeparators = 256; + std::string text_with_spaces = + absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World"); + EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces), + IsOkAndHolds(ElementsAre("Hello", " ", "World"))); + + // Multiple continuous whitespaces are treated as one. Whitespace at the + // beginning of the text doesn't affect the results of GetTerm() after the + // iterator is done. 
+ text_with_spaces = absl_ports::StrCat(std::string(kNumSeparators, ' '), + "Hello", " ", "World"); + ICING_ASSERT_OK_AND_ASSIGN(auto itr, + language_segmenter->Segment(text_with_spaces)); + std::vector<std::string_view> terms; + while (itr->Advance()) { + terms.push_back(itr->GetTerm()); + } + EXPECT_THAT(terms, ElementsAre(" ", "Hello", " ", "World")); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't + // have whitespaces as word delimiter. + + // Chinese + EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"), + IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班"))); + // Japanese + EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"), + IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩", + "い", "てい", "ます"))); + // Khmer + EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), + IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ"))); + // Thai + EXPECT_THAT( + language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"), + IsOkAndHolds(ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุก", "วัน"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, LatinLettersWithAccents) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"), + IsOkAndHolds(ElementsAre("āăąḃḅḇčćç"))); +} + +// TODO(samzheng): test cases for more languages (e.g. 
top 20 in the world) +TEST_P(IcuLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Turkish + EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"), + IsOkAndHolds(ElementsAre("merhaba", " ", "dünya"))); + // Korean + EXPECT_THAT( + language_segmenter->GetAllTerms("나는 매일 출근합니다."), + IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", "."))); +} + +// TODO(samzheng): more mixed languages test cases +TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguages) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"), + IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好", + "吗", "お", "元気", "です", "か"))); + + EXPECT_THAT( + language_segmenter->GetAllTerms("나는 California에 산다"), + IsOkAndHolds(ElementsAre("나는", " ", "California", "에", " ", "산다"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Validates that the input strings are not copied + const std::string text = "Hello World"; + const char* word1_address = text.c_str(); + const char* word2_address = text.c_str() + 6; + ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms, + language_segmenter->GetAllTerms(text)); + ASSERT_THAT(terms, ElementsAre("Hello", " ", "World")); + const char* word1_result_address = terms.at(0).data(); + const char* word2_result_address = terms.at(2).data(); + + // The underlying char* should be the same + EXPECT_THAT(word1_address, Eq(word1_result_address)); + EXPECT_THAT(word2_address, Eq(word2_result_address)); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + 
constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8))); + ASSERT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("you")); +} + +// Tests that ResetToTermAfter and Advance produce the same output. With the +// exception of the first term which is inacessible via ResetToTermAfter, +// the stream of terms produced by Advance calls should exacly match the +// terms produced by ResetToTermAfter calls with the current position +// provided as the argument. 
+TEST_P(IcuLanguageSegmenterAllLocalesTest, + MixedLanguagesResetToTermAfterEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetAfter(reset_to_term_itr.get()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + ThaiResetToTermAfterEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetAfter(reset_to_term_itr.get()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + KoreanResetToTermAfterEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kKorean = "나는 매일 출근합니다."; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> 
advance_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetAfter(reset_to_term_itr.get()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +// Tests that ResetToTermAfter and Advance can be used in conjunction. Just as +// ResetToTermAfter(current_position) can be used to simulate Advance, users +// should be able to mix ResetToTermAfter(current_position) calls and Advance +// calls to mimic calling Advance. +TEST_P(IcuLanguageSegmenterAllLocalesTest, + MixedLanguagesResetToTermAfterInteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_and_reset_terms = + GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + + EXPECT_THAT(advance_and_reset_terms, + testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + ThaiResetToTermAfterInteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN( + 
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_and_reset_terms = + GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + + EXPECT_THAT(advance_and_reset_terms, + testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + KoreanResetToTermAfterInteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kKorean = "나는 매일 출근합니다."; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_and_reset_terms = + GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + + EXPECT_THAT(advance_and_reset_terms, + testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment("How are you你好吗お元気ですか")); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3))); + 
EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11))); + EXPECT_THAT(itr->GetTerm(), Eq("你好")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8))); + EXPECT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35))); + EXPECT_THAT(itr->GetTerm(), Eq("か")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17))); + EXPECT_THAT(itr->GetTerm(), Eq("吗")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(35), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + ContinuousWhitespacesResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Multiple continuous whitespaces are treated as one. 
+ constexpr std::string_view kTextWithSpace = "Hello World"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kTextWithSpace)); + + // String: "Hello World" + // ^ ^ ^ + // Bytes: 0 5 15 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->GetTerm(), Eq("World")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->GetTerm(), Eq("World")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(15), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(17), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(19), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that + // don't have whitespaces as word delimiter. 
Chinese + constexpr std::string_view kChinese = "我每天走路去上班。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kChinese)); + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // Bytes: 0 3 9 15 18 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq("每天")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9))); + EXPECT_THAT(itr->GetTerm(), Eq("走路")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(19), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Japanese + constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kJapanese)); + // String: "私は毎日仕事に歩いています。" + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 6 12 18212427 33 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq("は")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(33), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12))); + EXPECT_THAT(itr->GetTerm(), Eq("仕事")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kKhmer)); + // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" + // ^ ^ ^ ^ + // Bytes: 0 9 24 45 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9))); + EXPECT_THAT(itr->GetTerm(), 
Eq("ដើរទៅ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(47), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24))); + EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Thai + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kThai)); + // String: "ฉันเดินไปทำงานทุกวัน" + // ^ ^ ^ ^ ^ ^ + // Bytes: 0 9 21 27 42 51 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9))); + EXPECT_THAT(itr->GetTerm(), Eq("เดิน")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(51), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21))); + EXPECT_THAT(itr->GetTerm(), Eq("ไป")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42))); + EXPECT_THAT(itr->GetTerm(), Eq("ทุก")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBounds) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4))); + ASSERT_THAT(itr->GetTerm(), Eq("are")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("are")); + + 
EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("are")); +} + +// Tests that ResetToTermBefore and Advance produce the same output. With the +// exception of the last term which is inacessible via ResetToTermBefore, +// the stream of terms produced by Advance calls should exacly match the +// terms produced by ResetToTermBefore calls with the current position +// provided as the argument (after their order has been reversed). +TEST_P(IcuLanguageSegmenterAllLocalesTest, + MixedLanguagesResetToTermBeforeEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + // Can't produce the last term via calls to ResetToTermBefore. So skip + // past that one. 
+ auto itr = advance_terms.begin(); + std::advance(itr, advance_terms.size() - 1); + advance_terms.erase(itr); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetBefore(reset_to_term_itr.get()); + std::reverse(reset_terms.begin(), reset_terms.end()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), IsEmpty()); + EXPECT_THAT(advance_itr->GetTerm(), IsEmpty()); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + ThaiResetToTermBeforeEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + // Can't produce the last term via calls to ResetToTermBefore. So skip + // past that one. 
+ auto itr = advance_terms.begin(); + std::advance(itr, advance_terms.size() - 1); + advance_terms.erase(itr); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetBefore(reset_to_term_itr.get()); + std::reverse(reset_terms.begin(), reset_terms.end()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + KoreanResetToTermBeforeEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kKorean = "나는 매일 출근합니다."; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + // Can't produce the last term via calls to ResetToTermBefore. So skip + // past that one. 
+ auto itr = advance_terms.begin(); + std::advance(itr, advance_terms.size() - 1); + advance_terms.erase(itr); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetBefore(reset_to_term_itr.get()); + std::reverse(reset_terms.begin(), reset_terms.end()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment("How are you你好吗お元気ですか")); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + EXPECT_THAT(itr->ResetToTermEndingBefore(2), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4))); + EXPECT_THAT(itr->GetTerm(), Eq("are")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23))); + EXPECT_THAT(itr->GetTerm(), Eq("元気")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8))); + EXPECT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29))); + EXPECT_THAT(itr->GetTerm(), Eq("です")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + ContinuousWhitespacesResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + 
language_segmenter_factory::Create(GetOptions())); + // Multiple continuous whitespaces are treated as one. + constexpr std::string_view kTextWithSpace = "Hello World"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kTextWithSpace)); + + // String: "Hello World" + // ^ ^ ^ + // Bytes: 0 5 15 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(2), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("Hello")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("Hello")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that + // don't have whitespaces as word delimiter. 
Chinese + constexpr std::string_view kChinese = "我每天走路去上班。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kChinese)); + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // Bytes: 0 3 9 15 18 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("我")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->GetTerm(), Eq("去")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Japanese + constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kJapanese)); + // String: "私は毎日仕事に歩いています。" + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 6 12 18212427 33 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27))); + EXPECT_THAT(itr->GetTerm(), Eq("てい")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq("は")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kKhmer)); + // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" + // ^ ^ ^ ^ + // Bytes: 0 9 24 45 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + 
EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24))); + EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("ញុំ")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Thai + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kThai)); + // String: "ฉันเดินไปทำงานทุกวัน" + // ^ ^ ^ ^ ^ ^ + // Bytes: 0 9 21 27 42 51 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42))); + EXPECT_THAT(itr->GetTerm(), Eq("ทุก")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("ฉัน")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21))); + EXPECT_THAT(itr->GetTerm(), Eq("ไป")); +} + +INSTANTIATE_TEST_SUITE_P( + LocaleName, IcuLanguageSegmenterAllLocalesTest, + testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH, + ULOC_FRANCE, ULOC_GERMANY, ULOC_ITALY, ULOC_JAPAN, + ULOC_KOREA, ULOC_SIMPLIFIED_CHINESE, + ULOC_TRADITIONAL_CHINESE, + "es_ES", // Spanish + "hi_IN", // Hindi + "th_TH", // Thai + "lo_LA", // Lao + "km_KH", // Khmer + "ar_DZ", // Arabic + "ru_RU", // Russian + "pt_PT", // Portuguese + "en_US_POSIX" // American English (Computer) + "wrong_locale" // Will fall back to ICU default locale + "" // Will fall back to ICU default locale + )); + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/tokenization/language-segmenter-factory.h b/icing/tokenization/language-segmenter-factory.h index 244bcd8..5a4047b 
100644 --- a/icing/tokenization/language-segmenter-factory.h +++ b/icing/tokenization/language-segmenter-factory.h @@ -18,19 +18,24 @@ #include <memory> #include <string_view> +#include "icing/jni/jni-cache.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/tokenization/language-segmenter.h" -#include "icing/util/icu-i18n-utils.h" +#include "icing/util/i18n-utils.h" +#include "unicode/uloc.h" namespace icing { namespace lib { namespace language_segmenter_factory { -enum SegmenterType { - ICU4C, // Uses the ICU library to segment text. - SPACE, // Segments only on whitespace. Currently not used in production; used - // to compile in Jetpack +struct SegmenterOptions { + explicit SegmenterOptions(std::string locale = ULOC_US, + const JniCache* jni_cache = nullptr) + : locale(std::move(locale)), jni_cache(jni_cache) {} + + std::string locale; + const JniCache* jni_cache; }; // Creates a language segmenter with the given locale. @@ -39,7 +44,7 @@ enum SegmenterType { // A LanguageSegmenter on success // INVALID_ARGUMENT if locale string is invalid libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create( - SegmenterType type, std::string locale = ULOC_US); + SegmenterOptions options = SegmenterOptions()); } // namespace language_segmenter_factory diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc index 6af44e1..c7b068d 100644 --- a/icing/tokenization/language-segmenter-iterator_test.cc +++ b/icing/tokenization/language-segmenter-iterator_test.cc @@ -15,7 +15,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -32,8 +32,7 @@ using ::testing::Eq; // don't need to stress test the 
implementation's definition of a term. These // test that it advances and traverses through simple terms consistently between // all the implementations. -class LanguageSegmenterIteratorTest - : public testing::TestWithParam<language_segmenter_factory::SegmenterType> { +class LanguageSegmenterIteratorTest : public testing::Test { protected: void SetUp() override { ICING_ASSERT_OK( @@ -41,15 +40,11 @@ class LanguageSegmenterIteratorTest icu_data_file_helper::SetUpICUDataFile( GetTestFilePath("icing/icu.dat"))); } - - static language_segmenter_factory::SegmenterType GetType() { - return GetParam(); - } }; -TEST_P(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) { +TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) { ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); @@ -65,10 +60,10 @@ TEST_P(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) { EXPECT_FALSE(iterator->Advance()); } -TEST_P(LanguageSegmenterIteratorTest, +TEST_F(LanguageSegmenterIteratorTest, ResetToTermStartingAfterWithOffsetInText) { ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); @@ -80,45 +75,48 @@ TEST_P(LanguageSegmenterIteratorTest, StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_P(LanguageSegmenterIteratorTest, - ResetToTermStartingAfterWithNegativeOffsetOk) { +TEST_F(LanguageSegmenterIteratorTest, + ResetToTermStartingAfterWithNegativeOffsetNotOk) { ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); 
EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-1), - IsOkAndHolds(0)); // The term "foo" + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-100), - IsOkAndHolds(0)); // The term "foo" + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + EXPECT_THAT(iterator->ResetToStart(), IsOkAndHolds(0)); + EXPECT_THAT(iterator->GetTerm(), Eq("foo")); } -TEST_P(LanguageSegmenterIteratorTest, - ResetToTermStartingAfterWithTextLengthOffsetNotFound) { +TEST_F(LanguageSegmenterIteratorTest, + ResetToTermStartingAfterWithTextLengthOffsetInvalidArgument) { std::string text = "foo bar"; ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/text.size()), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_P(LanguageSegmenterIteratorTest, - ResetToTermStartingAfterWithOffsetPastTextLengthNotFound) { +TEST_F(LanguageSegmenterIteratorTest, + ResetToTermStartingAfterWithOffsetPastTextLengthInvalidArgument) { std::string text = "foo bar"; ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/100), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_P(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) { +TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) { ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + 
language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); @@ -130,50 +128,46 @@ TEST_P(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_P(LanguageSegmenterIteratorTest, - ResetToTermEndingBeforeWithZeroOrNegativeOffsetNotFound) { +TEST_F(LanguageSegmenterIteratorTest, + ResetToTermEndingBeforeWithZeroNotFound) { ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); + // Zero is a valid argument, but there aren't any terms that end before it. EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/0), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(LanguageSegmenterIteratorTest, + ResetToTermEndingBeforeWithNegativeOffsetInvalidArgument) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); + ICING_ASSERT_OK_AND_ASSIGN(auto iterator, + language_segmenter->Segment("foo bar")); EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-1), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-100), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_P(LanguageSegmenterIteratorTest, - ResetToTermEndingBeforeWithTextLengthOffsetOk) { +TEST_F(LanguageSegmenterIteratorTest, + ResetToTermEndingBeforeWithOffsetPastTextEndInvalidArgument) { std::string text = "foo bar"; ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); 
EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length()), - IsOkAndHolds(4)); // The term "bar" -} - -TEST_P(LanguageSegmenterIteratorTest, - ResetToTermEndingBeforeWithOffsetPastTextLengthNotFound) { - std::string text = "foo bar"; - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); - ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length() + 1), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -INSTANTIATE_TEST_SUITE_P( - SegmenterType, LanguageSegmenterIteratorTest, - testing::Values(language_segmenter_factory::SegmenterType::ICU4C, - language_segmenter_factory::SegmenterType::SPACE)); - } // namespace } // namespace lib } // namespace icing diff --git a/icing/tokenization/language-segmenter.h b/icing/tokenization/language-segmenter.h index fde9ae2..fdb1846 100644 --- a/icing/tokenization/language-segmenter.h +++ b/icing/tokenization/language-segmenter.h @@ -64,16 +64,18 @@ class LanguageSegmenter { // iterator.ResetToTermStartingAfter(4); // iterator.GetTerm() // returns "baz"; // - // Passing in a negative offset will return the offset of the first term. - // - // Passing in an offset that is equal to or exceeds the underlying text - // length will return NOT_FOUND. + // Return types of OK and NOT_FOUND indicate that the function call was + // valid and the state of the iterator has changed. Return type of + // INVALID_ARGUMENT will leave the iterator unchanged. // // Returns: // On success, the starting position of the first term that starts after // offset. // NOT_FOUND if an error occurred or there are no terms that start after // offset. + // INVALID_ARGUMENT if offset is out of bounds for the provided text. 
+ // ABORTED if an invalid unicode character is encountered while + // traversing the text. virtual libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter( int32_t offset) = 0; @@ -85,21 +87,22 @@ class LanguageSegmenter { // iterator.ResetToTermEndingBefore(7); // iterator.GetTerm() // returns "bar"; // - // Passing in an offset equal to or less than 0 will return NOT_FOUND. - // - // Passing in an offset equal to the underlying text length will return the - // offset of the last term. - // - // Passing in an offset that is greater than the underlying text length will - // return NOT_FOUND. + // Return types of OK and NOT_FOUND indicate that the function call was + // valid and the state of the iterator has changed. Return type of + // INVALID_ARGUMENT will leave the iterator unchanged. // // Returns: // On success, the starting position of the first term that ends before // offset. // NOT_FOUND if an error occurred or there are no terms that ends before // offset. + // INVALID_ARGUMENT if offset is out of bounds for the provided text. + // ABORTED if an invalid unicode character is encountered while + // traversing the text. virtual libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore( int32_t offset) = 0; + + virtual libtextclassifier3::StatusOr<int32_t> ResetToStart() = 0; }; // Segments the input text into terms. 
diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc index 01cc938..49ddfca 100644 --- a/icing/tokenization/language-segmenter_benchmark.cc +++ b/icing/tokenization/language-segmenter_benchmark.cc @@ -14,7 +14,7 @@ #include "testing/base/public/benchmark.h" #include "gmock/gmock.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -60,8 +60,7 @@ void BM_SegmentNoSpace(benchmark::State& state) { } std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::string input_string(state.range(0), 'A'); @@ -97,8 +96,7 @@ void BM_SegmentWithSpaces(benchmark::State& state) { } std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::string input_string(state.range(0), 'A'); for (int i = 1; i < input_string.length(); i += 2) { @@ -137,8 +135,7 @@ void BM_SegmentCJK(benchmark::State& state) { } std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::string input_string; while (input_string.length() < state.range(0)) { diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc index 556a095..6e54af9 100644 --- a/icing/tokenization/plain-tokenizer.cc +++ b/icing/tokenization/plain-tokenizer.cc @@ -18,7 +18,7 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/tokenization/language-segmenter.h" -#include "icing/util/icu-i18n-utils.h" +#include 
"icing/util/i18n-utils.h" #include "icing/util/status-macros.h" namespace icing { @@ -39,8 +39,8 @@ bool IsValidTerm(std::string_view term) { } // Gets the first unicode character. We can know what the whole term is by // checking only the first character. - return !icu_i18n_utils::IsWhitespaceAt(term, /*position=*/0) && - !icu_i18n_utils::IsPunctuationAt(term, /*position=*/0); + return !i18n_utils::IsWhitespaceAt(term, /*position=*/0) && + !i18n_utils::IsPunctuationAt(term, /*position=*/0); } } // namespace @@ -96,6 +96,18 @@ class PlainTokenIterator : public Tokenizer::Iterator { return true; } + bool ResetToStart() override { + if (!base_iterator_->ResetToStart().ok()) { + return false; + } + current_term_ = base_iterator_->GetTerm(); + if (!IsValidTerm(current_term_)) { + // If the current value isn't valid, advance to the next valid value. + return Advance(); + } + return true; + } + private: std::unique_ptr<LanguageSegmenter::Iterator> base_iterator_; std::string_view current_term_; diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc index e7d6e29..f2fc678 100644 --- a/icing/tokenization/plain-tokenizer_test.cc +++ b/icing/tokenization/plain-tokenizer_test.cc @@ -18,7 +18,7 @@ #include "gmock/gmock.h" #include "icing/absl_ports/str_cat.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/icu-i18n-test-utils.h" #include "icing/testing/test-data.h" @@ -49,9 +49,8 @@ TEST_F(PlainTokenizerTest, CreationWithNullPointerShouldFail) { } TEST_F(PlainTokenizerTest, Simple) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( 
@@ -82,9 +81,8 @@ TEST_F(PlainTokenizerTest, Simple) { } TEST_F(PlainTokenizerTest, Whitespace) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -109,9 +107,8 @@ TEST_F(PlainTokenizerTest, Whitespace) { } TEST_F(PlainTokenizerTest, Punctuation) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -139,9 +136,8 @@ TEST_F(PlainTokenizerTest, Punctuation) { } TEST_F(PlainTokenizerTest, SpecialCharacters) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -161,9 +157,8 @@ TEST_F(PlainTokenizerTest, SpecialCharacters) { } TEST_F(PlainTokenizerTest, CJKT) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -214,9 +209,8 @@ TEST_F(PlainTokenizerTest, CJKT) { } TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - 
language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -232,9 +226,8 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) { } TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -250,9 +243,8 @@ TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) { } TEST_F(PlainTokenizerTest, ResetToTokenAfter) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -299,9 +291,8 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfter) { } TEST_F(PlainTokenizerTest, ResetToTokenBefore) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc index 6819f8d..8b2edc9 100644 --- a/icing/tokenization/raw-query-tokenizer.cc +++ b/icing/tokenization/raw-query-tokenizer.cc @@ -29,7 +29,7 @@ #include "icing/tokenization/language-segmenter.h" #include 
"icing/tokenization/token.h" #include "icing/tokenization/tokenizer.h" -#include "icing/util/icu-i18n-utils.h" +#include "icing/util/i18n-utils.h" #include "icing/util/status-macros.h" // This file provides rules that tell the tokenizer what to do when it sees a @@ -316,7 +316,7 @@ TermType GetTermType(std::string_view term) { return OR_OPERATOR; } // Checks the first char to see if it's an ASCII term - if (icu_i18n_utils::IsAscii(term[0])) { + if (i18n_utils::IsAscii(term[0])) { if (std::isalnum(term[0])) { return ALPHANUMERIC_TERM; } @@ -381,7 +381,7 @@ libtextclassifier3::Status OutputToken(State new_state, case ALPHANUMERIC_TERM: if (new_state == PROCESSING_PROPERTY_TERM) { // Asserts extra rule 1: property name must be in ASCII - if (!icu_i18n_utils::IsAscii(current_term[0])) { + if (!i18n_utils::IsAscii(current_term[0])) { return absl_ports::InvalidArgumentError( "Characters in property name must all be ASCII."); } diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc index dfcc09a..351f7c1 100644 --- a/icing/tokenization/raw-query-tokenizer_test.cc +++ b/icing/tokenization/raw-query-tokenizer_test.cc @@ -16,7 +16,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -46,9 +46,8 @@ TEST_F(RawQueryTokenizerTest, CreationWithNullPointerShouldFail) { } TEST_F(RawQueryTokenizerTest, Simple) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -60,9 +59,8 @@ 
TEST_F(RawQueryTokenizerTest, Simple) { } TEST_F(RawQueryTokenizerTest, Parentheses) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -161,9 +159,8 @@ TEST_F(RawQueryTokenizerTest, Parentheses) { } TEST_F(RawQueryTokenizerTest, Exclustion) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -229,9 +226,8 @@ TEST_F(RawQueryTokenizerTest, Exclustion) { } TEST_F(RawQueryTokenizerTest, PropertyRestriction) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -318,9 +314,8 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) { } TEST_F(RawQueryTokenizerTest, OR) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -440,9 +435,8 @@ TEST_F(RawQueryTokenizerTest, OR) { // CJKT are treated the 
same way by language segmenter and raw tokenizer, so // here we test Chinese and Japanese to represent CJKT. TEST_F(RawQueryTokenizerTest, CJKT) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -494,9 +488,8 @@ TEST_F(RawQueryTokenizerTest, CJKT) { // Raw tokenizer identifies all characters that it doesn't know as OTHER type, // so we can choose comma "," to represent all OTHER characters. TEST_F(RawQueryTokenizerTest, OtherChars) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -540,9 +533,8 @@ TEST_F(RawQueryTokenizerTest, OtherChars) { } TEST_F(RawQueryTokenizerTest, Mix) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc new file mode 100644 index 0000000..f79bc68 --- /dev/null +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc @@ -0,0 +1,62 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache 
License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/absl_ports/canonical_errors.h" +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h" +#include "icing/util/logging.h" + +namespace icing { +namespace lib { + +namespace language_segmenter_factory { + +namespace { +constexpr std::string_view kLocaleAmericanEnglishComputer = "en_US_POSIX"; +} // namespace + +// Creates a language segmenter with the given locale. +// +// Returns: +// A LanguageSegmenter on success +// INVALID_ARGUMENT if locale string is invalid +// +// TODO(samzheng): Figure out if we want to verify locale strings and notify +// users. Right now illegal locale strings will be ignored by ICU. ICU +// components will be created with its default locale. +libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create( + SegmenterOptions options) { + if (options.jni_cache == nullptr) { + return absl_ports::InvalidArgumentError( + "Cannot create Reverse Jni Language Segmenter without a valid JniCache " + "pointer"); + } + // Word connector rules for "en_US_POSIX" (American English (Computer)) are + // different from other locales. E.g. "email.subject" will be split into 3 + // terms in "en_US_POSIX": "email", ".", and "subject", while it's just one + // term in other locales. Our current LanguageSegmenter doesn't handle this + // special rule, so we replace it with "en_US". 
+ if (options.locale == kLocaleAmericanEnglishComputer) { + ICING_LOG(WARNING) << "Locale " << kLocaleAmericanEnglishComputer + << " not supported. Converting to locale " << ULOC_US; + options.locale = ULOC_US; + } + return std::make_unique<ReverseJniLanguageSegmenter>( + std::move(options.locale), options.jni_cache); +} + +} // namespace language_segmenter_factory + +} // namespace lib +} // namespace icing diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc new file mode 100644 index 0000000..8392363 --- /dev/null +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc @@ -0,0 +1,37 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <jni.h> + +#include "gtest/gtest.h" +#include "icing/testing/logging-event-listener.h" + +// Global variable used so that the test implementation can access the JNIEnv. 
+JNIEnv* g_jenv = nullptr; + +extern "C" JNIEXPORT jboolean JNICALL +Java_icing_tokenization_reverse_1jni_ReverseJniLanguageSegmenterTest_testsMain( + JNIEnv* env, jclass ignored) { + g_jenv = env; + + std::vector<char*> my_argv; + char arg[] = "reverse-jni-language-segmenter-test-lib"; + my_argv.push_back(arg); + int argc = 1; + char** argv = &(my_argv[0]); + testing::InitGoogleTest(&argc, argv); + testing::UnitTest::GetInstance()->listeners().Append( + new icing::lib::LoggingEventListener()); + return RUN_ALL_TESTS() == 0; +} diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc new file mode 100644 index 0000000..a01d944 --- /dev/null +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc @@ -0,0 +1,1085 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h" + +#include <memory> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "gmock/gmock.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/icu-i18n-test-utils.h" +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/language-segmenter.h" +#include "unicode/uloc.h" + +namespace icing { +namespace lib { + +namespace test_internal { + +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::IsEmpty; + +namespace { + +language_segmenter_factory::SegmenterOptions GetSegmenterOptions( + const std::string& locale, const JniCache* jni_cache) { + return language_segmenter_factory::SegmenterOptions(locale, jni_cache); +} + +// Returns a vector containing all terms retrieved by Advancing on the iterator. +std::vector<std::string_view> GetAllTermsAdvance( + LanguageSegmenter::Iterator* itr) { + std::vector<std::string_view> terms; + while (itr->Advance()) { + terms.push_back(itr->GetTerm()); + } + return terms; +} + +// Returns a vector containing all terms retrieved by calling ResetAfter with +// the current position to simulate Advancing on the iterator. +std::vector<std::string_view> GetAllTermsResetAfter( + LanguageSegmenter::Iterator* itr) { + std::vector<std::string_view> terms; + if (!itr->ResetToStart().ok()) { + return terms; + } + terms.push_back(itr->GetTerm()); + const char* text_begin = itr->GetTerm().data(); + // Calling ResetToTermStartingAfter with the current position should get the + // very next term in the sequence. 
+ for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok(); + current_pos = itr->GetTerm().data() - text_begin) { + terms.push_back(itr->GetTerm()); + } + return terms; +} + +// Returns a vector containing all terms retrieved by alternating calls to +// Advance and calls to ResetAfter with the current position to simulate +// Advancing. +std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter( + LanguageSegmenter::Iterator* itr) { + const char* text_begin = itr->GetTerm().data(); + std::vector<std::string_view> terms; + + bool is_ok = true; + int current_pos = 0; + while (is_ok) { + // Alternate between using Advance and ResetToTermAfter. + if (terms.size() % 2 == 0) { + is_ok = itr->Advance(); + } else { + // Calling ResetToTermStartingAfter with the current position should get + // the very next term in the sequence. + current_pos = itr->GetTerm().data() - text_begin; + is_ok = itr->ResetToTermStartingAfter(current_pos).ok(); + } + if (is_ok) { + terms.push_back(itr->GetTerm()); + } + } + return terms; +} + +// Returns a vector containing all terms retrieved by calling ResetBefore with +// the current position, starting at the end of the text. This vector should be +// in reverse order of GetAllTerms and missing the last term. +std::vector<std::string_view> GetAllTermsResetBefore( + LanguageSegmenter::Iterator* itr) { + const char* text_begin = itr->GetTerm().data(); + int last_pos = 0; + while (itr->Advance()) { + last_pos = itr->GetTerm().data() - text_begin; + } + std::vector<std::string_view> terms; + // Calling ResetToTermEndingBefore with the current position should get the + // previous term in the sequence. 
+ for (int current_pos = last_pos; + itr->ResetToTermEndingBefore(current_pos).ok(); + current_pos = itr->GetTerm().data() - text_begin) { + terms.push_back(itr->GetTerm()); + } + return terms; +} + +} // namespace + +TEST_P(ReverseJniLanguageSegmenterTest, EmptyText) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, SimpleText) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"), + IsOkAndHolds(ElementsAre("Hello", " ", "World"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ASCII_Punctuation) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // ASCII punctuation marks are kept + EXPECT_THAT( + language_segmenter->GetAllTerms("Hello, World!!!"), + IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!"))); + EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"), + IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project"))); + EXPECT_THAT(language_segmenter->GetAllTerms("100%"), + IsOkAndHolds(ElementsAre("100", "%"))); + EXPECT_THAT(language_segmenter->GetAllTerms("A&B"), + IsOkAndHolds(ElementsAre("A", "&", "B"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ASCII_SpecialCharacter) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // ASCII special characters are kept + EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"), + IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000"))); + EXPECT_THAT(language_segmenter->GetAllTerms("A+B"), + 
IsOkAndHolds(ElementsAre("A", "+", "B"))); + // 0x0009 is the unicode for tab (within ASCII range). + std::string text_with_tab = absl_ports::StrCat( + "Hello", UCharToString(0x0009), UCharToString(0x0009), "World"); + EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab), + IsOkAndHolds(ElementsAre("Hello", UCharToString(0x0009), + UCharToString(0x0009), "World"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, Non_ASCII_Non_Alphabetic) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Full-width (non-ASCII) punctuation marks and special characters are left + // out. + EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"), + IsOkAndHolds(ElementsAre("Hello"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, Acronym) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + EXPECT_THAT(language_segmenter->GetAllTerms("U.S.𡔖 Bank"), + IsOkAndHolds(ElementsAre("U.S", ".", "𡔖", " ", "Bank"))); + EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."), + IsOkAndHolds(ElementsAre("I.B.M", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"), + IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M"))); + EXPECT_THAT(language_segmenter->GetAllTerms("I B M"), + IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, WordConnector) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // According to unicode word break rules + // WB6(https://unicode.org/reports/tr29/#WB6), + // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some + // punctuation characters are used as word connecters. That is, words don't + // break before and after them. Here we just test some that we care about. 
+ + // Word connecters + EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"), + IsOkAndHolds(ElementsAre("com.google.android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"), + IsOkAndHolds(ElementsAre("com:google:android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"), + IsOkAndHolds(ElementsAre("com'google'android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"), + IsOkAndHolds(ElementsAre("com_google_android"))); + + // Word connecters can be mixed + EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"), + IsOkAndHolds(ElementsAre("com.google.android:icing"))); + + // Any heading and trailing characters are not connecters + EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."), + IsOkAndHolds(ElementsAre(".", "com.google.android", "."))); + + // Not word connecters + EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"), + IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"), + IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"), + IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"), + IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"), + IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"), + IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"), + IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"), + IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android"))); + 
EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"), + IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"), + IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android"))); + EXPECT_THAT( + language_segmenter->GetAllTerms("com\"google\"android"), + IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, Apostrophes) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."), + IsOkAndHolds(ElementsAre("It's", " ", "ok", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."), + IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."), + IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"), + IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone"))); + // 0x2019 is the single right quote, should be treated the same as "'" + std::string token_with_quote = + absl_ports::StrCat("He", UCharToString(0x2019), "ll"); + std::string text_with_quote = + absl_ports::StrCat(token_with_quote, " be back."); + EXPECT_THAT( + language_segmenter->GetAllTerms(text_with_quote), + IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", "."))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, Parentheses) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + + EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"), + IsOkAndHolds(ElementsAre("(", "Hello", ")"))); + + EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("), + IsOkAndHolds(ElementsAre(")", "Hello", "("))); +} + 
+TEST_P(ReverseJniLanguageSegmenterTest, Quotes) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + + EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""), + IsOkAndHolds(ElementsAre("\"", "Hello", "\""))); + + EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"), + IsOkAndHolds(ElementsAre("'", "Hello", "'"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, Alphanumeric) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + + // Alphanumeric terms are allowed + EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"), + IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, Number) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + + // Alphanumeric terms are allowed + EXPECT_THAT( + language_segmenter->GetAllTerms("3.141592653589793238462643383279"), + IsOkAndHolds(ElementsAre("3.141592653589793238462643383279"))); + + EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"), + IsOkAndHolds(ElementsAre("3,456.789"))); + + EXPECT_THAT(language_segmenter->GetAllTerms("-123"), + IsOkAndHolds(ElementsAre("-", "123"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespaces) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Multiple continuous whitespaces are treated as one. + const int kNumSeparators = 256; + std::string text_with_spaces = + absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World"); + EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces), + IsOkAndHolds(ElementsAre("Hello", " ", "World"))); + + // Multiple continuous whitespaces are treated as one. 
Whitespace at the + // beginning of the text doesn't affect the results of GetTerm() after the + // iterator is done. + text_with_spaces = absl_ports::StrCat(std::string(kNumSeparators, ' '), + "Hello", " ", "World"); + ICING_ASSERT_OK_AND_ASSIGN(auto itr, + language_segmenter->Segment(text_with_spaces)); + std::vector<std::string_view> terms; + while (itr->Advance()) { + terms.push_back(itr->GetTerm()); + } + EXPECT_THAT(terms, ElementsAre(" ", "Hello", " ", "World")); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(ReverseJniLanguageSegmenterTest, CJKT) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't + // have whitespaces as word delimiter. + + // Chinese + EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"), + IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班"))); + // Japanese + EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"), + IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩", + "い", "てい", "ます"))); + // Khmer + EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), + IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ"))); + // Thai + EXPECT_THAT( + language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"), + IsOkAndHolds(ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุก", "วัน"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, LatinLettersWithAccents) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"), + IsOkAndHolds(ElementsAre("āăąḃḅḇčćç"))); +} + +// TODO(samzheng): test cases for more languages (e.g. 
top 20 in the world) +TEST_P(ReverseJniLanguageSegmenterTest, WhitespaceSplitLanguages) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Turkish + EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"), + IsOkAndHolds(ElementsAre("merhaba", " ", "dünya"))); + // Korean + EXPECT_THAT( + language_segmenter->GetAllTerms("나는 매일 출근합니다."), + IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", "."))); +} + +// TODO(samzheng): more mixed languages test cases +TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguages) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"), + IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好", + "吗", "お", "元気", "です", "か"))); + + EXPECT_THAT( + language_segmenter->GetAllTerms("나는 California에 산다"), + IsOkAndHolds(ElementsAre("나는", " ", "California", "에", " ", "산다"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, NotCopyStrings) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Validates that the input strings are not copied + const std::string text = "Hello World"; + const char* word1_address = text.c_str(); + const char* word2_address = text.c_str() + 6; + ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms, + language_segmenter->GetAllTerms(text)); + ASSERT_THAT(terms, ElementsAre("Hello", " ", "World")); + const char* word1_result_address = terms.at(0).data(); + const char* word2_result_address = terms.at(2).data(); + + // The underlying char* should be the same + EXPECT_THAT(word1_address, Eq(word1_result_address)); + EXPECT_THAT(word2_address, Eq(word2_result_address)); +} + +TEST_P(ReverseJniLanguageSegmenterTest, 
ResetToTermAfterOutOfBounds) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8))); + ASSERT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("you")); +} + +// Tests that ResetToTermAfter and Advance produce the same output. With the +// exception of the first term which is inacessible via ResetToTermAfter, +// the stream of terms produced by Advance calls should exacly match the +// terms produced by ResetToTermAfter calls with the current position +// provided as the argument. 
+TEST_P(ReverseJniLanguageSegmenterTest, + MixedLanguagesResetToTermAfterEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetAfter(reset_to_term_itr.get()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, + ThaiResetToTermAfterEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetAfter(reset_to_term_itr.get()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, + KoreanResetToTermAfterEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr 
std::string_view kKorean = "나는 매일 출근합니다."; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetAfter(reset_to_term_itr.get()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +// Tests that ResetToTermAfter and Advance can be used in conjunction. Just as +// ResetToTermAfter(current_position) can be used to simulate Advance, users +// should be able to mix ResetToTermAfter(current_position) calls and Advance +// calls to mimic calling Advance. +TEST_P(ReverseJniLanguageSegmenterTest, + MixedLanguagesResetToTermAfterInteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_and_reset_terms = + GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + + EXPECT_THAT(advance_and_reset_terms, + testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, + ThaiResetToTermAfterInteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, 
language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_and_reset_terms = + GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + + EXPECT_THAT(advance_and_reset_terms, + testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, + KoreanResetToTermAfterInteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kKorean = "나는 매일 출근합니다."; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_and_reset_terms = + GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + + EXPECT_THAT(advance_and_reset_terms, + testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + ICING_ASSERT_OK_AND_ASSIGN( + 
std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment("How are you你好吗お元気ですか")); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11))); + EXPECT_THAT(itr->GetTerm(), Eq("你好")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8))); + EXPECT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35))); + EXPECT_THAT(itr->GetTerm(), Eq("か")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17))); + EXPECT_THAT(itr->GetTerm(), Eq("吗")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(35), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespacesResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Multiple continuous whitespaces are treated as one. 
+ constexpr std::string_view kTextWithSpace = "Hello World"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kTextWithSpace)); + + // String: "Hello World" + // ^ ^ ^ + // Bytes: 0 5 15 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->GetTerm(), Eq("World")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->GetTerm(), Eq("World")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(15), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(17), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(19), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that + // don't have whitespaces as word delimiter. 
Chinese + constexpr std::string_view kChinese = "我每天走路去上班。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kChinese)); + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // Bytes: 0 3 9 15 18 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq("每天")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9))); + EXPECT_THAT(itr->GetTerm(), Eq("走路")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(19), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Japanese + constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kJapanese)); + // String: "私は毎日仕事に歩いています。" + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 6 12 18212427 33 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq("は")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(33), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12))); + EXPECT_THAT(itr->GetTerm(), Eq("仕事")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kKhmer)); + // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" + // ^ ^ ^ ^ + // Bytes: 0 9 24 45 + 
EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9))); + EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(47), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24))); + EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Thai + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kThai)); + // String: "ฉันเดินไปทำงานทุกวัน" + // ^ ^ ^ ^ ^ ^ + // Bytes: 0 9 21 27 42 51 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9))); + EXPECT_THAT(itr->GetTerm(), Eq("เดิน")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(51), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21))); + EXPECT_THAT(itr->GetTerm(), Eq("ไป")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42))); + EXPECT_THAT(itr->GetTerm(), Eq("ทุก")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBounds) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4))); + ASSERT_THAT(itr->GetTerm(), Eq("are")); + + 
EXPECT_THAT(itr->ResetToTermEndingBefore(-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("are")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("are")); +} + +// Tests that ResetToTermBefore and Advance produce the same output. With the +// exception of the last term which is inacessible via ResetToTermBefore, +// the stream of terms produced by Advance calls should exacly match the +// terms produced by ResetToTermBefore calls with the current position +// provided as the argument (after their order has been reversed). +TEST_P(ReverseJniLanguageSegmenterTest, + MixedLanguagesResetToTermBeforeEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + // Can't produce the last term via calls to ResetToTermBefore. So skip + // past that one. 
+ auto itr = advance_terms.begin(); + std::advance(itr, advance_terms.size() - 1); + advance_terms.erase(itr); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetBefore(reset_to_term_itr.get()); + std::reverse(reset_terms.begin(), reset_terms.end()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), IsEmpty()); + EXPECT_THAT(advance_itr->GetTerm(), IsEmpty()); +} + +TEST_P(ReverseJniLanguageSegmenterTest, + ThaiResetToTermBeforeEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + // Can't produce the last term via calls to ResetToTermBefore. So skip + // past that one. 
+ auto itr = advance_terms.begin(); + std::advance(itr, advance_terms.size() - 1); + advance_terms.erase(itr); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetBefore(reset_to_term_itr.get()); + std::reverse(reset_terms.begin(), reset_terms.end()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, + KoreanResetToTermBeforeEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kKorean = "나는 매일 출근합니다."; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + // Can't produce the last term via calls to ResetToTermBefore. So skip + // past that one. 
+ auto itr = advance_terms.begin(); + std::advance(itr, advance_terms.size() - 1); + advance_terms.erase(itr); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetBefore(reset_to_term_itr.get()); + std::reverse(reset_terms.begin(), reset_terms.end()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment("How are you你好吗お元気ですか")); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + EXPECT_THAT(itr->ResetToTermEndingBefore(2), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4))); + EXPECT_THAT(itr->GetTerm(), Eq("are")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23))); + EXPECT_THAT(itr->GetTerm(), Eq("元気")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8))); + EXPECT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29))); + EXPECT_THAT(itr->GetTerm(), Eq("です")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, + ContinuousWhitespacesResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN( + auto 
language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Multiple continuous whitespaces are treated as one. + constexpr std::string_view kTextWithSpace = "Hello World"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kTextWithSpace)); + + // String: "Hello World" + // ^ ^ ^ + // Bytes: 0 5 15 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(2), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("Hello")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("Hello")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that + // don't have whitespaces as word delimiter. 
Chinese + constexpr std::string_view kChinese = "我每天走路去上班。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kChinese)); + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // Bytes: 0 3 9 15 18 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("我")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->GetTerm(), Eq("去")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Japanese + constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kJapanese)); + // String: "私は毎日仕事に歩いています。" + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 6 12 18212427 33 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27))); + EXPECT_THAT(itr->GetTerm(), Eq("てい")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq("は")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kKhmer)); + // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" + // ^ ^ ^ ^ + // Bytes: 0 9 24 45 + 
EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24))); + EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("ញុំ")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Thai + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kThai)); + // String: "ฉันเดินไปทำงานทุกวัน" + // ^ ^ ^ ^ ^ ^ + // Bytes: 0 9 21 27 42 51 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42))); + EXPECT_THAT(itr->GetTerm(), Eq("ทุก")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("ฉัน")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21))); + EXPECT_THAT(itr->GetTerm(), Eq("ไป")); +} + +INSTANTIATE_TEST_SUITE_P( + LocaleName, ReverseJniLanguageSegmenterTest, + testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH, + ULOC_FRANCE, ULOC_GERMANY, ULOC_ITALY, ULOC_JAPAN, + ULOC_KOREA, ULOC_SIMPLIFIED_CHINESE, + ULOC_TRADITIONAL_CHINESE, + "es_ES", // Spanish + "hi_IN", // Hindi + "th_TH", // Thai + "lo_LA", // Lao + "km_KH", // Khmer + "ar_DZ", // Arabic + "ru_RU", // Russian + "pt_PT", // Portuguese + "en_US_POSIX" // American English (Computer) + "wrong_locale" // Will fall back to ICU default locale + "" // Will fall back to ICU default locale + )); + +} // namespace test_internal + +} // namespace lib 
+} // namespace icing diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h new file mode 100644 index 0000000..64b68ec --- /dev/null +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h @@ -0,0 +1,46 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_ +#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_ + +#include <jni.h> + +#include "icing/jni/jni-cache.h" +#include "gtest/gtest.h" + +extern JNIEnv* g_jenv; + +namespace icing { +namespace lib { + +namespace test_internal { + +class ReverseJniLanguageSegmenterTest + : public testing::TestWithParam<const char*> { + protected: + ReverseJniLanguageSegmenterTest() + : jni_cache_(std::move(JniCache::Create(g_jenv)).ValueOrDie()) {} + + static std::string GetLocale() { return GetParam(); } + + std::unique_ptr<JniCache> jni_cache_; +}; + +} // namespace test_internal + +} // namespace lib +} // namespace icing + +#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_ diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc new file mode 100644 index 0000000..2256022 --- /dev/null +++ 
b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc @@ -0,0 +1,452 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h" + +#include <cctype> +#include <memory> +#include <string> +#include <string_view> + +#include "icing/jni/reverse-jni-break-iterator.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/tokenization/language-segmenter.h" +#include "icing/util/i18n-utils.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +// Returns the lead byte of the UTF-8 character that includes the byte at +// current_byte_index within it. +int GetUTF8StartPosition(std::string_view text, int current_byte_index) { + while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) { + --current_byte_index; + } + return current_byte_index; +} + +class CharacterIterator { + public: + explicit CharacterIterator(std::string_view text) + : CharacterIterator(text, 0, 0) {} + CharacterIterator(std::string_view text, int utf8_index, int utf16_index) + : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {} + + // Moves from current position to the character that includes the specified + // UTF-8 index. 
+ // REQUIRES: desired_utf8_index <= text_.length() + // desired_utf8_index is allowed to point one index past the end, but no + // further. + bool AdvanceToUtf8(int desired_utf8_index) { + if (desired_utf8_index > text_.length()) { + // Enforce the requirement. + return false; + } + // Need to work forwards. + while (utf8_index_ < desired_utf8_index) { + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. + return false; + } + int utf8_length = i18n_utils::GetUtf8Length(uchar32); + if (utf8_index_ + utf8_length > desired_utf8_index) { + // Ah! Don't go too far! + break; + } + utf8_index_ += utf8_length; + utf16_index_ += i18n_utils::GetUtf16Length(uchar32); + } + return true; + } + + // Moves from current position to the character that includes the specified + // UTF-8 index. + // REQUIRES: 0 <= desired_utf8_index + bool RewindToUtf8(int desired_utf8_index) { + if (desired_utf8_index < 0) { + // Enforce the requirement. + return false; + } + // Need to work backwards. + while (utf8_index_ > desired_utf8_index) { + --utf8_index_; + utf8_index_ = GetUTF8StartPosition(text_, utf8_index_); + if (utf8_index_ < 0) { + // Somehow, there wasn't a single UTF-8 lead byte at + // requested_byte_index or an earlier byte. + return false; + } + // We've found the start of a unicode char! + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. + return false; + } + utf16_index_ -= i18n_utils::GetUtf16Length(uchar32); + } + return true; + } + + // Advances current position to desired_utf16_index. + // REQUIRES: desired_utf16_index <= text_.utf16_length() + // desired_utf16_index is allowed to point one index past the end, but no + // further. 
+ bool AdvanceToUtf16(int desired_utf16_index) { + while (utf16_index_ < desired_utf16_index) { + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. + return false; + } + int utf16_length = i18n_utils::GetUtf16Length(uchar32); + if (utf16_index_ + utf16_length > desired_utf16_index) { + // Ah! Don't go too far! + break; + } + int utf8_length = i18n_utils::GetUtf8Length(uchar32); + if (utf8_index_ + utf8_length > text_.length()) { + // Enforce the requirement. + return false; + } + utf8_index_ += utf8_length; + utf16_index_ += utf16_length; + } + return true; + } + + // Rewinds current position to desired_utf16_index. + // REQUIRES: 0 <= desired_utf16_index + bool RewindToUtf16(int desired_utf16_index) { + if (desired_utf16_index < 0) { + return false; + } + while (utf16_index_ > desired_utf16_index) { + --utf8_index_; + utf8_index_ = GetUTF8StartPosition(text_, utf8_index_); + // We've found the start of a unicode char! + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. + return false; + } + utf16_index_ -= i18n_utils::GetUtf16Length(uchar32); + } + return true; + } + + bool IsValidCharacter() const { + // Rule 1: all ASCII terms will be returned. + // We know it's a ASCII term by checking the first char. + if (i18n_utils::IsAscii(text_[utf8_index_])) { + return true; + } + + // Rule 2: for non-ASCII terms, only the alphabetic terms are returned. + // We know it's an alphabetic term by checking the first unicode character. 
+ if (i18n_utils::IsAlphabeticAt(text_, utf8_index_)) { + return true; + } + + return false; + } + + int utf8_index() const { return utf8_index_; } + int utf16_index() const { return utf16_index_; } + + private: + std::string_view text_; + int utf8_index_; + int utf16_index_; +}; + +} // namespace + +class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { + public: + explicit ReverseJniLanguageSegmenterIterator( + std::string_view text, + std::unique_ptr<ReverseJniBreakIterator> break_iterator) + : break_iterator_(std::move(break_iterator)), + text_(text), + term_start_(text), + term_end_exclusive_(text) {} + + // Advances to the next term. Returns false if it has reached the end. + bool Advance() override { + // Prerequisite check + if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) { + return false; + } + + if (term_end_exclusive_.utf16_index() == 0) { + int first = break_iterator_->First(); + if (!term_start_.AdvanceToUtf16(first)) { + // First is guaranteed to succeed and return a position within bonds. So + // the only possible failure could be an invalid sequence. Mark as DONE + // and return. + MarkAsDone(); + return false; + } + } else { + term_start_ = term_end_exclusive_; + } + + int next_utf16_index_exclusive = break_iterator_->Next(); + // Reached the end + if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) { + MarkAsDone(); + return false; + } + if (!term_end_exclusive_.AdvanceToUtf16(next_utf16_index_exclusive)) { + // next_utf16_index_exclusive is guaranteed to be within bonds thanks to + // the check for kDone above. So the only possible failure could be an + // invalid sequence. Mark as DONE and return. + MarkAsDone(); + return false; + } + + // Check if the current term is valid. We consider any term valid if its + // first character is valid. If it's not valid, then we need to advance to + // the next term. 
+ if (term_start_.IsValidCharacter()) { + return true; + } + return Advance(); + } + + // Returns the current term. It can be called only when Advance() returns + // true. + std::string_view GetTerm() const override { + int term_length = + term_end_exclusive_.utf8_index() - term_start_.utf8_index(); + if (term_length > 0 && std::isspace(text_[term_start_.utf8_index()])) { + // Rule 3: multiple continuous whitespaces are treated as one. + term_length = 1; + } + return text_.substr(term_start_.utf8_index(), term_length); + } + + // Resets the iterator to point to the first term that starts after offset. + // GetTerm will now return that term. + // + // Returns: + // On success, the starting position of the first term that starts after + // offset. + // NOT_FOUND if an error occurred or there are no terms that start after + // offset. + // INVALID_ARGUMENT if offset is out of bounds for the provided text. + // ABORTED if an invalid unicode character is encountered while + // traversing the text. + libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter( + int32_t offset) override { + if (offset < 0 || offset >= text_.length()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Illegal offset provided! Offset %d is not within bounds of string " + "of length %zu", + offset, text_.length())); + } + if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) { + // We're done. Need to start from the beginning if we're going to reset + // properly. + term_start_ = CharacterIterator(text_); + term_end_exclusive_ = CharacterIterator(text_); + } + + // 1. Find the unicode character that contains the byte at offset. + CharacterIterator offset_iterator = term_end_exclusive_; + bool success = (offset > offset_iterator.utf8_index()) + ? offset_iterator.AdvanceToUtf8(offset) + : offset_iterator.RewindToUtf8(offset); + if (!success) { + // Offset is guaranteed to be within bounds thanks to the check above. 
So + // the only possible failure could be an invalid sequence. Mark as DONE + // and return. + MarkAsDone(); + return absl_ports::AbortedError("Encountered invalid UTF sequence!"); + } + + // 2. We've got the unicode character containing byte offset. Now, we need + // to point to the segment that starts after this character. + int following_utf16_index = + break_iterator_->Following(offset_iterator.utf16_index()); + if (following_utf16_index == ReverseJniBreakIterator::kDone) { + MarkAsDone(); + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "No segments begin after provided offset %d.", offset)); + } + if (!offset_iterator.AdvanceToUtf16(following_utf16_index)) { + // following_utf16_index is guaranteed to be within bonds thanks to the + // check for kDone above. So the only possible failure could be an invalid + // sequence. Mark as DONE and return. + MarkAsDone(); + return absl_ports::AbortedError("Encountered invalid UTF sequence!"); + } + term_end_exclusive_ = offset_iterator; + + // 3. The term_end_exclusive_ points to the term that we want to return. We + // need to Advance so that term_start_ will now point to this term. + if (!Advance()) { + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "No segments begin after provided offset %d.", offset)); + } + return term_start_.utf8_index(); + } + + // Resets the iterator to point to the first term that ends before offset. + // GetTerm will now return that term. + // + // Returns: + // On success, the starting position of the first term that ends before + // offset. + // NOT_FOUND if an error occurred or there are no terms that end before + // offset. + // INVALID_ARGUMENT if offset is out of bounds for the provided text. + // ABORTED if an invalid unicode character is encountered while + // traversing the text. 
+ libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore( + int32_t offset) override { + if (offset < 0 || offset >= text_.length()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Illegal offset provided! Offset %d is not within bounds of string " + "of length %zu", + offset, text_.length())); + } + if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) { + // We're done. Need to start from the beginning if we're going to reset + // properly. + term_start_ = CharacterIterator(text_); + term_end_exclusive_ = CharacterIterator(text_); + } + + // 1. Find the unicode character that contains the byte at offset. + CharacterIterator offset_iterator = term_end_exclusive_; + bool success = (offset > offset_iterator.utf8_index()) + ? offset_iterator.AdvanceToUtf8(offset) + : offset_iterator.RewindToUtf8(offset); + if (!success) { + // Offset is guaranteed to be within bounds thanks to the check above. So + // the only possible failure could be an invalid sequence. Mark as DONE + // and return. + MarkAsDone(); + return absl_ports::AbortedError( + "Could not retrieve valid utf8 character!"); + } + + // 2. We've got the unicode character containing byte offset. Now, we need + // to point to the segment that starts before this character. + int starting_utf16_index = + break_iterator_->Preceding(offset_iterator.utf16_index()); + if (starting_utf16_index == ReverseJniBreakIterator::kDone) { + // Rewind the end indices. + MarkAsDone(); + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "No segments end before provided offset %d.", offset)); + } + if (!offset_iterator.RewindToUtf16(starting_utf16_index)) { + // starting_utf16_index is guaranteed to be within bonds thanks to the + // check for kDone above. So the only possible failure could be an invalid + // sequence. Mark as DONE and return. 
+ MarkAsDone(); + return absl_ports::AbortedError("Encountered invalid UTF sequence!"); + } + term_start_ = offset_iterator; + + // 3. We've correctly set the start index and the iterator currently points + // to that position. Now we need to find the correct end position and + // advance the iterator to that position. + int end_utf16_index = break_iterator_->Next(); + term_end_exclusive_ = term_start_; + term_end_exclusive_.AdvanceToUtf16(end_utf16_index); + + // 4. The start and end indices point to a segment, but we need to ensure + // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll + // need a segment prior to this one. + if (term_end_exclusive_.utf8_index() > offset || + !term_start_.IsValidCharacter()) { + return ResetToTermEndingBefore(term_start_.utf8_index()); + } + return term_start_.utf8_index(); + } + + libtextclassifier3::StatusOr<int32_t> ResetToStart() override { + term_start_ = CharacterIterator(text_); + term_end_exclusive_ = CharacterIterator(text_); + if (!Advance()) { + return absl_ports::NotFoundError(""); + } + return term_start_.utf8_index(); + } + + private: + // Ensures that all members are consistent with the 'Done' state. + // In the 'Done' state, both term_start_.utf8_index() and + // term_end_exclusive_.utf8_index() will point to the same character, causing + // GetTerm() to return an empty string and term_start_.utf16_index() and + // term_end_exclusive_.utf16_index() will be marked with the kDone value. + // break_iterator_ may be in any state. + void MarkAsDone() { + term_start_ = + CharacterIterator(text_, /*utf8_index=*/0, + /*utf16_index=*/ReverseJniBreakIterator::kDone); + term_end_exclusive_ = + CharacterIterator(text_, /*utf8_index=*/0, + /*utf16_index=*/ReverseJniBreakIterator::kDone); + } + + // All of ReverseJniBreakIterator's functions return UTF-16 boundaries. So + // this class needs to maintain state to convert between UTF-16 and UTF-8. 
+ std::unique_ptr<ReverseJniBreakIterator> break_iterator_; + + // Text to be segmented + std::string_view text_; + + // Index used to track the start position of current term. + CharacterIterator term_start_; + + // Index used to track the end position of current term. + CharacterIterator term_end_exclusive_; +}; + +libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>> +ReverseJniLanguageSegmenter::Segment(const std::string_view text) const { + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<ReverseJniBreakIterator> break_iterator, + ReverseJniBreakIterator::Create(jni_cache_, text, locale_)); + return std::make_unique<ReverseJniLanguageSegmenterIterator>( + text, std::move(break_iterator)); +} + +libtextclassifier3::StatusOr<std::vector<std::string_view>> +ReverseJniLanguageSegmenter::GetAllTerms(const std::string_view text) const { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator, + Segment(text)); + std::vector<std::string_view> terms; + while (iterator->Advance()) { + terms.push_back(iterator->GetTerm()); + } + return terms; +} + +} // namespace lib +} // namespace icing diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h new file mode 100644 index 0000000..f06dac9 --- /dev/null +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h @@ -0,0 +1,51 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_H_ +#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_H_ + +#include <cstdint> +#include <memory> +#include <string> +#include <string_view> +#include <vector> + +#include "icing/jni/jni-cache.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/tokenization/language-segmenter.h" + +namespace icing { +namespace lib { + +class ReverseJniLanguageSegmenter : public LanguageSegmenter { + public: + ReverseJniLanguageSegmenter(std::string locale, const JniCache* jni_cache) + : locale_(std::move(locale)), jni_cache_(jni_cache) {} + + libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>> + Segment(std::string_view text) const override; + + libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms( + std::string_view text) const override; + + private: + std::string locale_; + + const JniCache* jni_cache_; // does not own! +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_H_ diff --git a/icing/tokenization/simple/space-language-segmenter-factory.cc b/icing/tokenization/simple/space-language-segmenter-factory.cc new file mode 100644 index 0000000..1cca603 --- /dev/null +++ b/icing/tokenization/simple/space-language-segmenter-factory.cc @@ -0,0 +1,41 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/simple/space-language-segmenter.h" +#include "icing/util/logging.h" + +namespace icing { +namespace lib { + +namespace language_segmenter_factory { + +// Creates a language segmenter with the given locale. +// +// Returns: +// A LanguageSegmenter on success +// INVALID_ARGUMENT if locale string is invalid +// +// TODO(samzheng): Figure out if we want to verify locale strings and notify +// users. Right now illegal locale strings will be ignored by ICU. ICU +// components will be created with its default locale. +libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create( + SegmenterOptions) { + return std::make_unique<SpaceLanguageSegmenter>(); +} + +} // namespace language_segmenter_factory + +} // namespace lib +} // namespace icing diff --git a/icing/tokenization/space-language-segmenter.cc b/icing/tokenization/simple/space-language-segmenter.cc index 3d5c7cf..7e301ec 100644 --- a/icing/tokenization/space-language-segmenter.cc +++ b/icing/tokenization/simple/space-language-segmenter.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/tokenization/space-language-segmenter.h" +#include "icing/tokenization/simple/space-language-segmenter.h" #include <cstdint> #include <memory> @@ -40,7 +40,7 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator { : text_(text), term_start_index_(0), term_end_index_exclusive_(0) {} // Advances to the next term. Returns false if it has reached the end. 
- bool Advance() { + bool Advance() override { if (term_end_index_exclusive_ >= text_.size() || term_start_index_ >= text_.size()) { // Reached the end @@ -74,7 +74,7 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Returns the current term. It can be called only when Advance() returns // true. - std::string_view GetTerm() const { + std::string_view GetTerm() const override { if (text_[term_start_index_] == kASCIISpace) { // Rule: multiple continuous whitespaces are treated as one. return std::string_view(&text_[term_start_index_], 1); @@ -84,7 +84,7 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator { } libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter( - int32_t offset) { + int32_t offset) override { if (offset < 0) { // Start over from the beginning to find the first term. term_start_index_ = 0; @@ -111,7 +111,7 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator { } libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore( - int32_t offset) { + int32_t offset) override { if (offset <= 0 || offset > text_.size()) { return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( "No term found in '%s' that ends before offset %d", @@ -146,6 +146,15 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator { return term_start_index_; } + libtextclassifier3::StatusOr<int32_t> ResetToStart() override { + term_start_index_ = 0; + term_end_index_exclusive_ = 0; + if (!Advance()) { + return absl_ports::NotFoundError(""); + } + return term_start_index_; + } + private: // Return the start offset of the term starting right before the given offset. 
libtextclassifier3::StatusOr<int32_t> GetTermStartingBefore(int32_t offset) { diff --git a/icing/tokenization/space-language-segmenter.h b/icing/tokenization/simple/space-language-segmenter.h index 73f8f30..de0a6d3 100644 --- a/icing/tokenization/space-language-segmenter.h +++ b/icing/tokenization/simple/space-language-segmenter.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_TOKENIZATION_SPACE_LANGUAGE_SEGMENTER_H_ -#define ICING_TOKENIZATION_SPACE_LANGUAGE_SEGMENTER_H_ +#ifndef ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_ +#define ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_ #include <cstdint> #include <memory> @@ -55,4 +55,4 @@ class SpaceLanguageSegmenter : public LanguageSegmenter { } // namespace lib } // namespace icing -#endif // ICING_TOKENIZATION_SPACE_LANGUAGE_SEGMENTER_H_ +#endif // ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_ diff --git a/icing/tokenization/space-language-segmenter_test.cc b/icing/tokenization/simple/space-language-segmenter_test.cc index ef6f54f..8ed38b2 100644 --- a/icing/tokenization/space-language-segmenter_test.cc +++ b/icing/tokenization/simple/space-language-segmenter_test.cc @@ -28,24 +28,21 @@ using ::testing::Eq; using ::testing::IsEmpty; TEST(SpaceLanguageSegmenterTest, EmptyText) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::SPACE)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty())); } TEST(SpaceLanguageSegmenterTest, SimpleText) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::SPACE)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"), 
IsOkAndHolds(ElementsAre("Hello", " ", "World"))); } TEST(SpaceLanguageSegmenterTest, Punctuation) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::SPACE)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); EXPECT_THAT(language_segmenter->GetAllTerms("Hello, World!!!"), IsOkAndHolds(ElementsAre("Hello,", " ", "World!!!"))); @@ -58,9 +55,8 @@ TEST(SpaceLanguageSegmenterTest, Punctuation) { } TEST(SpaceLanguageSegmenterTest, Alphanumeric) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::SPACE)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); // Alphanumeric terms are allowed EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"), @@ -68,9 +64,8 @@ TEST(SpaceLanguageSegmenterTest, Alphanumeric) { } TEST(SpaceLanguageSegmenterTest, Number) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::SPACE)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); // Alphanumeric terms are allowed EXPECT_THAT( @@ -85,9 +80,8 @@ TEST(SpaceLanguageSegmenterTest, Number) { } TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::SPACE)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); // Multiple continuous whitespaces are treated as one. 
const int kNumSeparators = 256; @@ -98,9 +92,8 @@ TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) { } TEST(SpaceLanguageSegmenterTest, NotCopyStrings) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::SPACE)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); // Validates that the input strings are not copied const std::string text = "Hello World"; const char* word1_address = text.c_str(); diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h index 3ad61fb..38c4745 100644 --- a/icing/tokenization/tokenizer.h +++ b/icing/tokenization/tokenizer.h @@ -85,6 +85,8 @@ class Tokenizer { // // "foo". // PrintToken(iterator.GetToken()); // prints "foo" virtual bool ResetToTokenBefore(int32_t offset) { return false; } + + virtual bool ResetToStart() { return false; } }; // Tokenizes the input text. The input text should outlive the returned diff --git a/icing/transform/icu-normalizer_test.cc b/icing/transform/icu-normalizer_test.cc deleted file mode 100644 index 5e822d2..0000000 --- a/icing/transform/icu-normalizer_test.cc +++ /dev/null @@ -1,179 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include <memory> - -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "icing/icu-data-file-helper.h" -#include "icing/testing/common-matchers.h" -#include "icing/testing/icu-i18n-test-utils.h" -#include "icing/testing/test-data.h" -#include "icing/transform/normalizer-factory.h" -#include "icing/transform/normalizer.h" - -namespace icing { -namespace lib { -namespace { -using ::testing::Eq; - -class IcuNormalizerTest : public testing::Test { - protected: - void SetUp() override { - ICING_ASSERT_OK( - // File generated via icu_data_file rule in //icing/BUILD. - icu_data_file_helper::SetUpICUDataFile( - GetTestFilePath("icing/icu.dat"))); - - ICING_ASSERT_OK_AND_ASSIGN( - normalizer_, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/1024)); - } - - std::unique_ptr<Normalizer> normalizer_; -}; - -TEST_F(IcuNormalizerTest, Creation) { - EXPECT_THAT( - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/5), - IsOk()); - EXPECT_THAT( - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/0), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT( - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/-1), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); -} - -// Strings that are already normalized won't change if normalized again. 
-TEST_F(IcuNormalizerTest, AlreadyNormalized) { - EXPECT_THAT(normalizer_->NormalizeTerm(""), Eq("")); - EXPECT_THAT(normalizer_->NormalizeTerm("hello world"), Eq("hello world")); - EXPECT_THAT(normalizer_->NormalizeTerm("你好"), Eq("你好")); - EXPECT_THAT(normalizer_->NormalizeTerm("キャンパス"), Eq("キャンパス")); - EXPECT_THAT(normalizer_->NormalizeTerm("안녕하세요"), Eq("안녕하세요")); -} - -TEST_F(IcuNormalizerTest, UppercaseToLowercase) { - EXPECT_THAT(normalizer_->NormalizeTerm("MDI"), Eq("mdi")); - EXPECT_THAT(normalizer_->NormalizeTerm("Icing"), Eq("icing")); -} - -TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) { - EXPECT_THAT(normalizer_->NormalizeTerm("Zürich"), Eq("zurich")); - EXPECT_THAT(normalizer_->NormalizeTerm("après-midi"), Eq("apres-midi")); - EXPECT_THAT(normalizer_->NormalizeTerm("Buenos días"), Eq("buenos dias")); - EXPECT_THAT(normalizer_->NormalizeTerm("āăąḃḅḇčćç"), Eq("aaabbbccc")); - EXPECT_THAT(normalizer_->NormalizeTerm("ÁȦÄḂḄḆĆČḈ"), Eq("aaabbbccc")); -} - -// Accent / diacritic marks won't be removed in non-latin chars, e.g. in -// Japanese and Greek -TEST_F(IcuNormalizerTest, NonLatinLetterNotRemoveAccent) { - EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド")); - EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα")); -} - -TEST_F(IcuNormalizerTest, FullWidthCharsToASCII) { - // Full-width punctuation to ASCII punctuation - EXPECT_THAT(normalizer_->NormalizeTerm("。,!?:”"), Eq(".,!?:\"")); - // 0xff10 is the full-width number 0 - EXPECT_THAT(normalizer_->NormalizeTerm(UCharToString(0xff10)), Eq("0")); - // 0xff21 is the full-width letter A - EXPECT_THAT(normalizer_->NormalizeTerm(UCharToString(0xff21)), Eq("a")); - // 0xff41 is the full-width letter a - EXPECT_THAT(normalizer_->NormalizeTerm(UCharToString(0xff41)), Eq("a")); -} - -// For Katakana, each character is normalized to its full-width version. 
-TEST_F(IcuNormalizerTest, KatakanaHalfWidthToFullWidth) { - EXPECT_THAT(normalizer_->NormalizeTerm("カ"), Eq("カ")); - EXPECT_THAT(normalizer_->NormalizeTerm("ォ"), Eq("ォ")); - EXPECT_THAT(normalizer_->NormalizeTerm("サ"), Eq("サ")); - EXPECT_THAT(normalizer_->NormalizeTerm("ホ"), Eq("ホ")); -} - -TEST_F(IcuNormalizerTest, HiraganaToKatakana) { - EXPECT_THAT(normalizer_->NormalizeTerm("あいうえお"), Eq("アイウエオ")); - EXPECT_THAT(normalizer_->NormalizeTerm("かきくけこ"), Eq("カキクケコ")); - EXPECT_THAT(normalizer_->NormalizeTerm("ばびぶべぼ"), Eq("バビブベボ")); - EXPECT_THAT(normalizer_->NormalizeTerm("がぎぐげご"), Eq("ガギグゲゴ")); - EXPECT_THAT(normalizer_->NormalizeTerm("ぎゃぎゅぎょ"), Eq("ギャギュギョ")); -} - -TEST_F(IcuNormalizerTest, SuperscriptAndSubscriptToASCII) { - EXPECT_THAT(normalizer_->NormalizeTerm("⁹"), Eq("9")); - EXPECT_THAT(normalizer_->NormalizeTerm("₉"), Eq("9")); -} - -TEST_F(IcuNormalizerTest, CircledCharsToASCII) { - EXPECT_THAT(normalizer_->NormalizeTerm("①"), Eq("1")); - EXPECT_THAT(normalizer_->NormalizeTerm("Ⓐ"), Eq("a")); -} - -TEST_F(IcuNormalizerTest, RotatedCharsToASCII) { - EXPECT_THAT(normalizer_->NormalizeTerm("︷"), Eq("{")); - EXPECT_THAT(normalizer_->NormalizeTerm("︸"), Eq("}")); -} - -TEST_F(IcuNormalizerTest, SquaredCharsToASCII) { - EXPECT_THAT(normalizer_->NormalizeTerm("㌀"), Eq("アパート")); -} - -TEST_F(IcuNormalizerTest, FractionsToASCII) { - EXPECT_THAT(normalizer_->NormalizeTerm("¼"), Eq(" 1/4")); - EXPECT_THAT(normalizer_->NormalizeTerm("⅚"), Eq(" 5/6")); -} - -TEST_F(IcuNormalizerTest, Truncate) { - { - ICING_ASSERT_OK_AND_ASSIGN( - auto normalizer, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/5)); - - // Won't be truncated - EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi")); - EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello")); - - // Truncated to length 5. 
- EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello")); - - // Each Japanese character has 3 bytes, so truncating to length 5 results in - // only 1 character. - EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キ")); - - // Each Greek character has 2 bytes, so truncating to length 5 results in 2 - // character. - EXPECT_THAT(normalizer->NormalizeTerm("αβγδε"), Eq("αβ")); - } - - { - ICING_ASSERT_OK_AND_ASSIGN( - auto normalizer, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/2)); - // The Japanese character has 3 bytes, truncating it results in an empty - // string. - EXPECT_THAT(normalizer->NormalizeTerm("キ"), Eq("")); - } -} - -} // namespace -} // namespace lib -} // namespace icing diff --git a/icing/transform/icu/icu-normalizer-factory.cc b/icing/transform/icu/icu-normalizer-factory.cc new file mode 100644 index 0000000..493aeb5 --- /dev/null +++ b/icing/transform/icu/icu-normalizer-factory.cc @@ -0,0 +1,52 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_TRANSFORM_ICU_ICU_NORMALIZER_FACTORY_H_ +#define ICING_TRANSFORM_ICU_ICU_NORMALIZER_FACTORY_H_ + +#include <memory> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/transform/icu/icu-normalizer.h" +#include "icing/transform/normalizer.h" + +namespace icing { +namespace lib { + +namespace normalizer_factory { + +// Creates an ICU-based normalizer. max_term_byte_size enforces the max size of +// text after normalization, text will be truncated if exceeds the max size. +// +// Returns: +// A normalizer on success +// INVALID_ARGUMENT if max_term_byte_size <= 0 +// INTERNAL_ERROR on errors +libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( + int max_term_byte_size) { + if (max_term_byte_size <= 0) { + return absl_ports::InvalidArgumentError( + "max_term_byte_size must be greater than zero."); + } + return IcuNormalizer::Create(max_term_byte_size); +} + +} // namespace normalizer_factory + +} // namespace lib +} // namespace icing + +#endif // ICING_TRANSFORM_ICU_ICU_NORMALIZER_FACTORY_H_ diff --git a/icing/transform/icu-normalizer.cc b/icing/transform/icu/icu-normalizer.cc index c7cfd99..0bb8326 100644 --- a/icing/transform/icu-normalizer.cc +++ b/icing/transform/icu/icu-normalizer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "icing/transform/icu-normalizer.h"
+#include "icing/transform/icu/icu-normalizer.h"
 
 #include <cctype>
 #include <memory>
@@ -24,7 +24,7 @@
 #include "icing/absl_ports/canonical_errors.h"
 #include "icing/absl_ports/str_cat.h"
 #include "icing/transform/normalizer.h"
-#include "icing/util/icu-i18n-utils.h"
+#include "icing/util/i18n-utils.h"
 #include "icing/util/logging.h"
 #include "icing/util/status-macros.h"
 #include "unicode/umachine.h"
@@ -55,6 +55,46 @@ constexpr UChar kTransformRulesUtf16[] =
 constexpr int kTransformRulesLength =
     sizeof(kTransformRulesUtf16) / sizeof(kTransformRulesUtf16[0]) - 1;
 
+// Transforms a Unicode character with diacritics to its counterpart in ASCII
+// range. E.g. "ü" -> "u". Result will be set to char_out. Returns true if
+// the transformation is successful.
+//
+// NOTE: According to our convention this function should have returned
+// StatusOr<char>. However, this function is performance-sensitive because it
+// could be called on every Latin character in normalization, so we make it
+// return a bool here to save a bit more time and memory.
+bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
+                          char* char_out) {
+  if (i18n_utils::IsAscii(uchar32_in)) {
+    // The Unicode character is within ASCII range
+    if (char_out != nullptr) {
+      *char_out = uchar32_in;
+    }
+    return true;
+  }
+
+  // Maximum number of pieces a Unicode character can be decomposed into.
+  // TODO(samzheng) figure out if this number is proper.
+  constexpr int kDecompositionBufferCapacity = 5;
+
+  // A buffer used to store Unicode decomposition mappings of only one
+  // character.
+  UChar decomposition_buffer[kDecompositionBufferCapacity];
+
+  // Decomposes the Unicode character, trying to get an ASCII char and some
+  // diacritic chars.
+ UErrorCode status = U_ZERO_ERROR; + if (unorm2_getDecomposition(normalizer2, uchar32_in, &decomposition_buffer[0], + kDecompositionBufferCapacity, &status) > 0 && + !U_FAILURE(status) && i18n_utils::IsAscii(decomposition_buffer[0])) { + if (char_out != nullptr) { + *char_out = decomposition_buffer[0]; + } + return true; + } + return false; +} + } // namespace // Creates a IcuNormalizer with a valid TermTransformer instance. @@ -96,11 +136,9 @@ std::string IcuNormalizer::NormalizeTerm(const std::string_view term) const { // into an ASCII char. Since the term is tokenized, we know that the whole // term can be transformed into ASCII if the first character can. UChar32 first_uchar32 = - icu_i18n_utils::GetUChar32At(term.data(), term.length(), 0); - if (normalizer2 != nullptr && - first_uchar32 != icu_i18n_utils::kInvalidUChar32 && - icu_i18n_utils::DiacriticCharToAscii(normalizer2, first_uchar32, - nullptr)) { + i18n_utils::GetUChar32At(term.data(), term.length(), 0); + if (normalizer2 != nullptr && first_uchar32 != i18n_utils::kInvalidUChar32 && + DiacriticCharToAscii(normalizer2, first_uchar32, nullptr)) { // This is a faster method to normalize Latin terms. 
normalized_text = NormalizeLatin(normalizer2, term); } else { @@ -108,7 +146,7 @@ std::string IcuNormalizer::NormalizeTerm(const std::string_view term) const { } if (normalized_text.length() > max_term_byte_size_) { - icu_i18n_utils::SafeTruncateUtf8(&normalized_text, max_term_byte_size_); + i18n_utils::SafeTruncateUtf8(&normalized_text, max_term_byte_size_); } return normalized_text; @@ -119,19 +157,17 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2, std::string result; result.reserve(term.length()); for (int i = 0; i < term.length(); i++) { - if (icu_i18n_utils::IsAscii(term[i])) { + if (i18n_utils::IsAscii(term[i])) { result.push_back(std::tolower(term[i])); - } else if (icu_i18n_utils::IsLeadUtf8Byte(term[i])) { - UChar32 uchar32 = - icu_i18n_utils::GetUChar32At(term.data(), term.length(), i); - if (uchar32 == icu_i18n_utils::kInvalidUChar32) { + } else if (i18n_utils::IsLeadUtf8Byte(term[i])) { + UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i); + if (uchar32 == i18n_utils::kInvalidUChar32) { ICING_LOG(WARNING) << "Unable to get uchar32 from " << term << " at position" << i; continue; } char ascii_char; - if (icu_i18n_utils::DiacriticCharToAscii(normalizer2, uchar32, - &ascii_char)) { + if (DiacriticCharToAscii(normalizer2, uchar32, &ascii_char)) { result.push_back(std::tolower(ascii_char)); } else { // We don't know how to transform / decompose this Unicode character, it @@ -139,7 +175,7 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2, // Latin characters. This shouldn't happen if input term is properly // tokenized. We handle it here in case there're something wrong with // the tokenizers. 
- int utf8_length = icu_i18n_utils::GetUtf8Length(uchar32); + int utf8_length = i18n_utils::GetUtf8Length(uchar32); absl_ports::StrAppend(&result, term.substr(i, utf8_length)); } } @@ -175,7 +211,7 @@ IcuNormalizer::TermTransformer::~TermTransformer() { std::string IcuNormalizer::TermTransformer::Transform( const std::string_view term) const { - auto utf16_term_or = icu_i18n_utils::Utf8ToUtf16(term); + auto utf16_term_or = i18n_utils::Utf8ToUtf16(term); if (!utf16_term_or.ok()) { ICING_VLOG(0) << "Failed to convert UTF8 term '" << term << "' to UTF16"; return std::string(term); @@ -216,7 +252,7 @@ std::string IcuNormalizer::TermTransformer::Transform( return std::string(term); } - auto utf8_term_or = icu_i18n_utils::Utf16ToUtf8(utf16_term); + auto utf8_term_or = i18n_utils::Utf16ToUtf8(utf16_term); if (!utf8_term_or.ok()) { ICING_VLOG(0) << "Failed to convert UTF16 term '" << term << "' to UTF8"; return std::string(term); diff --git a/icing/transform/icu-normalizer.h b/icing/transform/icu/icu-normalizer.h index 86d4a64..f20a9fb 100644 --- a/icing/transform/icu-normalizer.h +++ b/icing/transform/icu/icu-normalizer.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef ICING_TRANSFORM_ICU_NORMALIZER_H_ -#define ICING_TRANSFORM_ICU_NORMALIZER_H_ +#ifndef ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ +#define ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ #include <memory> #include <string> @@ -102,4 +102,4 @@ class IcuNormalizer : public Normalizer { } // namespace lib } // namespace icing -#endif // ICING_TRANSFORM_ICU_NORMALIZER_H_ +#endif // ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ diff --git a/icing/transform/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc index 2fce32b..b037538 100644 --- a/icing/transform/icu-normalizer_benchmark.cc +++ b/icing/transform/icu/icu-normalizer_benchmark.cc @@ -14,7 +14,7 @@ #include "testing/base/public/benchmark.h" #include "gmock/gmock.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/test-data.h" #include "icing/transform/normalizer-factory.h" @@ -22,9 +22,9 @@ // Run on a Linux workstation: // $ blaze build -c opt --dynamic_mode=off --copt=-gmlt -// //icing/transform:icu-normalizer_benchmark +// //icing/transform/icu:icu-normalizer_benchmark // -// $ blaze-bin/icing/transform/icu-normalizer_benchmark +// $ blaze-bin/icing/transform/icu/icu-normalizer_benchmark // --benchmarks=all // // Run on an Android device: @@ -33,9 +33,10 @@ // // $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1" // --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt -// //icing/transform:icu-normalizer_benchmark +// //icing/transform/icu:icu-normalizer_benchmark // -// $ adb push blaze-bin/icing/transform/icu-normalizer_benchmark +// $ adb push +// blaze-bin/icing/transform/icu/icu-normalizer_benchmark // /data/local/tmp/ // // $ adb shell /data/local/tmp/icu-normalizer_benchmark --benchmarks=all @@ -60,7 +61,7 @@ void BM_NormalizeUppercase(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - 
normalizer_factory::NormalizerType::ICU4C, + /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string(state.range(0), 'A'); @@ -94,7 +95,7 @@ void BM_NormalizeAccent(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - normalizer_factory::NormalizerType::ICU4C, + /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string; @@ -132,7 +133,7 @@ void BM_NormalizeHiragana(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - normalizer_factory::NormalizerType::ICU4C, + /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string; diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc new file mode 100644 index 0000000..83fa972 --- /dev/null +++ b/icing/transform/icu/icu-normalizer_test.cc @@ -0,0 +1,237 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <memory> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/helpers/icu/icu-data-file-helper.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/icu-i18n-test-utils.h" +#include "icing/testing/test-data.h" +#include "icing/transform/normalizer-factory.h" +#include "icing/transform/normalizer.h" + +namespace icing { +namespace lib { +namespace { +using ::testing::Eq; + +class IcuNormalizerTest : public testing::Test { + protected: + void SetUp() override { + ICING_ASSERT_OK( + // File generated via icu_data_file rule in //icing/BUILD. + icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + + ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create( + /*max_term_byte_size=*/1024)); + } + + std::unique_ptr<Normalizer> normalizer_; +}; + +TEST_F(IcuNormalizerTest, Creation) { + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/5), + IsOk()); + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/0), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +// Strings that are already normalized won't change if normalized again. 
+TEST_F(IcuNormalizerTest, AlreadyNormalized) { + EXPECT_THAT(normalizer_->NormalizeTerm(""), Eq("")); + EXPECT_THAT(normalizer_->NormalizeTerm("hello world"), Eq("hello world")); + EXPECT_THAT(normalizer_->NormalizeTerm("你好"), Eq("你好")); + EXPECT_THAT(normalizer_->NormalizeTerm("キャンパス"), Eq("キャンパス")); + EXPECT_THAT(normalizer_->NormalizeTerm("안녕하세요"), Eq("안녕하세요")); +} + +TEST_F(IcuNormalizerTest, UppercaseToLowercase) { + EXPECT_THAT(normalizer_->NormalizeTerm("MDI"), Eq("mdi")); + EXPECT_THAT(normalizer_->NormalizeTerm("Icing"), Eq("icing")); +} + +TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) { + EXPECT_THAT(normalizer_->NormalizeTerm("Zürich"), Eq("zurich")); + EXPECT_THAT(normalizer_->NormalizeTerm("après-midi"), Eq("apres-midi")); + EXPECT_THAT(normalizer_->NormalizeTerm("Buenos días"), Eq("buenos dias")); + EXPECT_THAT(normalizer_->NormalizeTerm("ÀÁÂÃÄÅĀĂĄḀḁàáâãäåāăą"), + Eq("aaaaaaaaaaaaaaaaaaaa")); + EXPECT_THAT(normalizer_->NormalizeTerm("ḂḄḆḃḅḇ"), Eq("bbbbbb")); + EXPECT_THAT(normalizer_->NormalizeTerm("ÇĆĈĊČḈḉćĉċčç"), Eq("cccccccccccc")); + EXPECT_THAT(normalizer_->NormalizeTerm("ÐĎĐḊḌḎḐḒḋḍḏḑḓďđ"), + Eq("ddddddddddddddd")); + EXPECT_THAT(normalizer_->NormalizeTerm("ÈÉÊËĒĔĖĘḔḖḘḚḜḕḗḙḛḝèéêëēĕėęě"), + Eq("eeeeeeeeeeeeeeeeeeeeeeeeeee")); + EXPECT_THAT(normalizer_->NormalizeTerm("Ḟḟ"), Eq("ff")); + EXPECT_THAT(normalizer_->NormalizeTerm("ĜĞĠĢḠḡĝğġģ"), Eq("gggggggggg")); + EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), + Eq("hhhhhhhhhhhhh")); + EXPECT_THAT(normalizer_->NormalizeTerm("ÌÍÎÏĨĪĬḬḭḯìíîïĩīĭ"), + Eq("iiiiiiiiiiiiiiiii")); + EXPECT_THAT(normalizer_->NormalizeTerm("Ĵĵ"), Eq("jj")); + EXPECT_THAT(normalizer_->NormalizeTerm("ĶḰḲḴḵḱḳķ"), Eq("kkkkkkkk")); + EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), + Eq("lllllllllllll")); + EXPECT_THAT(normalizer_->NormalizeTerm("ḾṀṂḿṁṃ"), Eq("mmmmmm")); + EXPECT_THAT(normalizer_->NormalizeTerm("ÑŃŅŇṄṆṈṊṅṇṉṋñńņň"), + Eq("nnnnnnnnnnnnnnnn")); + 
EXPECT_THAT(normalizer_->NormalizeTerm("ŌŎŐÒÓÔÕÖṌṎṐṒṍṏṑṓòóôõöōŏő"), + Eq("oooooooooooooooooooooooo")); + EXPECT_THAT(normalizer_->NormalizeTerm("ṔṖṕṗ"), Eq("pppp")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŔŖŘṘṚṜṞṙṛṝṟŕŗř"), + Eq("rrrrrrrrrrrrrr")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŚŜŞŠȘṠṢṤṦṨṡṣṥṧṩșśŝşš"), + Eq("ssssssssssssssssssss")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŢŤȚṪṬṮṰṫṭṯṱțţť"), + Eq("tttttttttttttt")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŨŪŬÙÚÛÜṲṴṶṸṺṳṵṷṹṻùúûüũūŭ"), + Eq("uuuuuuuuuuuuuuuuuuuuuuuu")); + EXPECT_THAT(normalizer_->NormalizeTerm("ṼṾṽṿ"), Eq("vvvv")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŴẀẂẄẆẈẁẃẅẇẉŵ"), Eq("wwwwwwwwwwww")); + EXPECT_THAT(normalizer_->NormalizeTerm("ẊẌẋẍ"), Eq("xxxx")); + EXPECT_THAT(normalizer_->NormalizeTerm("ÝŶŸẎẏŷýÿ"), Eq("yyyyyyyy")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), + Eq("zzzzzzzzzzzz")); +} + +// Accent / diacritic marks won't be removed in non-latin chars, e.g. in +// Japanese and Greek +TEST_F(IcuNormalizerTest, NonLatinLetterNotRemoveAccent) { + // Katakana + EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド")); + // Greek + EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα")); + EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφή")); + + // Our current ICU rules can't handle Hebrew properly, e.g. the accents in + // "אָלֶף־בֵּית עִבְרִי" + // will be removed. + // TODO (samzheng): figure out how we should handle Hebrew. 
+} + +TEST_F(IcuNormalizerTest, FullWidthCharsToASCII) { + // Full-width punctuation to ASCII punctuation + EXPECT_THAT(normalizer_->NormalizeTerm("‘’.,!?:“”"), Eq("''.,!?:\"\"")); + // Full-width 0-9 + EXPECT_THAT(normalizer_->NormalizeTerm("0123456789"), + Eq("0123456789")); + // Full-width A-Z + EXPECT_THAT(normalizer_->NormalizeTerm( + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"), + Eq("abcdefghijklmnopqrstuvwxyz")); + // Full-width a-z + EXPECT_THAT(normalizer_->NormalizeTerm( + "abcdefghijklmnopqrstuvwxyz"), + Eq("abcdefghijklmnopqrstuvwxyz")); +} + +TEST_F(IcuNormalizerTest, IdeographicToASCII) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + EXPECT_THAT(normalizer->NormalizeTerm(",。"), Eq(",.")); +} + +// For Katakana, each character is normalized to its full-width version. +TEST_F(IcuNormalizerTest, KatakanaHalfWidthToFullWidth) { + EXPECT_THAT(normalizer_->NormalizeTerm("カ"), Eq("カ")); + EXPECT_THAT(normalizer_->NormalizeTerm("ォ"), Eq("ォ")); + EXPECT_THAT(normalizer_->NormalizeTerm("サ"), Eq("サ")); + EXPECT_THAT(normalizer_->NormalizeTerm("ホ"), Eq("ホ")); +} + +TEST_F(IcuNormalizerTest, HiraganaToKatakana) { + EXPECT_THAT(normalizer_->NormalizeTerm("あいうえお"), Eq("アイウエオ")); + EXPECT_THAT(normalizer_->NormalizeTerm("かきくけこ"), Eq("カキクケコ")); + EXPECT_THAT(normalizer_->NormalizeTerm("さしすせそ"), Eq("サシスセソ")); + EXPECT_THAT(normalizer_->NormalizeTerm("たちつてと"), Eq("タチツテト")); + EXPECT_THAT(normalizer_->NormalizeTerm("なにぬねの"), Eq("ナニヌネノ")); + EXPECT_THAT(normalizer_->NormalizeTerm("はひふへほ"), Eq("ハヒフヘホ")); + EXPECT_THAT(normalizer_->NormalizeTerm("まみむめも"), Eq("マミムメモ")); + EXPECT_THAT(normalizer_->NormalizeTerm("やゆよ"), Eq("ヤユヨ")); + EXPECT_THAT(normalizer_->NormalizeTerm("らりるれろ"), Eq("ラリルレロ")); + EXPECT_THAT(normalizer_->NormalizeTerm("わゐゑを"), Eq("ワヰヱヲ")); + EXPECT_THAT(normalizer_->NormalizeTerm("ん"), Eq("ン")); + EXPECT_THAT(normalizer_->NormalizeTerm("がぎぐげご"), Eq("ガギグゲゴ")); + EXPECT_THAT(normalizer_->NormalizeTerm("ざじずぜぞ"), 
Eq("ザジズゼゾ")); + EXPECT_THAT(normalizer_->NormalizeTerm("だぢづでど"), Eq("ダヂヅデド")); + EXPECT_THAT(normalizer_->NormalizeTerm("ばびぶべぼ"), Eq("バビブベボ")); + EXPECT_THAT(normalizer_->NormalizeTerm("ぱぴぷぺぽ"), Eq("パピプペポ")); +} + +TEST_F(IcuNormalizerTest, SuperscriptAndSubscriptToASCII) { + EXPECT_THAT(normalizer_->NormalizeTerm("⁹"), Eq("9")); + EXPECT_THAT(normalizer_->NormalizeTerm("₉"), Eq("9")); +} + +TEST_F(IcuNormalizerTest, CircledCharsToASCII) { + EXPECT_THAT(normalizer_->NormalizeTerm("①"), Eq("1")); + EXPECT_THAT(normalizer_->NormalizeTerm("Ⓐ"), Eq("a")); +} + +TEST_F(IcuNormalizerTest, RotatedCharsToASCII) { + EXPECT_THAT(normalizer_->NormalizeTerm("︷"), Eq("{")); + EXPECT_THAT(normalizer_->NormalizeTerm("︸"), Eq("}")); +} + +TEST_F(IcuNormalizerTest, SquaredCharsToASCII) { + EXPECT_THAT(normalizer_->NormalizeTerm("㌀"), Eq("アパート")); +} + +TEST_F(IcuNormalizerTest, FractionsToASCII) { + EXPECT_THAT(normalizer_->NormalizeTerm("¼"), Eq(" 1/4")); + EXPECT_THAT(normalizer_->NormalizeTerm("⅚"), Eq(" 5/6")); +} + +TEST_F(IcuNormalizerTest, Truncate) { + { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/5)); + + // Won't be truncated + EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi")); + EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello")); + + // Truncated to length 5. + EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello")); + + // Each Japanese character has 3 bytes, so truncating to length 5 results in + // only 1 character. + EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キ")); + + // Each Greek character has 2 bytes, so truncating to length 5 results in 2 + // character. + EXPECT_THAT(normalizer->NormalizeTerm("αβγδε"), Eq("αβ")); + } + + { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/2)); + // The Japanese character has 3 bytes, truncating it results in an empty + // string. 
+ EXPECT_THAT(normalizer->NormalizeTerm("キ"), Eq("")); + } +} + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/transform/map/map-normalizer-factory.cc b/icing/transform/map/map-normalizer-factory.cc new file mode 100644 index 0000000..3bf84b3 --- /dev/null +++ b/icing/transform/map/map-normalizer-factory.cc @@ -0,0 +1,48 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <memory> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/transform/map/map-normalizer.h" +#include "icing/transform/normalizer.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace normalizer_factory { + +// Creates a map-based normalizer. max_term_byte_size enforces the max size of +// text after normalization, text will be truncated if exceeds the max size. 
+// +// Returns: +// A normalizer on success +// INVALID_ARGUMENT if max_term_byte_size <= 0 +// INTERNAL_ERROR on errors +libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( + int max_term_byte_size) { + if (max_term_byte_size <= 0) { + return absl_ports::InvalidArgumentError( + "max_term_byte_size must be greater than zero."); + } + + return std::make_unique<MapNormalizer>(max_term_byte_size); +} + +} // namespace normalizer_factory + +} // namespace lib +} // namespace icing diff --git a/icing/transform/map/map-normalizer.cc b/icing/transform/map/map-normalizer.cc new file mode 100644 index 0000000..c888551 --- /dev/null +++ b/icing/transform/map/map-normalizer.cc @@ -0,0 +1,86 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/transform/map/map-normalizer.h" + +#include <ctype.h> + +#include <string> +#include <string_view> +#include <unordered_map> +#include <utility> + +#include "icing/absl_ports/str_cat.h" +#include "icing/transform/map/normalization-map.h" +#include "icing/util/i18n-utils.h" +#include "icing/util/logging.h" +#include "unicode/utypes.h" + +namespace icing { +namespace lib { + +std::string MapNormalizer::NormalizeTerm(std::string_view term) const { + std::string normalized_text; + normalized_text.reserve(term.length()); + + for (int i = 0; i < term.length(); ++i) { + if (i18n_utils::IsAscii(term[i])) { + // The original character has 1 byte. 
+ normalized_text.push_back(std::tolower(term[i])); + } else if (i18n_utils::IsLeadUtf8Byte(term[i])) { + UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i); + if (uchar32 == i18n_utils::kInvalidUChar32) { + ICING_LOG(WARNING) << "Unable to get uchar32 from " << term + << " at position" << i; + continue; + } + int utf8_length = i18n_utils::GetUtf8Length(uchar32); + if (i18n_utils::GetUtf16Length(uchar32) > 1) { + // All the characters we need to normalize can be encoded into a + // single char16_t. If this character needs more than 1 char16_t code + // unit, we can skip normalization and append it directly. + absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length)); + continue; + } + // The original character can be encoded into a single char16_t. + const std::unordered_map<char16_t, char16_t>& normalization_map = + GetNormalizationMap(); + auto iterator = normalization_map.find(static_cast<char16_t>(uchar32)); + if (iterator != normalization_map.end()) { + // Found a normalization mapping. The normalized character (stored in a + // char16_t) can have 1 or 2 bytes. + if (i18n_utils::IsAscii(iterator->second)) { + // The normalized character has 1 byte. + normalized_text.push_back( + std::tolower(static_cast<char>(iterator->second))); + } else { + // The normalized character has 2 bytes. + i18n_utils::AppendUchar32ToUtf8(&normalized_text, iterator->second); + } + } else { + // Normalization mapping not found, append the original character. 
+ absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length)); + } + } + } + + if (normalized_text.length() > max_term_byte_size_) { + i18n_utils::SafeTruncateUtf8(&normalized_text, max_term_byte_size_); + } + + return normalized_text; +} + +} // namespace lib +} // namespace icing diff --git a/icing/transform/map/map-normalizer.h b/icing/transform/map/map-normalizer.h new file mode 100644 index 0000000..f9c0e42 --- /dev/null +++ b/icing/transform/map/map-normalizer.h @@ -0,0 +1,50 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TRANSFORM_MAP_MAP_NORMALIZER_H_ +#define ICING_TRANSFORM_MAP_MAP_NORMALIZER_H_ + +#include <string> +#include <string_view> + +#include "icing/transform/normalizer.h" + +namespace icing { +namespace lib { + +class MapNormalizer : public Normalizer { + public: + explicit MapNormalizer(int max_term_byte_size) + : max_term_byte_size_(max_term_byte_size){}; + + // Normalizes the input term based on character mappings. The mappings + // contain the following categories: + // - Uppercase -> lowercase + // - Hiragana -> Katakana + // - Common full-width characters -> ASCII + // - Common ideographic punctuation marks -> ASCII + // - Common diacritic Latin characters -> ASCII + // + // Read more mapping details in normalization-map.cc + std::string NormalizeTerm(std::string_view term) const override; + + private: + // The maximum term length allowed after normalization. 
+ int max_term_byte_size_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_TRANSFORM_MAP_MAP_NORMALIZER_H_ diff --git a/icing/transform/map/map-normalizer_benchmark.cc b/icing/transform/map/map-normalizer_benchmark.cc new file mode 100644 index 0000000..691afc6 --- /dev/null +++ b/icing/transform/map/map-normalizer_benchmark.cc @@ -0,0 +1,149 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <memory> + +#include "testing/base/public/benchmark.h" +#include "icing/testing/common-matchers.h" +#include "icing/transform/normalizer-factory.h" +#include "icing/transform/normalizer.h" + +// Run on a Linux workstation: +// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt +// //icing/transform/map:map-normalizer_benchmark +// +// $ blaze-bin/icing/transform/map/map-normalizer_benchmark +// --benchmarks=all +// +// Run on an Android device: +// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1" +// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt +// //icing/transform/map:map-normalizer_benchmark +// +// $ adb push +// blaze-bin/icing/transform/map/map-normalizer_benchmark +// /data/local/tmp/ +// +// $ adb shell /data/local/tmp/map-normalizer_benchmark --benchmarks=all +namespace icing { +namespace lib { + +namespace { + +void BM_NormalizeUppercase(benchmark::State& state) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( + 
/*max_term_byte_size=*/std::numeric_limits<int>::max())); + + std::string input_string(state.range(0), 'A'); + + // Warms up. map-normalizer may need to load a static map when being invoked + // the first time. It takes about 0.05ms on a Pixel3 XL. + normalizer->NormalizeTerm(input_string); + + for (auto _ : state) { + normalizer->NormalizeTerm(input_string); + } +} +BENCHMARK(BM_NormalizeUppercase) + ->Arg(1000) + ->Arg(2000) + ->Arg(4000) + ->Arg(8000) + ->Arg(16000) + ->Arg(32000) + ->Arg(64000) + ->Arg(128000) + ->Arg(256000) + ->Arg(384000) + ->Arg(512000) + ->Arg(1024000) + ->Arg(2048000) + ->Arg(4096000); + +void BM_NormalizeAccent(benchmark::State& state) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( + /*max_term_byte_size=*/std::numeric_limits<int>::max())); + + std::string input_string; + while (input_string.length() < state.range(0)) { + input_string.append("àáâãā"); + } + + // Warms up. map-normalizer may need to load a static map when being invoked + // the first time. It takes about 0.05ms on a Pixel3 XL. + normalizer->NormalizeTerm(input_string); + + for (auto _ : state) { + normalizer->NormalizeTerm(input_string); + } +} +BENCHMARK(BM_NormalizeAccent) + ->Arg(1000) + ->Arg(2000) + ->Arg(4000) + ->Arg(8000) + ->Arg(16000) + ->Arg(32000) + ->Arg(64000) + ->Arg(128000) + ->Arg(256000) + ->Arg(384000) + ->Arg(512000) + ->Arg(1024000) + ->Arg(2048000) + ->Arg(4096000); + +void BM_NormalizeHiragana(benchmark::State& state) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( + /*max_term_byte_size=*/std::numeric_limits<int>::max())); + + std::string input_string; + while (input_string.length() < state.range(0)) { + input_string.append("あいうえお"); + } + + // Warms up. map-normalizer may need to load a static map when being invoked + // the first time. It takes about 0.05ms on a Pixel3 XL. 
+ normalizer->NormalizeTerm(input_string); + + for (auto _ : state) { + normalizer->NormalizeTerm(input_string); + } +} +BENCHMARK(BM_NormalizeHiragana) + ->Arg(1000) + ->Arg(2000) + ->Arg(4000) + ->Arg(8000) + ->Arg(16000) + ->Arg(32000) + ->Arg(64000) + ->Arg(128000) + ->Arg(256000) + ->Arg(384000) + ->Arg(512000) + ->Arg(1024000) + ->Arg(2048000) + ->Arg(4096000); + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/transform/map/map-normalizer_test.cc b/icing/transform/map/map-normalizer_test.cc new file mode 100644 index 0000000..b62ae0e --- /dev/null +++ b/icing/transform/map/map-normalizer_test.cc @@ -0,0 +1,205 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <memory> +#include <string> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/icu-i18n-test-utils.h" +#include "icing/transform/normalizer-factory.h" +#include "icing/transform/normalizer.h" + +namespace icing { +namespace lib { + +namespace { +using ::testing::Eq; + +TEST(MapNormalizerTest, Creation) { + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/5), + IsOk()); + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/0), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +// Strings that are already normalized won't change if normalized again. +TEST(MapNormalizerTest, AlreadyNormalized) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + EXPECT_THAT(normalizer->NormalizeTerm(""), Eq("")); + EXPECT_THAT(normalizer->NormalizeTerm("hello world"), Eq("hello world")); + EXPECT_THAT(normalizer->NormalizeTerm("你好"), Eq("你好")); + EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キャンパス")); + EXPECT_THAT(normalizer->NormalizeTerm("안녕하세요"), Eq("안녕하세요")); +} + +TEST(MapNormalizerTest, UppercaseToLowercase) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + EXPECT_THAT(normalizer->NormalizeTerm("MDI"), Eq("mdi")); + EXPECT_THAT(normalizer->NormalizeTerm("Icing"), Eq("icing")); +} + +TEST(MapNormalizerTest, LatinLetterRemoveAccent) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + EXPECT_THAT(normalizer->NormalizeTerm("Zürich"), Eq("zurich")); + EXPECT_THAT(normalizer->NormalizeTerm("après-midi"), 
Eq("apres-midi")); + EXPECT_THAT(normalizer->NormalizeTerm("Buenos días"), Eq("buenos dias")); + EXPECT_THAT(normalizer->NormalizeTerm("ÀÁÂÃÄÅĀĂĄḀḁàáâãäåāăą"), + Eq("aaaaaaaaaaaaaaaaaaaa")); + EXPECT_THAT(normalizer->NormalizeTerm("ḂḄḆḃḅḇ"), Eq("bbbbbb")); + EXPECT_THAT(normalizer->NormalizeTerm("ÇĆĈĊČḈḉćĉċčç"), Eq("cccccccccccc")); + EXPECT_THAT(normalizer->NormalizeTerm("ÐĎĐḊḌḎḐḒḋḍḏḑḓďđ"), + Eq("ddddddddddddddd")); + EXPECT_THAT(normalizer->NormalizeTerm("ÈÉÊËĒĔĖĘḔḖḘḚḜḕḗḙḛḝèéêëēĕėęě"), + Eq("eeeeeeeeeeeeeeeeeeeeeeeeeee")); + EXPECT_THAT(normalizer->NormalizeTerm("Ḟḟ"), Eq("ff")); + EXPECT_THAT(normalizer->NormalizeTerm("ĜĞĠĢḠḡĝğġģ"), Eq("gggggggggg")); + EXPECT_THAT(normalizer->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), Eq("hhhhhhhhhhhhh")); + EXPECT_THAT(normalizer->NormalizeTerm("ÌÍÎÏĨĪĬḬḭḯìíîïĩīĭ"), + Eq("iiiiiiiiiiiiiiiii")); + EXPECT_THAT(normalizer->NormalizeTerm("Ĵĵ"), Eq("jj")); + EXPECT_THAT(normalizer->NormalizeTerm("ĶḰḲḴḵḱḳķ"), Eq("kkkkkkkk")); + EXPECT_THAT(normalizer->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), Eq("lllllllllllll")); + EXPECT_THAT(normalizer->NormalizeTerm("ḾṀṂḿṁṃ"), Eq("mmmmmm")); + EXPECT_THAT(normalizer->NormalizeTerm("ÑŃŅŇṄṆṈṊṅṇṉṋñńņň"), + Eq("nnnnnnnnnnnnnnnn")); + EXPECT_THAT(normalizer->NormalizeTerm("ŌŎŐÒÓÔÕÖṌṎṐṒṍṏṑṓòóôõöōŏő"), + Eq("oooooooooooooooooooooooo")); + EXPECT_THAT(normalizer->NormalizeTerm("ṔṖṕṗ"), Eq("pppp")); + EXPECT_THAT(normalizer->NormalizeTerm("ŔŖŘṘṚṜṞṙṛṝṟŕŗř"), + Eq("rrrrrrrrrrrrrr")); + EXPECT_THAT(normalizer->NormalizeTerm("ŚŜŞŠȘṠṢṤṦṨṡṣṥṧṩșśŝşš"), + Eq("ssssssssssssssssssss")); + EXPECT_THAT(normalizer->NormalizeTerm("ŢŤȚṪṬṮṰṫṭṯṱțţť"), + Eq("tttttttttttttt")); + EXPECT_THAT(normalizer->NormalizeTerm("ŨŪŬÙÚÛÜṲṴṶṸṺṳṵṷṹṻùúûüũūŭ"), + Eq("uuuuuuuuuuuuuuuuuuuuuuuu")); + EXPECT_THAT(normalizer->NormalizeTerm("ṼṾṽṿ"), Eq("vvvv")); + EXPECT_THAT(normalizer->NormalizeTerm("ŴẀẂẄẆẈẁẃẅẇẉŵ"), Eq("wwwwwwwwwwww")); + EXPECT_THAT(normalizer->NormalizeTerm("ẊẌẋẍ"), Eq("xxxx")); + EXPECT_THAT(normalizer->NormalizeTerm("ÝŶŸẎẏŷýÿ"), 
Eq("yyyyyyyy")); + EXPECT_THAT(normalizer->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), Eq("zzzzzzzzzzzz")); +} + +// Accent / diacritic marks won't be removed in non-latin chars, e.g. in +// Japanese and Greek +TEST(MapNormalizerTest, NonLatinLetterNotRemoveAccent) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + // Katakana + EXPECT_THAT(normalizer->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド")); + // Greek + EXPECT_THAT(normalizer->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα")); + EXPECT_THAT(normalizer->NormalizeTerm("εγγραφή"), Eq("εγγραφή")); + // Hebrew + EXPECT_THAT(normalizer->NormalizeTerm("אָלֶף־בֵּית עִבְרִי"), Eq("אָלֶף־בֵּית עִבְרִי")); +} + +TEST(MapNormalizerTest, FullWidthCharsToASCII) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + // Full-width punctuation to ASCII punctuation + EXPECT_THAT(normalizer->NormalizeTerm("‘’.,!?:“”"), Eq("''.,!?:\"\"")); + // Full-width 0-9 + EXPECT_THAT(normalizer->NormalizeTerm("0123456789"), + Eq("0123456789")); + // Full-width A-Z + EXPECT_THAT(normalizer->NormalizeTerm( + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"), + Eq("abcdefghijklmnopqrstuvwxyz")); + // Full-width a-z + EXPECT_THAT(normalizer->NormalizeTerm( + "abcdefghijklmnopqrstuvwxyz"), + Eq("abcdefghijklmnopqrstuvwxyz")); +} + +TEST(MapNormalizerTest, IdeographicToASCII) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + EXPECT_THAT(normalizer->NormalizeTerm(",。"), Eq(",.")); +} + +TEST(MapNormalizerTest, HiraganaToKatakana) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + EXPECT_THAT(normalizer->NormalizeTerm("あいうえお"), Eq("アイウエオ")); + EXPECT_THAT(normalizer->NormalizeTerm("かきくけこ"), Eq("カキクケコ")); + EXPECT_THAT(normalizer->NormalizeTerm("さしすせそ"), Eq("サシスセソ")); + EXPECT_THAT(normalizer->NormalizeTerm("たちつてと"), Eq("タチツテト")); + 
EXPECT_THAT(normalizer->NormalizeTerm("なにぬねの"), Eq("ナニヌネノ")); + EXPECT_THAT(normalizer->NormalizeTerm("はひふへほ"), Eq("ハヒフヘホ")); + EXPECT_THAT(normalizer->NormalizeTerm("まみむめも"), Eq("マミムメモ")); + EXPECT_THAT(normalizer->NormalizeTerm("やゆよ"), Eq("ヤユヨ")); + EXPECT_THAT(normalizer->NormalizeTerm("らりるれろ"), Eq("ラリルレロ")); + EXPECT_THAT(normalizer->NormalizeTerm("わゐゑを"), Eq("ワヰヱヲ")); + EXPECT_THAT(normalizer->NormalizeTerm("ん"), Eq("ン")); + EXPECT_THAT(normalizer->NormalizeTerm("がぎぐげご"), Eq("ガギグゲゴ")); + EXPECT_THAT(normalizer->NormalizeTerm("ざじずぜぞ"), Eq("ザジズゼゾ")); + EXPECT_THAT(normalizer->NormalizeTerm("だぢづでど"), Eq("ダヂヅデド")); + EXPECT_THAT(normalizer->NormalizeTerm("ばびぶべぼ"), Eq("バビブベボ")); + EXPECT_THAT(normalizer->NormalizeTerm("ぱぴぷぺぽ"), Eq("パピプペポ")); +} + +TEST(MapNormalizerTest, Truncate) { + { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/5)); + + // Won't be truncated + EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi")); + EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello")); + + // Truncated to length 5. + EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello")); + + // Each Japanese character has 3 bytes, so truncating to length 5 results in + // only 1 character. + EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キ")); + + // Each Greek character has 2 bytes, so truncating to length 5 results in 2 + // character. + EXPECT_THAT(normalizer->NormalizeTerm("αβγδε"), Eq("αβ")); + } + + { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/2)); + // The Japanese character has 3 bytes, truncating it results in an empty + // string. 
+ EXPECT_THAT(normalizer->NormalizeTerm("キ"), Eq("")); + } +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/transform/map/normalization-map.cc b/icing/transform/map/normalization-map.cc new file mode 100644 index 0000000..c318036 --- /dev/null +++ b/icing/transform/map/normalization-map.cc @@ -0,0 +1,712 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/transform/map/normalization-map.h" + +#include <cstdint> +#include "icing/legacy/core/icing-packed-pod.h" + +namespace icing { +namespace lib { + +namespace { +// A pair representing the mapping of the 'from' character to 'to' character. +struct NormalizationPair { + // All the mapped characters can be stored in 2 bytes. + char16_t from; + char16_t to; +} __attribute__((packed)); + +// The following mappings contain multiple categories: +// 1. Hiragana -> Katakana, listed in the order of Hiragana chart rows. +// All regular and small Hiragana characters are mapped to Katakana. Note +// that half-width Katakana characters are not handled here. +// 2. Common full-width characters -> ASCII characters. +// Full-width characters in the Unicode range of [0xff01, 0xff5e] are mapped +// to the corresponding ASCII forms. +// 3. Common ideographic punctuation marks -> ASCII characters. +// Ideographic characters are in the Unicode range of [0x3000, 0x303f]. 
Here +// we list two that are frequently used in CJK and can be converted to ASCII. +// 4. Common diacritic Latin characters -> ASCII characters. +// We list most diacritic Latin characters within the Unicode range of +// [0x00c0, 0x017e], some from [0x01a0, 0x021b], and most from [0x1e00, +// 0x1ef9]. +// +// All the characters can be stored in a single UTF16 code unit, so we use +// char16_t to store them. Size of the following array is about 2.5KiB. +constexpr NormalizationPair kNormalizationMappings[] = { + // Part 1: Hiragana -> Katakana + // 'a' row + {0x3042, 0x30a2}, // Hiragana letter A -> Katakana letter A + {0x3044, 0x30a4}, // Hiragana letter I -> Katakana letter I + {0x3046, 0x30a6}, // Hiragana letter U -> Katakana letter U + {0x3048, 0x30a8}, // Hiragana letter E -> Katakana letter E + {0x304a, 0x30aa}, // Hiragana letter O -> Katakana letter O + {0x3041, 0x30a2}, // Hiragana letter small A -> Katakana letter A + {0x3043, 0x30a4}, // Hiragana letter small I -> Katakana letter I + {0x3045, 0x30a6}, // Hiragana letter small U -> Katakana letter U + {0x3047, 0x30a8}, // Hiragana letter small E -> Katakana letter E + {0x3049, 0x30aa}, // Hiragana letter small O -> Katakana letter O + // 'ka' row + {0x304b, 0x30ab}, // Hiragana letter KA -> Katakana letter KA + {0x304d, 0x30ad}, // Hiragana letter KI -> Katakana letter KI + {0x304f, 0x30af}, // Hiragana letter KU -> Katakana letter KU + {0x3051, 0x30b1}, // Hiragana letter KE -> Katakana letter KE + {0x3053, 0x30b3}, // Hiragana letter KO -> Katakana letter KO + {0x3095, 0x30ab}, // Hiragana letter small KA -> Katakana letter KA + {0x3096, 0x30b1}, // Hiragana letter small KE -> Katakana letter KE + // 'sa' row + {0x3055, 0x30b5}, // Hiragana letter SA -> Katakana letter SA + {0x3057, 0x30b7}, // Hiragana letter SI -> Katakana letter SI + {0x3059, 0x30b9}, // Hiragana letter SU -> Katakana letter SU + {0x305b, 0x30bb}, // Hiragana letter SE -> Katakana letter SE + {0x305d, 0x30bd}, // Hiragana letter SO 
-> Katakana letter SO + // 'ta' row + {0x305f, 0x30bf}, // Hiragana letter TA -> Katakana letter TA + {0x3061, 0x30c1}, // Hiragana letter TI -> Katakana letter TI + {0x3063, 0x30c4}, // Hiragana letter small TU -> Katakana letter TU + {0x3064, 0x30c4}, // Hiragana letter TU -> Katakana letter TU + {0x3066, 0x30c6}, // Hiragana letter TE -> Katakana letter TE + {0x3068, 0x30c8}, // Hiragana letter TO -> Katakana letter TO + // 'na' row + {0x306a, 0x30ca}, // Hiragana letter NA -> Katakana letter NA + {0x306b, 0x30cb}, // Hiragana letter NI -> Katakana letter NI + {0x306c, 0x30cc}, // Hiragana letter NU -> Katakana letter NU + {0x306d, 0x30cd}, // Hiragana letter NE -> Katakana letter NE + {0x306e, 0x30ce}, // Hiragana letter NO -> Katakana letter NO + // 'ha' row + {0x306f, 0x30cf}, // Hiragana letter HA -> Katakana letter HA + {0x3072, 0x30d2}, // Hiragana letter HI -> Katakana letter HI + {0x3075, 0x30d5}, // Hiragana letter HU -> Katakana letter HU + {0x3078, 0x30d8}, // Hiragana letter HE -> Katakana letter HE + {0x307b, 0x30db}, // Hiragana letter HO -> Katakana letter HO + // 'ma' row + {0x307e, 0x30de}, // Hiragana letter MA -> Katakana letter MA + {0x307f, 0x30df}, // Hiragana letter MI -> Katakana letter MI + {0x3080, 0x30e0}, // Hiragana letter MU -> Katakana letter MU + {0x3081, 0x30e1}, // Hiragana letter ME -> Katakana letter ME + {0x3082, 0x30e2}, // Hiragana letter MO -> Katakana letter MO + // 'ya' row + {0x3083, 0x30e4}, // Hiragana letter small YA -> Katakana letter YA + {0x3084, 0x30e4}, // Hiragana letter YA -> Katakana letter YA + {0x3085, 0x30e6}, // Hiragana letter small YU -> Katakana letter YU + {0x3086, 0x30e6}, // Hiragana letter YU -> Katakana letter YU + {0x3087, 0x30e8}, // Hiragana letter small YO -> Katakana letter YO + {0x3088, 0x30e8}, // Hiragana letter YO -> Katakana letter YO + // 'ra' row + {0x3089, 0x30e9}, // Hiragana letter RA -> Katakana letter RA + {0x308a, 0x30ea}, // Hiragana letter RI -> Katakana letter RI + {0x308b, 
0x30eb}, // Hiragana letter RU -> Katakana letter RU + {0x308c, 0x30ec}, // Hiragana letter RE -> Katakana letter RE + {0x308d, 0x30ed}, // Hiragana letter RO -> Katakana letter RO + // 'wa' row + {0x308e, 0x30ef}, // Hiragana letter small WA -> Katakana letter WA + {0x308f, 0x30ef}, // Hiragana letter WA -> Katakana letter WA + {0x3090, 0x30f0}, // Hiragana letter WI -> Katakana letter WI + {0x3091, 0x30f1}, // Hiragana letter WE -> Katakana letter WE + {0x3092, 0x30f2}, // Hiragana letter WO -> Katakana letter WO + // 'n' + {0x3093, 0x30f3}, // Hiragana letter N -> Katakana letter N + // 'ga' row + {0x304c, 0x30ac}, // Hiragana letter GA -> Katakana letter GA + {0x304e, 0x30ae}, // Hiragana letter GI -> Katakana letter GI + {0x3050, 0x30b0}, // Hiragana letter GU -> Katakana letter GU + {0x3052, 0x30b2}, // Hiragana letter GE -> Katakana letter GE + {0x3054, 0x30b4}, // Hiragana letter GO -> Katakana letter GO + // 'za' row + {0x3056, 0x30b6}, // Hiragana letter ZA -> Katakana letter ZA + {0x3058, 0x30b8}, // Hiragana letter ZI -> Katakana letter ZI + {0x305a, 0x30ba}, // Hiragana letter ZU -> Katakana letter ZU + {0x305c, 0x30bc}, // Hiragana letter ZE -> Katakana letter ZE + {0x305e, 0x30be}, // Hiragana letter ZO -> Katakana letter ZO + // 'da' row + {0x3060, 0x30c0}, // Hiragana letter DA -> Katakana letter DA + {0x3062, 0x30c2}, // Hiragana letter DI -> Katakana letter DI + {0x3065, 0x30c5}, // Hiragana letter DU -> Katakana letter DU + {0x3067, 0x30c7}, // Hiragana letter DE -> Katakana letter DE + {0x3069, 0x30c9}, // Hiragana letter DO -> Katakana letter DO + // 'ba' row + {0x3070, 0x30d0}, // Hiragana letter BA -> Katakana letter BA + {0x3073, 0x30d3}, // Hiragana letter BI -> Katakana letter BI + {0x3076, 0x30d6}, // Hiragana letter BU -> Katakana letter BU + {0x3079, 0x30d9}, // Hiragana letter BE -> Katakana letter BE + {0x307c, 0x30dc}, // Hiragana letter BO -> Katakana letter BO + // 'pa' row + {0x3071, 0x30d1}, // Hiragana letter PA -> Katakana 
letter PA + {0x3074, 0x30d4}, // Hiragana letter PI -> Katakana letter PI + {0x3077, 0x30d7}, // Hiragana letter PU -> Katakana letter PU + {0x307a, 0x30da}, // Hiragana letter PE -> Katakana letter PE + {0x307d, 0x30dd}, // Hiragana letter PO -> Katakana letter PO + // Additional Hiragana + {0x3094, 0x30f4}, // Hiragana letter VU -> Katakana letter VU + // Part 2: Common full-width characters -> ASCII characters. + {0xff01, 33}, // ASCII ! + {0xff02, 34}, // ASCII " + {0xff03, 35}, // ASCII # + {0xff04, 36}, // ASCII $ + {0xff05, 37}, // ASCII % + {0xff06, 38}, // ASCII & + {0xff07, 39}, // ASCII ' + {0xff08, 40}, // ASCII ( + {0xff09, 41}, // ASCII ) + {0xff0a, 42}, // ASCII * + {0xff0b, 43}, // ASCII + + {0xff0c, 44}, // ASCII , + {0xff0d, 45}, // ASCII - + {0xff0e, 46}, // ASCII . + {0xff0f, 47}, // ASCII / + {0xff10, 48}, // ASCII 0 + {0xff11, 49}, // ASCII 1 + {0xff12, 50}, // ASCII 2 + {0xff13, 51}, // ASCII 3 + {0xff14, 52}, // ASCII 4 + {0xff15, 53}, // ASCII 5 + {0xff16, 54}, // ASCII 6 + {0xff17, 55}, // ASCII 7 + {0xff18, 56}, // ASCII 8 + {0xff19, 57}, // ASCII 9 + {0xff1a, 58}, // ASCII : + {0xff1b, 59}, // ASCII ; + {0xff1c, 60}, // ASCII < + {0xff1d, 61}, // ASCII = + {0xff1e, 62}, // ASCII > + {0xff1f, 63}, // ASCII ? 
+ {0xff20, 64}, // ASCII @ + {0xff21, 65}, // ASCII A + {0xff22, 66}, // ASCII B + {0xff23, 67}, // ASCII C + {0xff24, 68}, // ASCII D + {0xff25, 69}, // ASCII E + {0xff26, 70}, // ASCII F + {0xff27, 71}, // ASCII G + {0xff28, 72}, // ASCII H + {0xff29, 73}, // ASCII I + {0xff2a, 74}, // ASCII J + {0xff2b, 75}, // ASCII K + {0xff2c, 76}, // ASCII L + {0xff2d, 77}, // ASCII M + {0xff2e, 78}, // ASCII N + {0xff2f, 79}, // ASCII O + {0xff30, 80}, // ASCII P + {0xff31, 81}, // ASCII Q + {0xff32, 82}, // ASCII R + {0xff33, 83}, // ASCII S + {0xff34, 84}, // ASCII T + {0xff35, 85}, // ASCII U + {0xff36, 86}, // ASCII V + {0xff37, 87}, // ASCII W + {0xff38, 88}, // ASCII X + {0xff39, 89}, // ASCII Y + {0xff3a, 90}, // ASCII Z + {0xff3b, 91}, // ASCII [ + {0xff3c, 92}, // ASCII forward slash + {0xff3d, 93}, // ASCII ] + {0xff3e, 94}, // ASCII ^ + {0xff3f, 95}, // ASCII _ + {0xff40, 96}, // ASCII ` + {0xff41, 97}, // ASCII a + {0xff42, 98}, // ASCII b + {0xff43, 99}, // ASCII c + {0xff44, 100}, // ASCII d + {0xff45, 101}, // ASCII e + {0xff46, 102}, // ASCII f + {0xff47, 103}, // ASCII g + {0xff48, 104}, // ASCII h + {0xff49, 105}, // ASCII i + {0xff4a, 106}, // ASCII j + {0xff4b, 107}, // ASCII k + {0xff4c, 108}, // ASCII l + {0xff4d, 109}, // ASCII m + {0xff4e, 110}, // ASCII n + {0xff4f, 111}, // ASCII o + {0xff50, 112}, // ASCII p + {0xff51, 113}, // ASCII q + {0xff52, 114}, // ASCII r + {0xff53, 115}, // ASCII s + {0xff54, 116}, // ASCII t + {0xff55, 117}, // ASCII u + {0xff56, 118}, // ASCII v + {0xff57, 119}, // ASCII w + {0xff58, 120}, // ASCII x + {0xff59, 121}, // ASCII y + {0xff5a, 122}, // ASCII z + {0xff5b, 123}, // ASCII { + {0xff5c, 124}, // ASCII | + {0xff5d, 125}, // ASCII } + {0xff5e, 126}, // ASCII ~ + {0x2018, 39}, // Left single quote -> ASCII apostrophe + {0x2019, 39}, // Right single quote -> ASCII apostrophe + {0x201c, 34}, // Left double quote -> ASCII quote + {0x201d, 34}, // Right double quote -> ASCII quote + // Part 3: Common ideographic 
punctuation marks -> ASCII. + // Usually used in CJK. + {0x3001, 44}, // ASCII , + {0x3002, 46}, // ASCII . + // Part 4: Common diacritic Latin characters -> ASCII characters. + {0x00c0, 65}, // À -> A + {0x00c1, 65}, // Á -> A + {0x00c2, 65}, // Â -> A + {0x00c3, 65}, // Ã -> A + {0x00c4, 65}, // Ä -> A + {0x00c5, 65}, // Å -> A + {0x00c7, 67}, // Ç -> C + {0x00c8, 69}, // È -> E + {0x00c9, 69}, // É -> E + {0x00ca, 69}, // Ê -> E + {0x00cb, 69}, // Ë -> E + {0x00cc, 73}, // Ì -> I + {0x00cd, 73}, // Í -> I + {0x00ce, 73}, // Î -> I + {0x00cf, 73}, // Ï -> I + {0x00d0, 68}, // Ð -> D + {0x00d1, 78}, // Ñ -> N + {0x00d2, 79}, // Ò -> O + {0x00d3, 79}, // Ó -> O + {0x00d4, 79}, // Ô -> O + {0x00d5, 79}, // Õ -> O + {0x00d6, 79}, // Ö -> O + {0x00d8, 79}, // Ø -> O + {0x00d9, 85}, // Ù -> U + {0x00da, 85}, // Ú -> U + {0x00db, 85}, // Û -> U + {0x00dc, 85}, // Ü -> U + {0x00dd, 89}, // Ý -> Y + {0x00e0, 97}, // à -> a + {0x00e1, 97}, // á -> a + {0x00e2, 97}, // â -> a + {0x00e3, 97}, // ã -> a + {0x00e4, 97}, // ä -> a + {0x00e5, 97}, // å -> a + {0x00e7, 99}, // ç -> c + {0x00e8, 101}, // è -> e + {0x00e9, 101}, // é -> e + {0x00ea, 101}, // ê -> e + {0x00eb, 101}, // ë -> e + {0x00ec, 105}, // ì -> i + {0x00ed, 105}, // í -> i + {0x00ee, 105}, // î -> i + {0x00ef, 105}, // ï -> i + {0x00f0, 100}, // ð -> d + {0x00f1, 110}, // ñ -> n + {0x00f2, 111}, // ò -> o + {0x00f3, 111}, // ó -> o + {0x00f4, 111}, // ô -> o + {0x00f5, 111}, // õ -> o + {0x00f6, 111}, // ö -> o + {0x00f8, 111}, // ø -> o + {0x00f9, 117}, // ù -> u + {0x00fa, 117}, // ú -> u + {0x00fb, 117}, // û -> u + {0x00fc, 117}, // ü -> u + {0x00fd, 121}, // ý -> y + {0x00ff, 121}, // ÿ -> y + {0x0100, 65}, // Ā -> A + {0x0101, 97}, // ā -> a + {0x0102, 65}, // Ă -> A + {0x0103, 97}, // ă -> a + {0x0104, 65}, // Ą -> A + {0x0105, 97}, // ą -> a + {0x0106, 67}, // Ć -> C + {0x0107, 99}, // ć -> c + {0x0108, 67}, // Ĉ -> C + {0x0109, 99}, // ĉ -> c + {0x010a, 67}, // Ċ -> C + {0x010b, 99}, // ċ -> c + 
{0x010c, 67}, // Č -> C + {0x010d, 99}, // č -> c + {0x010e, 68}, // Ď -> D + {0x010f, 100}, // ď -> d + {0x0110, 68}, // Đ -> D + {0x0111, 100}, // đ -> d + {0x0112, 69}, // Ē -> E + {0x0113, 101}, // ē -> e + {0x0114, 69}, // Ĕ -> E + {0x0115, 101}, // ĕ -> e + {0x0116, 69}, // Ė -> E + {0x0117, 101}, // ė -> e + {0x0118, 69}, // Ę -> E + {0x0119, 101}, // ę -> e + {0x011a, 69}, // Ě -> E + {0x011b, 101}, // ě -> e + {0x011c, 71}, // Ĝ -> G + {0x011d, 103}, // ĝ -> g + {0x011e, 71}, // Ğ -> G + {0x011f, 103}, // ğ -> g + {0x0120, 71}, // Ġ -> G + {0x0121, 103}, // ġ -> g + {0x0122, 71}, // Ģ -> G + {0x0123, 103}, // ģ -> g + {0x0124, 72}, // Ĥ -> H + {0x0125, 104}, // ĥ -> h + {0x0126, 72}, // Ħ -> H + {0x0127, 104}, // ħ -> h + {0x0128, 73}, // Ĩ -> I + {0x0129, 105}, // ĩ -> i + {0x012a, 73}, // Ī -> I + {0x012b, 105}, // ī -> i + {0x012c, 73}, // Ĭ -> I + {0x012d, 105}, // ĭ -> i + {0x012e, 73}, // Į -> I + {0x012f, 105}, // į -> i + {0x0130, 73}, // İ -> I + {0x0131, 105}, // ı -> i + {0x0134, 74}, // Ĵ -> J + {0x0135, 106}, // ĵ -> j + {0x0136, 75}, // Ķ -> K + {0x0137, 107}, // ķ -> k + {0x0139, 76}, // Ĺ -> L + {0x013a, 108}, // ĺ -> l + {0x013b, 76}, // Ļ -> L + {0x013c, 108}, // ļ -> l + {0x013d, 76}, // Ľ -> L + {0x013e, 108}, // ľ -> l + {0x013f, 76}, // Ŀ -> L + {0x0140, 108}, // ŀ -> l + {0x0141, 76}, // Ł -> L + {0x0142, 108}, // ł -> l + {0x0143, 78}, // Ń -> N + {0x0144, 110}, // ń -> n + {0x0145, 78}, // Ņ -> N + {0x0146, 110}, // ņ -> n + {0x0147, 78}, // Ň -> N + {0x0148, 110}, // ň -> n + {0x014a, 78}, // Ŋ -> N + {0x014b, 110}, // ŋ -> n + {0x014c, 79}, // Ō -> O + {0x014d, 111}, // ō -> o + {0x014e, 79}, // Ŏ -> O + {0x014f, 111}, // ŏ -> o + {0x0150, 79}, // Ő -> O + {0x0151, 111}, // ő -> o + {0x0154, 82}, // Ŕ -> R + {0x0155, 114}, // ŕ -> r + {0x0156, 82}, // Ŗ -> R + {0x0157, 114}, // ŗ -> r + {0x0158, 82}, // Ř -> R + {0x0159, 114}, // ř -> r + {0x015a, 83}, // Ś -> S + {0x015b, 115}, // ś -> s + {0x015c, 83}, // Ŝ -> S + {0x015d, 
115}, // ŝ -> s + {0x015e, 83}, // Ş -> S + {0x015f, 115}, // ş -> s + {0x0160, 83}, // Š -> S + {0x0161, 115}, // š -> s + {0x0162, 84}, // Ţ -> T + {0x0163, 116}, // ţ -> t + {0x0164, 84}, // Ť -> T + {0x0165, 116}, // ť -> t + {0x0166, 84}, // Ŧ -> T + {0x0167, 116}, // ŧ -> t + {0x0168, 85}, // Ũ -> U + {0x0169, 117}, // ũ -> u + {0x016a, 85}, // Ū -> U + {0x016b, 117}, // ū -> u + {0x016c, 85}, // Ŭ -> U + {0x016d, 117}, // ŭ -> u + {0x016e, 85}, // Ů -> U + {0x016f, 117}, // ů -> u + {0x0170, 85}, // Ű -> U + {0x0171, 117}, // ű -> u + {0x0172, 85}, // Ų -> U + {0x0173, 117}, // ų -> u + {0x0174, 87}, // Ŵ -> W + {0x0175, 119}, // ŵ -> w + {0x0176, 89}, // Ŷ -> Y + {0x0177, 121}, // ŷ -> y + {0x0178, 89}, // Ÿ -> Y + {0x0179, 90}, // Ź -> Z + {0x017a, 122}, // ź -> z + {0x017b, 90}, // Ż -> Z + {0x017c, 122}, // ż -> z + {0x017d, 90}, // Ž -> Z + {0x017e, 122}, // ž -> z + {0x01a0, 79}, // Ơ -> O + {0x01a1, 111}, // ơ -> o + {0x01af, 85}, // Ư -> U + {0x01b0, 117}, // ư -> u + {0x01b5, 90}, // Ƶ -> Z + {0x01b6, 122}, // ƶ -> z + {0x0218, 83}, // Ș -> S + {0x0219, 115}, // ș -> s + {0x021a, 84}, // Ț -> T + {0x021b, 116}, // ț -> t + {0x1e00, 65}, // Ḁ -> A + {0x1e01, 97}, // ḁ -> a + {0x1e02, 66}, // Ḃ -> B + {0x1e03, 98}, // ḃ -> b + {0x1e04, 66}, // Ḅ -> B + {0x1e05, 98}, // ḅ -> b + {0x1e06, 66}, // Ḇ -> B + {0x1e07, 98}, // ḇ -> b + {0x1e08, 67}, // Ḉ -> C + {0x1e09, 99}, // ḉ -> c + {0x1e0a, 68}, // Ḋ -> D + {0x1e0b, 100}, // ḋ -> d + {0x1e0c, 68}, // Ḍ -> D + {0x1e0d, 100}, // ḍ -> d + {0x1e0e, 68}, // Ḏ -> D + {0x1e0f, 100}, // ḏ -> d + {0x1e10, 68}, // Ḑ -> D + {0x1e11, 100}, // ḑ -> d + {0x1e12, 68}, // Ḓ -> D + {0x1e13, 100}, // ḓ -> d + {0x1e14, 69}, // Ḕ -> E + {0x1e15, 101}, // ḕ -> e + {0x1e16, 69}, // Ḗ -> E + {0x1e17, 101}, // ḗ -> e + {0x1e18, 69}, // Ḙ -> E + {0x1e19, 101}, // ḙ -> e + {0x1e1a, 69}, // Ḛ -> E + {0x1e1b, 101}, // ḛ -> e + {0x1e1c, 69}, // Ḝ -> E + {0x1e1d, 101}, // ḝ -> e + {0x1e1e, 70}, // Ḟ -> F + {0x1e1f, 102}, // ḟ -> f + 
{0x1e20, 71}, // Ḡ -> G + {0x1e21, 103}, // ḡ -> g + {0x1e22, 72}, // Ḣ -> H + {0x1e23, 104}, // ḣ -> h + {0x1e24, 72}, // Ḥ -> H + {0x1e25, 104}, // ḥ -> h + {0x1e26, 72}, // Ḧ -> H + {0x1e27, 104}, // ḧ -> h + {0x1e28, 72}, // Ḩ -> H + {0x1e29, 104}, // ḩ -> h + {0x1e2a, 72}, // Ḫ -> H + {0x1e2b, 104}, // ḫ -> h + {0x1e2c, 73}, // Ḭ -> I + {0x1e2d, 105}, // ḭ -> i + {0x1e2e, 73}, // Ḯ -> I + {0x1e2f, 105}, // ḯ -> i + {0x1e30, 75}, // Ḱ -> K + {0x1e31, 107}, // ḱ -> k + {0x1e32, 75}, // Ḳ -> K + {0x1e33, 107}, // ḳ -> k + {0x1e34, 75}, // Ḵ -> K + {0x1e35, 107}, // ḵ -> k + {0x1e36, 76}, // Ḷ -> L + {0x1e37, 108}, // ḷ -> l + {0x1e38, 76}, // Ḹ -> L + {0x1e39, 108}, // ḹ -> l + {0x1e3b, 108}, // ḻ -> l + {0x1e3c, 76}, // Ḽ -> L + {0x1e3d, 108}, // ḽ -> l + {0x1e3e, 77}, // Ḿ -> M + {0x1e3f, 109}, // ḿ -> m + {0x1e40, 77}, // Ṁ -> M + {0x1e41, 109}, // ṁ -> m + {0x1e42, 77}, // Ṃ -> M + {0x1e43, 109}, // ṃ -> m + {0x1e44, 78}, // Ṅ -> N + {0x1e45, 110}, // ṅ -> n + {0x1e46, 78}, // Ṇ -> N + {0x1e47, 110}, // ṇ -> n + {0x1e48, 78}, // Ṉ -> N + {0x1e49, 110}, // ṉ -> n + {0x1e4a, 78}, // Ṋ -> N + {0x1e4b, 110}, // ṋ -> n + {0x1e4c, 79}, // Ṍ -> O + {0x1e4d, 111}, // ṍ -> o + {0x1e4e, 79}, // Ṏ -> O + {0x1e4f, 111}, // ṏ -> o + {0x1e50, 79}, // Ṑ -> O + {0x1e51, 111}, // ṑ -> o + {0x1e52, 79}, // Ṓ -> O + {0x1e53, 111}, // ṓ -> o + {0x1e54, 80}, // Ṕ -> P + {0x1e55, 112}, // ṕ -> p + {0x1e56, 80}, // Ṗ -> P + {0x1e57, 112}, // ṗ -> p + {0x1e58, 82}, // Ṙ -> R + {0x1e59, 114}, // ṙ -> r + {0x1e5a, 82}, // Ṛ -> R + {0x1e5b, 114}, // ṛ -> r + {0x1e5c, 82}, // Ṝ -> R + {0x1e5d, 114}, // ṝ -> r + {0x1e5e, 82}, // Ṟ -> R + {0x1e5f, 114}, // ṟ -> r + {0x1e60, 83}, // Ṡ -> S + {0x1e61, 115}, // ṡ -> s + {0x1e62, 83}, // Ṣ -> S + {0x1e63, 115}, // ṣ -> s + {0x1e64, 83}, // Ṥ -> S + {0x1e65, 115}, // ṥ -> s + {0x1e66, 83}, // Ṧ -> S + {0x1e67, 115}, // ṧ -> s + {0x1e68, 83}, // Ṩ -> S + {0x1e69, 115}, // ṩ -> s + {0x1e6a, 84}, // Ṫ -> T + {0x1e6b, 116}, // ṫ -> t + {0x1e6c, 
84}, // Ṭ -> T + {0x1e6d, 116}, // ṭ -> t + {0x1e6e, 84}, // Ṯ -> T + {0x1e6f, 116}, // ṯ -> t + {0x1e70, 84}, // Ṱ -> T + {0x1e71, 116}, // ṱ -> t + {0x1e72, 85}, // Ṳ -> U + {0x1e73, 117}, // ṳ -> u + {0x1e74, 85}, // Ṵ -> U + {0x1e75, 117}, // ṵ -> u + {0x1e76, 85}, // Ṷ -> U + {0x1e77, 117}, // ṷ -> u + {0x1e78, 85}, // Ṹ -> U + {0x1e79, 117}, // ṹ -> u + {0x1e7a, 85}, // Ṻ -> U + {0x1e7b, 117}, // ṻ -> u + {0x1e7c, 86}, // Ṽ -> V + {0x1e7d, 118}, // ṽ -> v + {0x1e7e, 86}, // Ṿ -> V + {0x1e7f, 118}, // ṿ -> v + {0x1e80, 87}, // Ẁ -> W + {0x1e81, 119}, // ẁ -> w + {0x1e82, 87}, // Ẃ -> W + {0x1e83, 119}, // ẃ -> w + {0x1e84, 87}, // Ẅ -> W + {0x1e85, 119}, // ẅ -> w + {0x1e86, 87}, // Ẇ -> W + {0x1e87, 119}, // ẇ -> w + {0x1e88, 87}, // Ẉ -> W + {0x1e89, 119}, // ẉ -> w + {0x1e8a, 88}, // Ẋ -> X + {0x1e8b, 120}, // ẋ -> x + {0x1e8c, 88}, // Ẍ -> X + {0x1e8d, 120}, // ẍ -> x + {0x1e8e, 89}, // Ẏ -> Y + {0x1e8f, 121}, // ẏ -> y + {0x1e90, 90}, // Ẑ -> Z + {0x1e91, 122}, // ẑ -> z + {0x1e92, 90}, // Ẓ -> Z + {0x1e93, 122}, // ẓ -> z + {0x1e94, 90}, // Ẕ -> Z + {0x1e95, 122}, // ẕ -> z + {0x1e96, 104}, // ẖ -> h + {0x1e97, 116}, // ẗ -> t + {0x1e98, 119}, // ẘ -> w + {0x1e99, 121}, // ẙ -> y + {0x1e9a, 97}, // ẚ -> a + {0x1e9b, 102}, // ẛ -> f + {0x1ea0, 65}, // Ạ -> A + {0x1ea1, 97}, // ạ -> a + {0x1ea2, 65}, // Ả -> A + {0x1ea3, 97}, // ả -> a + {0x1ea4, 65}, // Ấ -> A + {0x1ea5, 97}, // ấ -> a + {0x1ea6, 65}, // Ầ -> A + {0x1ea7, 97}, // ầ -> a + {0x1ea8, 65}, // Ẩ -> A + {0x1ea9, 97}, // ẩ -> a + {0x1eaa, 65}, // Ẫ -> A + {0x1eab, 97}, // ẫ -> a + {0x1eac, 65}, // Ậ -> A + {0x1ead, 97}, // ậ -> a + {0x1eae, 65}, // Ắ -> A + {0x1eaf, 97}, // ắ -> a + {0x1eb0, 65}, // Ằ -> A + {0x1eb1, 97}, // ằ -> a + {0x1eb2, 65}, // Ẳ -> A + {0x1eb3, 97}, // ẳ -> a + {0x1eb4, 65}, // Ẵ -> A + {0x1eb5, 97}, // ẵ -> a + {0x1eb6, 65}, // Ặ -> A + {0x1eb7, 97}, // ặ -> a + {0x1eb8, 69}, // Ẹ -> E + {0x1eb9, 101}, // ẹ -> e + {0x1eba, 69}, // Ẻ -> E + {0x1ebb, 101}, // ẻ -> e + 
{0x1ebc, 69}, // Ẽ -> E + {0x1ebd, 101}, // ẽ -> e + {0x1ebe, 69}, // Ế -> E + {0x1ebf, 101}, // ế -> e + {0x1ec0, 69}, // Ề -> E + {0x1ec1, 101}, // ề -> e + {0x1ec2, 69}, // Ể -> E + {0x1ec3, 101}, // ể -> e + {0x1ec4, 69}, // Ễ -> E + {0x1ec5, 101}, // ễ -> e + {0x1ec6, 69}, // Ệ -> E + {0x1ec7, 101}, // ệ -> e + {0x1ec8, 73}, // Ỉ -> I + {0x1ec9, 105}, // ỉ -> i + {0x1eca, 73}, // Ị -> I + {0x1ecb, 105}, // ị -> i + {0x1ecc, 79}, // Ọ -> O + {0x1ecd, 111}, // ọ -> o + {0x1ece, 79}, // Ỏ -> O + {0x1ecf, 111}, // ỏ -> o + {0x1ed0, 79}, // Ố -> O + {0x1ed1, 111}, // ố -> o + {0x1ed2, 79}, // Ồ -> O + {0x1ed3, 111}, // ồ -> o + {0x1ed4, 79}, // Ổ -> O + {0x1ed5, 111}, // ổ -> o + {0x1ed6, 79}, // Ỗ -> O + {0x1ed7, 111}, // ỗ -> o + {0x1ed8, 79}, // Ộ -> O + {0x1ed9, 111}, // ộ -> o + {0x1eda, 79}, // Ớ -> O + {0x1edb, 111}, // ớ -> o + {0x1edc, 79}, // Ờ -> O + {0x1edd, 111}, // ờ -> o + {0x1ede, 79}, // Ở -> O + {0x1edf, 111}, // ở -> o + {0x1ee0, 79}, // Ỡ -> O + {0x1ee1, 111}, // ỡ -> o + {0x1ee2, 79}, // Ợ -> O + {0x1ee3, 111}, // ợ -> o + {0x1ee4, 85}, // Ụ -> U + {0x1ee5, 117}, // ụ -> u + {0x1ee6, 85}, // Ủ -> U + {0x1ee7, 117}, // ủ -> u + {0x1ee8, 85}, // Ứ -> U + {0x1ee9, 117}, // ứ -> u + {0x1eea, 85}, // Ừ -> U + {0x1eeb, 117}, // ừ -> u + {0x1eec, 85}, // Ử -> U + {0x1eed, 117}, // ử -> u + {0x1eee, 85}, // Ữ -> U + {0x1eef, 117}, // ữ -> u + {0x1ef0, 85}, // Ự -> U + {0x1ef1, 117}, // ự -> u + {0x1ef2, 89}, // Ỳ -> Y + {0x1ef3, 121}, // ỳ -> y + {0x1ef4, 89}, // Ỵ -> Y + {0x1ef5, 121}, // ỵ -> y + {0x1ef6, 89}, // Ỷ -> Y + {0x1ef7, 121}, // ỷ -> y + {0x1ef8, 89}, // Ỹ -> Y + {0x1ef9, 121}, // ỹ -> y +}; + +} // namespace + +const std::unordered_map<char16_t, char16_t>& GetNormalizationMap() { + // The map is allocated dynamically the first time this function is executed. + static const std::unordered_map<char16_t, char16_t> normalization_map = [] { + std::unordered_map<char16_t, char16_t> map; + // Size of all the mappings is about 2.5 KiB. 
+ constexpr int numMappings = + sizeof(kNormalizationMappings) / sizeof(NormalizationPair); + map.reserve(numMappings); + for (size_t i = 0; i < numMappings; ++i) { + map.emplace(kNormalizationMappings[i].from, kNormalizationMappings[i].to); + } + return map; + }(); + + return normalization_map; +} + +} // namespace lib +} // namespace icing diff --git a/icing/util/icu-i18n-utils_test.cc b/icing/transform/map/normalization-map.h index f5864df..aea85bd 100644 --- a/icing/util/icu-i18n-utils_test.cc +++ b/icing/transform/map/normalization-map.h @@ -12,31 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/util/icu-i18n-utils.h" +#ifndef ICING_TRANSFORM_MAP_NORMALIZATION_MAP_H_ +#define ICING_TRANSFORM_MAP_NORMALIZATION_MAP_H_ -#include <memory> - -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "unicode/uchar.h" +#include <unordered_map> namespace icing { namespace lib { -namespace icu_i18n_utils { -namespace { - -TEST(IcuI18nUtilsTest, IsPunctuationAtSameAsIcuIsPunct) { - // Iterate through ASCII values - for (int i = 0; i <= 127; ++i) { - char ascii = i; - std::string ascii_string = ""; - ascii_string.push_back(ascii); +// Returns a map containing normalization mappings. A mapping (A -> B) means +// that we'll transform every character 'A' into 'B'. See normalization-map.cc +// for mapping details. 
+const std::unordered_map<char16_t, char16_t>& GetNormalizationMap(); - EXPECT_EQ(IsPunctuationAt(ascii_string, /*position=*/0), u_ispunct(ascii)); - } -} -} // namespace -} // namespace icu_i18n_utils } // namespace lib } // namespace icing + +#endif // ICING_TRANSFORM_MAP_NORMALIZATION_MAP_H_ diff --git a/icing/transform/normalizer-factory.h b/icing/transform/normalizer-factory.h index 9119897..f1f3f62 100644 --- a/icing/transform/normalizer-factory.h +++ b/icing/transform/normalizer-factory.h @@ -16,12 +16,9 @@ #define ICING_TRANSFORM_NORMALIZER_FACTORY_H_ #include <memory> -#include <string_view> #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" -#include "icing/transform/icu-normalizer.h" -#include "icing/transform/none-normalizer.h" #include "icing/transform/normalizer.h" namespace icing { @@ -29,11 +26,6 @@ namespace lib { namespace normalizer_factory { -enum NormalizerType { - ICU4C, // Normalizes using the ICU library. - NONE, // Doesn't perform normalization. Not for use in production. -}; - // Creates a normalizer. max_term_byte_size enforces the max size of text after // normalization, text will be truncated if exceeds the max size. 
// @@ -42,19 +34,7 @@ enum NormalizerType { // INVALID_ARGUMENT if max_term_byte_size <= 0 // INTERNAL_ERROR on errors libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( - NormalizerType type, int max_term_byte_size) { - if (max_term_byte_size <= 0) { - return absl_ports::InvalidArgumentError( - "max_term_byte_size must be greater than zero."); - } - - switch (type) { - case ICU4C: - return IcuNormalizer::Create(max_term_byte_size); - case NONE: - return std::make_unique<NoneNormalizer>(max_term_byte_size); - } -} + int max_term_byte_size); } // namespace normalizer_factory diff --git a/icing/transform/normalizer.h b/icing/transform/normalizer.h index 817f530..4cbfa63 100644 --- a/icing/transform/normalizer.h +++ b/icing/transform/normalizer.h @@ -28,8 +28,7 @@ namespace lib { // // Example use: // ICING_ASSIGN_OR_RETURN(auto normalizer, -// normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, -// /*max_term_byte_size=*/5); +// normalizer_factory::Create(/*max_term_byte_size=*/5); // // std::string normalized_text = normalizer->NormalizeText("HELLO!"); // ICING_LOG(INFO) << normalized_text; // prints "hello" diff --git a/icing/transform/simple/none-normalizer-factory.cc b/icing/transform/simple/none-normalizer-factory.cc new file mode 100644 index 0000000..6b35270 --- /dev/null +++ b/icing/transform/simple/none-normalizer-factory.cc @@ -0,0 +1,53 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_ +#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_ + +#include <memory> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/transform/normalizer.h" +#include "icing/transform/simple/none-normalizer.h" + +namespace icing { +namespace lib { + +namespace normalizer_factory { + +// Creates a dummy normalizer. The term is not normalized, but +// the text will be truncated to max_term_byte_size if it exceeds the max size. +// +// Returns: +// A normalizer on success +// INVALID_ARGUMENT if max_term_byte_size <= 0 +// INTERNAL_ERROR on errors +libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( + int max_term_byte_size) { + if (max_term_byte_size <= 0) { + return absl_ports::InvalidArgumentError( + "max_term_byte_size must be greater than zero."); + } + + return std::make_unique<NoneNormalizer>(max_term_byte_size); +} + +} // namespace normalizer_factory + +} // namespace lib +} // namespace icing + +#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_ diff --git a/icing/transform/none-normalizer.h b/icing/transform/simple/none-normalizer.h index b734bef..47085e1 100644 --- a/icing/transform/none-normalizer.h +++ b/icing/transform/simple/none-normalizer.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_TRANSFORM_NONE_NORMALIZER_H_ -#define ICING_TRANSFORM_NONE_NORMALIZER_H_ +#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_ +#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_ #include <string> #include <string_view> @@ -30,7 +30,7 @@ namespace lib { // max_term_byte_size. 
class NoneNormalizer : public Normalizer { public: - NoneNormalizer(int max_term_byte_size) + explicit NoneNormalizer(int max_term_byte_size) : max_term_byte_size_(max_term_byte_size){}; std::string NormalizeTerm(std::string_view term) const override { @@ -48,4 +48,4 @@ class NoneNormalizer : public Normalizer { } // namespace lib } // namespace icing -#endif // ICING_TRANSFORM_NONE_NORMALIZER_H_ +#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_ diff --git a/icing/transform/none-normalizer_test.cc b/icing/transform/simple/none-normalizer_test.cc index e322258..e074828 100644 --- a/icing/transform/none-normalizer_test.cc +++ b/icing/transform/simple/none-normalizer_test.cc @@ -27,25 +27,20 @@ namespace { using ::testing::Eq; TEST(NoneNormalizerTest, Creation) { - EXPECT_THAT( - normalizer_factory::Create(normalizer_factory::NormalizerType::NONE, - /*max_term_byte_size=*/5), - IsOk()); - EXPECT_THAT( - normalizer_factory::Create(normalizer_factory::NormalizerType::NONE, - /*max_term_byte_size=*/0), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT( - normalizer_factory::Create(normalizer_factory::NormalizerType::NONE, - /*max_term_byte_size=*/-1), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/5), + IsOk()); + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/0), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(IcuNormalizerTest, NoNormalizationDone) { - ICING_ASSERT_OK_AND_ASSIGN( - auto normalizer, - normalizer_factory::Create(normalizer_factory::NormalizerType::NONE, - /*max_term_byte_size=*/1000)); + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); EXPECT_THAT(normalizer->NormalizeTerm(""), Eq("")); 
EXPECT_THAT(normalizer->NormalizeTerm("hello world"), Eq("hello world")); @@ -63,10 +58,8 @@ TEST(IcuNormalizerTest, NoNormalizationDone) { } TEST(NoneNormalizerTest, Truncate) { - ICING_ASSERT_OK_AND_ASSIGN( - auto normalizer, - normalizer_factory::Create(normalizer_factory::NormalizerType::NONE, - /*max_term_byte_size=*/5)); + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/5)); // Won't be truncated EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi")); diff --git a/icing/util/document-validator.cc b/icing/util/document-validator.cc index 5b588e7..36b84f8 100644 --- a/icing/util/document-validator.cc +++ b/icing/util/document-validator.cc @@ -72,11 +72,9 @@ libtextclassifier3::Status DocumentValidator::Validate( const SchemaTypeConfigProto* type_config = std::move(type_config_or).ValueOrDie(); - int32_t num_required_properties_expected = 0; int32_t num_required_properties_actual = 0; - PropertyConfigMap property_config_map; - SchemaUtil::BuildPropertyConfigMap(*type_config, &property_config_map, - &num_required_properties_expected); + SchemaUtil::ParsedPropertyConfigs parsed_property_configs = + SchemaUtil::ParsePropertyConfigs(*type_config); std::unordered_set<std::string_view> unique_properties; for (const PropertyProto& property : document.properties()) { @@ -93,8 +91,9 @@ libtextclassifier3::Status DocumentValidator::Validate( document.namespace_(), ", ", document.uri(), ").")); } - const auto& property_iter = property_config_map.find(property.name()); - if (property_iter == property_config_map.end()) { + const auto& property_iter = + parsed_property_configs.property_config_map.find(property.name()); + if (property_iter == parsed_property_configs.property_config_map.end()) { return absl_ports::NotFoundError(absl_ports::StrCat( "Property config '", property.name(), "' not found for key: (", document.namespace_(), ", ", document.uri(), ").")); @@ -165,7 +164,8 @@ libtextclassifier3::Status 
DocumentValidator::Validate( } } } - if (num_required_properties_actual < num_required_properties_expected) { + if (num_required_properties_actual < + parsed_property_configs.num_required_properties) { return absl_ports::InvalidArgumentError( absl_ports::StrCat("One or more required fields missing for key: (", document.namespace_(), ", ", document.uri(), ").")); diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc index 2345339..9cf992f 100644 --- a/icing/util/i18n-utils.cc +++ b/icing/util/i18n-utils.cc @@ -17,6 +17,17 @@ #include <cctype> #include <string_view> +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/util/logging.h" +#include "unicode/uchar.h" +#include "unicode/umachine.h" +#include "unicode/ustring.h" +#include "unicode/utf16.h" +#include "unicode/utf8.h" +#include "unicode/utypes.h" + namespace icing { namespace lib { namespace i18n_utils { @@ -31,12 +42,84 @@ const std::string ascii_icu_punctuation = "!\"#%&'*,./:;?@\\_-([{}])"; } // namespace +libtextclassifier3::StatusOr<std::string> Utf16ToUtf8( + const std::u16string& utf16_string) { + std::string utf8_string; + // Allocates the maximum possible UTF8 string length: + // 3 UTF-8 bytes per UTF16 code unit, plus one for the terminating NUL. + // + // NOTE: we need to call resize() but not reserve() because values can't be + // set at positions after length(). 
+ utf8_string.resize(utf16_string.length() * 3 + 1); + + int result_length = 0; + UErrorCode status = U_ZERO_ERROR; + u_strToUTF8(&utf8_string[0], utf8_string.length(), &result_length, + utf16_string.data(), utf16_string.length(), &status); + // Corrects the length + utf8_string.resize(result_length); + + if (U_FAILURE(status)) { + return absl_ports::InternalError("Failed to convert UTF16 string to UTF8"); + } + return utf8_string; +} + +libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16( + std::string_view utf8_string) { + std::u16string utf16_result; + // The UTF16 string won't be longer than its UTF8 format + // + // NOTE: we need to call resize() but not reserve() because values can't be + // set at positions after length(). + utf16_result.resize(utf8_string.length()); + + int result_length = 0; + UErrorCode status = U_ZERO_ERROR; + u_strFromUTF8(&utf16_result[0], utf16_result.length(), &result_length, + utf8_string.data(), utf8_string.length(), &status); + // Corrects the length + utf16_result.resize(result_length); + + if (U_FAILURE(status)) { + return absl_ports::InternalError(absl_ports::StrCat( + "Failed to convert UTF8 string '", utf8_string, "' to UTF16")); + } + return utf16_result; +} + UChar32 GetUChar32At(const char* data, int length, int position) { - // We don't handle Unicode, i.e. anything more than 1 byte. 
- return data[position]; + UChar32 uchar32; + U8_NEXT_OR_FFFD(data, position, length, uchar32); + return uchar32; } -bool IsAscii(char c) { return (c & 0x80) == 0; } +void SafeTruncateUtf8(std::string* str, int truncate_to_length) { + if (str == nullptr || truncate_to_length >= str->length()) { + return; + } + + while (truncate_to_length > 0) { + if (IsLeadUtf8Byte(str->at(truncate_to_length))) { + str->resize(truncate_to_length); + return; + } + truncate_to_length--; + } + + // Truncates to an empty string + str->resize(0); +} + +bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); } + +bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; } + +int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); } + +int GetUtf16Length(UChar32 c) { return U16_LENGTH(c); } + +bool IsLeadUtf8Byte(char c) { return IsAscii(c) || U8_IS_LEAD((uint8_t)c); } bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) { if (IsAscii(input[position])) { @@ -45,18 +128,43 @@ bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) { } return ascii_icu_punctuation.find(input[position]) != std::string::npos; } - - // If it's not ASCII, we can't process Unicode so we don't know. - return false; + UChar32 c = GetUChar32At(input.data(), input.length(), position); + if (char_len_out != nullptr) { + *char_len_out = U8_LENGTH(c); + } + return u_ispunct(c); } bool IsWhitespaceAt(std::string_view input, int position) { if (IsAscii(input[position])) { return std::isspace(input[position]); } + UChar32 c = GetUChar32At(input.data(), input.length(), position); + return u_isUWhiteSpace(c); +} + +bool IsAlphabeticAt(std::string_view input, int position) { + if (IsAscii(input[position])) { + return std::isalpha(input[position]); + } + UChar32 c = GetUChar32At(input.data(), input.length(), position); + return u_isUAlphabetic(c); +} - // If it's not ASCII, we can't process Unicode so we don't know. 
- return false; +void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar) { + uint8_t utf8_buffer[4]; // U8_APPEND writes 0 to 4 bytes + + int utf8_index = 0; + UBool has_error = FALSE; + + // utf8_index is advanced to the end of the contents if successful + U8_APPEND(utf8_buffer, utf8_index, sizeof(utf8_buffer), uchar, has_error); + + if (has_error) { + ICING_LOG(WARNING) << "Error appending UChar32 to the UTF8 string."; + return; + } + utf8_string->append(reinterpret_cast<char*>(utf8_buffer), utf8_index); } } // namespace i18n_utils diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h index 141b9af..e103bab 100644 --- a/icing/util/i18n-utils.h +++ b/icing/util/i18n-utils.h @@ -18,43 +18,60 @@ #include <string> #include <string_view> +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "unicode/umachine.h" + namespace icing { namespace lib { -// These are included for uses when we don't have access to ICU. -// -// Defined in ICU; -// https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/umachine_8h.html#a09fff5c3b5a5b015324dc3ec3cf92809 -using UChar32 = int32_t; - -// Defined in ICU: -// https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/utf8_8h.html#aa2298b48749d9f45772c8f5a6885464a -#define U8_MAX_LENGTH 4 - -// Defined in ICU: -// https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/uloc_8h.html#aa55404d3c725af4e05e65e5b40a6e13d -#define ULOC_US "en_US" - // Internationalization utils that use standard utilities or custom code. Does -// not require any special dependencies, i.e. for use when the library is NOT -// guaranteed to have access to ICU. -// -// Note: This does not handle Unicode. -// -// TODO(cassiewang): Figure out if we want to keep this file as a non-ICU -// solution long-term, or if we'll do something along the lines of reverse-jni, -// etc. +// not require any special dependencies, such as data files for ICU. namespace i18n_utils { // An invalid value defined by Unicode. 
static constexpr UChar32 kInvalidUChar32 = 0xFFFD; +// Converts a UTF16 string to a UTF8 string. +// +// Returns: +// A UTF8 string on success +// INTERNAL_ERROR on any failures +libtextclassifier3::StatusOr<std::string> Utf16ToUtf8( + const std::u16string& utf16_string); + +// Converts a UTF8 string to a UTF16 string. +// +// Returns: +// A UTF16 string on success +// INTERNAL_ERROR on any failures +libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16( + std::string_view utf8_string); + // Returns the char at the given position. UChar32 GetUChar32At(const char* data, int length, int position); +// Safely truncates a UTF8 string so that multi-byte UTF8 characters are not cut +// in the middle. The string will be truncated in place. +void SafeTruncateUtf8(std::string* str, int truncate_to_length); + // Checks if the single char is within ASCII range. bool IsAscii(char c); +// Checks if the Unicode char is within ASCII range. +bool IsAscii(UChar32 c); + +// Returns how many code units (char) are used for the UTF-8 encoding of this +// Unicode character. Returns 0 if not valid. +int GetUtf8Length(UChar32 c); + +// Returns how many code units (char16_t) are used for the UTF-16 encoding of +// this Unicode character. Returns 0 if not valid. +int GetUtf16Length(UChar32 c); + +// Checks if the single char is the first byte of a UTF8 character, note +// that a single ASCII char is also considered a lead byte. +bool IsLeadUtf8Byte(char c); + // Checks if the character at position is punctuation. Assigns the length of the // character at position to *char_len_out if the character at position is valid // punctuation and char_len_out is not null. @@ -64,6 +81,11 @@ bool IsPunctuationAt(std::string_view input, int position, // Checks if the character at position is a whitespace. bool IsWhitespaceAt(std::string_view input, int position); +// Checks if the character at position is a whitespace. 
+bool IsAlphabeticAt(std::string_view input, int position); + +void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar); + } // namespace i18n_utils } // namespace lib } // namespace icing diff --git a/icing/util/i18n-utils_test.cc b/icing/util/i18n-utils_test.cc new file mode 100644 index 0000000..a1e8d4e --- /dev/null +++ b/icing/util/i18n-utils_test.cc @@ -0,0 +1,141 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/util/i18n-utils.h" + +#include <memory> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "unicode/uchar.h" + +namespace icing { +namespace lib { +namespace { + +using ::testing::Eq; + +TEST(IcuI18nUtilsTest, IsPunctuationAtSameAsIcuIsPunct) { + // Iterate through ASCII values + for (int i = 0; i <= 127; ++i) { + char ascii = i; + + std::string ascii_string = ""; + ascii_string.push_back(ascii); + + EXPECT_EQ(i18n_utils::IsPunctuationAt(ascii_string, /*position=*/0), + + u_ispunct(ascii)); + } +} + +TEST(IcuI18nUtilsTest, IsAlphabeticAt) { + // Test alphabetic and non-alphabetic ascii characters + constexpr std::string_view kSomeAscii = "iJ?9"; + EXPECT_TRUE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/0)); // 'i' + EXPECT_TRUE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/1)); // 'J' + EXPECT_FALSE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/2)); // '?' 
+ EXPECT_FALSE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/3)); // '9' + + constexpr std::string_view kSomeNonAscii = "👏ñ①カ"; + EXPECT_FALSE( + i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/0)); // '👏' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 0)), + 4); + EXPECT_TRUE( + i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/4)); // 'ñ' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 4)), + 2); + EXPECT_FALSE( + i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/6)); // '①' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 6)), + 3); + EXPECT_TRUE( + i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/9)); // 'カ' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 9)), + 3); +} + +TEST(IcuI18nUtilsTest, GetUtf8Length) { + // Test alphabetic and non-alphabetic ascii characters + constexpr std::string_view kSomeAscii = "iJ?9"; + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeAscii.data(), kSomeAscii.length(), 0)), + 1); // 'i' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeAscii.data(), kSomeAscii.length(), 1)), + 1); // 'J' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeAscii.data(), kSomeAscii.length(), 2)), + 1); // '?' 
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeAscii.data(), kSomeAscii.length(), 3)), + 1); // '9' + + constexpr std::string_view kSomeNonAscii = "👏ñ①カ"; + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 0)), + 4); // '👏' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 4)), + 2); // 'ñ' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 6)), + 3); // '①' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 9)), + 3); // 'カ' +} + +TEST(IcuI18nUtilsTest, SafeTruncate) { + // Test alphabetic and non-alphabetic ascii characters + constexpr std::string_view kSomeAscii = "iJ?9"; + std::string truncated(kSomeAscii); + i18n_utils::SafeTruncateUtf8(&truncated, kSomeAscii.length() + 1); + EXPECT_THAT(truncated, Eq("iJ?9")); + truncated = kSomeAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeAscii.length()); + EXPECT_THAT(truncated, Eq("iJ?9")); + truncated = kSomeAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeAscii.length() - 1); + EXPECT_THAT(truncated, Eq("iJ?")); + + constexpr std::string_view kSomeNonAscii = "👏ñ①カ"; + truncated = kSomeNonAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() + 1); + EXPECT_THAT(truncated, Eq("👏ñ①カ")); + truncated = kSomeNonAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length()); + EXPECT_THAT(truncated, Eq("👏ñ①カ")); + truncated = kSomeNonAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 1); + EXPECT_THAT(truncated, Eq("👏ñ①")); + truncated = kSomeNonAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 2); + EXPECT_THAT(truncated, Eq("👏ñ①")); + truncated = kSomeNonAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 3); + EXPECT_THAT(truncated, Eq("👏ñ①")); 
+ truncated = kSomeNonAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 4); + EXPECT_THAT(truncated, Eq("👏ñ")); +} + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/util/icu-i18n-utils.cc b/icing/util/icu-i18n-utils.cc deleted file mode 100644 index 89e4eec..0000000 --- a/icing/util/icu-i18n-utils.cc +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "icing/util/icu-i18n-utils.h" - -#include <cctype> -#include <string> -#include <string_view> - -#include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "icing/absl_ports/canonical_errors.h" -#include "icing/absl_ports/str_cat.h" -#include "unicode/umachine.h" -#include "unicode/unorm2.h" -#include "unicode/ustring.h" -#include "unicode/utf8.h" - -namespace icing { -namespace lib { -namespace icu_i18n_utils { - -namespace { - -// All ASCII punctuation that's also in a Unicode Punctuation category -// (https://www.fileformat.info/info/unicode/category/index.htm). The set of -// characters that are regarded as punctuation is not the same for std::ispunct -// and u_ispunct. 
-const std::string ascii_icu_punctuation = "!\"#%&'*,./:;?@\\_-([{}])"; - -} // namespace - -libtextclassifier3::StatusOr<std::string> Utf16ToUtf8( - const std::u16string& utf16_string) { - std::string utf8_string; - // Allocates the maximum possible UTF8 string length: - // 3 UTF-8 bytes per UTF16 code unit, plus one for the terminating NUL. - // - // NOTE: we need to call resize() but not reserve() because values can't be - // set at positions after length(). - utf8_string.resize(utf16_string.length() * 3 + 1); - - int result_length = 0; - UErrorCode status = U_ZERO_ERROR; - u_strToUTF8(&utf8_string[0], utf8_string.length(), &result_length, - utf16_string.data(), utf16_string.length(), &status); - // Corrects the length - utf8_string.resize(result_length); - - if (U_FAILURE(status)) { - return absl_ports::InternalError("Failed to convert UTF16 string to UTF8"); - } - return utf8_string; -} - -libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16( - std::string_view utf8_string) { - std::u16string utf16_result; - // The UTF16 string won't be longer than its UTF8 format - // - // NOTE: we need to call resize() but not reserve() because values can't be - // set at positions after length(). 
- utf16_result.resize(utf8_string.length()); - - int result_length = 0; - UErrorCode status = U_ZERO_ERROR; - u_strFromUTF8(&utf16_result[0], utf16_result.length(), &result_length, - utf8_string.data(), utf8_string.length(), &status); - // Corrects the length - utf16_result.resize(result_length); - - if (U_FAILURE(status)) { - return absl_ports::InternalError(absl_ports::StrCat( - "Failed to convert UTF8 string '", utf8_string, "' to UTF16")); - } - return utf16_result; -} - -UChar32 GetUChar32At(const char* data, int length, int position) { - UChar32 uchar32; - U8_NEXT_OR_FFFD(data, position, length, uchar32); - return uchar32; -} - -void SafeTruncateUtf8(std::string* str, int truncate_to_length) { - if (str == nullptr || truncate_to_length >= str->length()) { - return; - } - - while (truncate_to_length > 0) { - if (IsLeadUtf8Byte(str->at(truncate_to_length))) { - str->resize(truncate_to_length); - return; - } - truncate_to_length--; - } - - // Truncates to an empty string - str->resize(0); -} - -bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); } - -bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; } - -int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); } - -bool IsLeadUtf8Byte(char c) { return IsAscii(c) || U8_IS_LEAD((uint8_t)c); } - -bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) { - if (IsAscii(input[position])) { - if (char_len_out != nullptr) { - *char_len_out = 1; - } - return ascii_icu_punctuation.find(input[position]) != std::string::npos; - } - UChar32 c = GetUChar32At(input.data(), input.length(), position); - if (char_len_out != nullptr) { - *char_len_out = U8_LENGTH(c); - } - return u_ispunct(c); -} - -bool IsWhitespaceAt(std::string_view input, int position) { - if (IsAscii(input[position])) { - return std::isspace(input[position]); - } - UChar32 c = GetUChar32At(input.data(), input.length(), position); - return u_isUWhiteSpace(c); -} - -bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 
uchar32_in, - char* char_out) { - if (IsAscii(uchar32_in)) { - // The Unicode character is within ASCII range - if (char_out != nullptr) { - *char_out = uchar32_in; - } - return true; - } - - // Maximum number of pieces a Unicode character can be decomposed into. - // TODO(samzheng) figure out if this number is proper. - constexpr int kDecompositionBufferCapacity = 5; - - // A buffer used to store Unicode decomposition mappings of only one - // character. - UChar decomposition_buffer[kDecompositionBufferCapacity]; - - // Decomposes the Unicode character, trying to get an ASCII char and some - // diacritic chars. - UErrorCode status = U_ZERO_ERROR; - if (unorm2_getDecomposition(normalizer2, uchar32_in, &decomposition_buffer[0], - kDecompositionBufferCapacity, &status) > 0 && - !U_FAILURE(status) && icu_i18n_utils::IsAscii(decomposition_buffer[0])) { - if (char_out != nullptr) { - *char_out = decomposition_buffer[0]; - } - return true; - } - return false; -} - -} // namespace icu_i18n_utils -} // namespace lib -} // namespace icing diff --git a/icing/util/icu-i18n-utils.h b/icing/util/icu-i18n-utils.h deleted file mode 100644 index 4d29cf0..0000000 --- a/icing/util/icu-i18n-utils.h +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef ICING_UTIL_ICU_I18N_UTILS_H_ -#define ICING_UTIL_ICU_I18N_UTILS_H_ - -#include <string> -#include <string_view> - -#include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "unicode/umachine.h" -#include "unicode/unorm2.h" - -// Rely on this to transitively have access to U8_MAX_LENGTH, so all users can -// depend on either icu-i18n-utils or i18n-utils. -#include "unicode/utf8.h" - -// Rely on this to transitively have access to ULOC_US, so all users can depend -// on either icu-i18n-utils or i18n-utils. -#include "unicode/uloc.h" - -namespace icing { -namespace lib { - -// Internationalization utils that use ICU methods under the hood. For use when -// the library is guaranteed to have access to ICU. -namespace icu_i18n_utils { - -// An invalid value defined by Unicode. -static constexpr UChar32 kInvalidUChar32 = 0xFFFD; - -// Converts a UTF16 string to a UTF8 string. -// -// Returns: -// A UTF8 string on success -// INTERNAL_ERROR on any failures -libtextclassifier3::StatusOr<std::string> Utf16ToUtf8( - const std::u16string& utf16_string); - -// Converts a UTF8 string to a UTF16 string. -// -// Returns: -// A UTF16 string on success -// INTERNAL_ERROR on any failures -libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16( - std::string_view utf8_string); - -// Returns the Unicode char at the given position. If anything wrong happens, an -// invalid value 0xFFFD is returned. -UChar32 GetUChar32At(const char* data, int length, int position); - -// Safely truncates a UTF8 string so that multi-byte UTF8 characters are not cut -// in the middle. The string will be truncated in place. -void SafeTruncateUtf8(std::string* str, int truncate_to_length); - -// Checks if the single char is within ASCII range. -bool IsAscii(char c); - -// Checks if the Unicode char is within ASCII range. -bool IsAscii(UChar32 c); - -// Returns how many code units (bytes) are used for the UTF-8 encoding of this -// Unicode character. Returns 0 if not valid. 
-int GetUtf8Length(UChar32 c); - -// Checks if the single char is the first byte of a UTF8 character, note -// that a single ASCII char is also considered a lead byte. -bool IsLeadUtf8Byte(char c); - -// Checks if the character at position is punctuation. Assigns the length of the -// character at position to *char_len_out if the character at position is valid -// punctuation and char_len_out is not null. -bool IsPunctuationAt(std::string_view input, int position, - int* char_len_out = nullptr); - -// Checks if the character at position is a whitespace. -bool IsWhitespaceAt(std::string_view input, int position); - -// Transforms a Unicode character with diacritics to its counterpart in ASCII -// range. E.g. "ü" -> "u". Result will be set to char_out. Returns true if -// the transformation is successful. -// -// NOTE: According to our convention this function should have returned -// StatusOr<char>. However, this function is performance-sensitive because is -// could be called on every Latin character in normalization, so we make it -// return a bool here to save a bit more time and memory. -bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in, - char* char_out); - -} // namespace icu_i18n_utils -} // namespace lib -} // namespace icing - -#endif // ICING_UTIL_ICU_I18N_UTILS_H_ |